=== This is the CP2K Performance-Test ===


Already up to date.
Current branch master is up to date.


Already up to date.
Current branch master is up to date.

 GIT Revision: 7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3


################# ARCHITECTURE FILE ##################
#!/bin/bash
#
# CP2K arch file for Cray-XC50 (Piz Daint, CSCS, GPU partition)
#
# Tested with: GNU 9.3.0, Cray-MPICH 7.7.18, Cray-libsci 20.09.1, Cray-FFTW 3.3.8.10,
#              COSMA 2.6.2, ELPA 2022.11.001.rc2, LIBINT 2.6.0, LIBPEXSI 1.2.0,
#              LIBXC 6.0.0, LIBVORI 220621, LIBXSMM 1.17, PLUMED 2.8.0,
#              SIRIUS 7.3.2, SPGLIB 1.16.2
#
# Usage: Source this arch file and then run make as instructed.
#        A full toolchain installation is performed by default.
#        Replace or adapt the "module add" commands below if needed.
#
# Author: Matthias Krack (13.12.2022)
#
# Bash section of this bash/make polyglot file. When the file is sourced,
# bash executes the backslash-continued lines below, whereas make treats them
# as one single continued comment and ignores them.
# Steps: refuse direct execution, load the Cray modules (optional first
# argument: GCC version, default 9.3.0), save the module collection, run the
# toolchain installer, source the generated setup file and print build hints.
# The script aborts early if tools/toolchain cannot be entered, so that the
# caller's working directory is never changed by accident.
# \
   if [ "${0}" = "${BASH_SOURCE}" ]; then \
      echo "ERROR: Script ${0##*/} must be sourced"; \
      echo "Usage: source ${0##*/}"; \
      exit 1; \
   fi; \
   this_file=${BASH_SOURCE##*/}; \
   if [ -n "${1}" ]; then \
      gcc_version="${1}"; \
   else \
      gcc_version="9.3.0"; \
   fi; \
   module add daint-gpu; \
   module rm PrgEnv-cray; \
   module add PrgEnv-gnu; \
   module rm gcc; \
   module add gcc/${gcc_version}; \
   module add cray-fftw/3.3.8.10; \
   module add cudatoolkit; \
   echo "Expected setup:"; \
   echo "   cray-mpich/7.7.18"; \
   echo "   craype-haswell"; \
   echo "   daint-gpu/21.09"; \
   echo "   craype/2.7.10"; \
   echo "   cray-libsci/20.09.1"; \
   echo "   PrgEnv-gnu/6.0.10"; \
   echo "   gcc/${gcc_version}"; \
   echo "   cray-fftw/3.3.8.10"; \
   echo "   cudatoolkit/11.0.2_3.38-8.1__g5b73779"; \
   module list; \
   module -f save cp2k_gpu_gnu_psmp; \
   echo "To load the required modules in your batch job script, use:"; \
   echo "   module restore cp2k_gpu_gnu_psmp"; \
   if ! cd tools/toolchain; then \
      echo "ERROR: Cannot enter tools/toolchain, source ${this_file} from the CP2K root directory"; \
      return 1; \
   fi; \
   ./install_cp2k_toolchain.sh --enable-cuda=yes --gpu-ver=P100 --no-arch-files --with-gcc=system --with-libvdwxc --with-pexsi --with-plumed; \
   cd ../..; \
   printf "Sourcing %s ... " "${PWD}/tools/toolchain/install/setup"; \
   source "${PWD}/tools/toolchain/install/setup"; \
   printf "done\n"; \
   echo "Check the output above for error messages and consistency!"; \
   echo "If everything is OK, you can build a CP2K production binary with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.}"; \
   echo "Alternatively, you can add further checks, e.g. for regression testing, with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.} DO_CHECKS=yes"; \
   echo "or build CP2K as a library with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.} libcp2k"; \
   return

# Set options
# DO_CHECKS = yes adds -D__CHECK_DIAG; USE_ACC = yes enables the CUDA-related
# defines and selects the CUDA builds of SIRIUS and COSMA further below.
# Each USE_<PACKAGE> value is the version string used to locate that package
# below INSTALL_PATH; an empty (or commented-out) value disables the package.
DO_CHECKS      := no
USE_ACC        := yes
USE_COSMA      := 2.6.2
USE_ELPA       := 2022.11.001.rc2
USE_LIBINT     := 2.6.0
USE_LIBPEXSI   := 1.2.0
USE_LIBVORI    := 220621
USE_LIBXC      := 6.0.0
USE_LIBXSMM    := 1.17
USE_PLUMED     := 2.8.0
#USE_QUIP       := 0.9.10
USE_SIRIUS     := 7.3.2
USE_SPGLIB     := 1.16.2
# Only needed for SIRIUS
LIBVDWXC_VER   := 0.4.0
SPFFT_VER      := 1.0.6
SPLA_VER       := 1.5.4
HDF5_VER       := 1.12.0
# Only needed for LIBPEXSI
SCOTCH_VER     := 6.0.0
SUPERLU_VER    := 6.1.0

# LMAX is part of the LIBINT installation directory name;
# MAX_CONTR is passed to the preprocessor as -D__MAX_CONTR
LMAX           := 5
MAX_CONTR      := 4

# GPU settings; not referenced again in this file
# NOTE(review): presumably consumed by the Makefile that includes this arch file - confirm
GPUVER         := P100
OFFLOAD_TARGET := cuda

# Compiler, linker and archiver commands (cc, CC and ftn are compiler wrappers)
CC             := cc
CXX            := CC
OFFLOAD_CC     := nvcc
FC             := ftn
LD             := ftn
AR             := ar -r

# No -march flag here: the cc, CC and ftn wrappers already add the proper one
CFLAGS         := -O2 -fopenmp -fopenmp-simd -ftree-vectorize -funroll-loops -g

# Preprocessor definitions shared by every build
DFLAGS         := -D__parallel \
                  -D__SCALAPACK \
                  -D__FFTW3 \
                  -D__MAX_CONTR=$(strip $(MAX_CONTR))

# Location of the toolchain installation
INSTALL_PATH   := $(PWD)/tools/toolchain/install

# Additional diagnostics on request
ifeq "$(DO_CHECKS)" "yes"
   DFLAGS         += -D__CHECK_DIAG
endif

# GPU acceleration
# (__NO_OFFLOAD_PW: possibly no performance gain with PW_CUDA currently)
ifeq "$(USE_ACC)" "yes"
   DFLAGS         += -D__DBCSR_ACC -D__OFFLOAD_CUDA -D__NO_OFFLOAD_PW
endif

# PLUMED (static library); enabling it also switches on GSL 2.7
ifneq "$(USE_PLUMED)" ""
   USE_PLUMED     := $(strip $(USE_PLUMED))
   USE_GSL        := 2.7
   PLUMED_LIB     := $(INSTALL_PATH)/plumed-$(USE_PLUMED)/lib
   LIBS           += $(PLUMED_LIB)/libplumed.a
   DFLAGS         += -D__PLUMED2
endif

# ELPA eigensolver. The ELPA installation is organised in one sub-directory
# per TARGET. The GPU build (nvidia) is selected only when USE_ACC is yes, in
# line with the SIRIUS and COSMA sections of this file; otherwise the cpu
# build is used and -D__ELPA_NVIDIA_GPU is not defined.
# NOTE(review): assumes the toolchain also installs a "cpu" ELPA build next to
# the "nvidia" one - confirm against the toolchain's ELPA install script.
ifneq ($(USE_ELPA),)
   USE_ELPA       := $(strip $(USE_ELPA))
   ifeq ($(USE_ACC), yes)
      TARGET         := nvidia
   else
      TARGET         := cpu
   endif
   ELPA_INC       := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/include/elpa-$(USE_ELPA)
   ELPA_LIB       := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/lib
   CFLAGS         += -I$(ELPA_INC)/elpa -I$(ELPA_INC)/modules
   DFLAGS         += -D__ELPA
   ifeq ($(TARGET), nvidia)
      DFLAGS         += -D__ELPA_NVIDIA_GPU
   endif
   LIBS           += $(ELPA_LIB)/libelpa.a
endif

# QUIP and the FoX libraries linked with it (only when USE_QUIP is set)
ifneq "$(USE_QUIP)" ""
   USE_QUIP       := $(strip $(USE_QUIP))
   QUIP_LIB       := $(INSTALL_PATH)/quip-$(USE_QUIP)/lib
   QUIP_INC       := $(INSTALL_PATH)/quip-$(USE_QUIP)/include
   DFLAGS         += -D__QUIP
   CFLAGS         += -I$(QUIP_INC)
   LIBS           += $(QUIP_LIB)/libquip_core.a \
                     $(QUIP_LIB)/libatoms.a \
                     $(QUIP_LIB)/libFoX_sax.a \
                     $(QUIP_LIB)/libFoX_common.a \
                     $(QUIP_LIB)/libFoX_utils.a \
                     $(QUIP_LIB)/libFoX_fsys.a
endif

# PEXSI plus the SuperLU_DIST and Scotch libraries it is linked with
ifneq "$(USE_LIBPEXSI)" ""
   # Normalise the version strings first
   USE_LIBPEXSI   := $(strip $(USE_LIBPEXSI))
   SCOTCH_VER     := $(strip $(SCOTCH_VER))
   SUPERLU_VER    := $(strip $(SUPERLU_VER))
   # Header directories
   LIBPEXSI_INC   := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/include
   SCOTCH_INC     := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/include
   SUPERLU_INC    := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/include
   # Library directories
   LIBPEXSI_LIB   := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/lib
   SCOTCH_LIB     := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/lib
   SUPERLU_LIB    := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/lib
   DFLAGS         += -D__LIBPEXSI
   CFLAGS         += -I$(LIBPEXSI_INC) \
                     -I$(SCOTCH_INC) \
                     -I$(SUPERLU_INC)
   LIBS           += $(LIBPEXSI_LIB)/libpexsi.a \
                     $(SUPERLU_LIB)/libsuperlu_dist.a \
                     $(SCOTCH_LIB)/libptscotchparmetis.a \
                     $(SCOTCH_LIB)/libptscotch.a \
                     $(SCOTCH_LIB)/libptscotcherr.a \
                     $(SCOTCH_LIB)/libscotchmetis.a \
                     $(SCOTCH_LIB)/libscotch.a
endif

# libvori (library only, no include path is added)
ifneq "$(USE_LIBVORI)" ""
   USE_LIBVORI    := $(strip $(USE_LIBVORI))
   LIBVORI_LIB    := $(INSTALL_PATH)/libvori-$(USE_LIBVORI)/lib
   LIBS           += $(LIBVORI_LIB)/libvori.a
   DFLAGS         += -D__LIBVORI
endif

# LIBXC: Fortran 2003 interface library followed by the core library
ifneq "$(USE_LIBXC)" ""
   USE_LIBXC      := $(strip $(USE_LIBXC))
   LIBXC_LIB      := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/lib
   LIBXC_INC      := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/include
   DFLAGS         += -D__LIBXC
   CFLAGS         += -I$(LIBXC_INC)
   LIBS           += $(LIBXC_LIB)/libxcf03.a \
                     $(LIBXC_LIB)/libxc.a
endif

# LIBINT: the installation directory name encodes both version and LMAX
ifneq "$(USE_LIBINT)" ""
   LMAX           := $(strip $(LMAX))
   USE_LIBINT     := $(strip $(USE_LIBINT))
   LIBINT_LIB     := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/lib
   LIBINT_INC     := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/include
   DFLAGS         += -D__LIBINT
   CFLAGS         += -I$(LIBINT_INC)
   LIBS           += $(LIBINT_LIB)/libint2.a
endif

# SPGLIB (linked as libsymspg.a)
ifneq "$(USE_SPGLIB)" ""
   USE_SPGLIB     := $(strip $(USE_SPGLIB))
   SPGLIB_LIB     := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/lib
   SPGLIB_INC     := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/include
   DFLAGS         += -D__SPGLIB
   CFLAGS         += -I$(SPGLIB_INC)
   LIBS           += $(SPGLIB_LIB)/libsymspg.a
endif

# LIBXSMM: Fortran interface library followed by the core library
ifneq "$(USE_LIBXSMM)" ""
   USE_LIBXSMM    := $(strip $(USE_LIBXSMM))
   LIBXSMM_LIB    := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/lib
   LIBXSMM_INC    := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/include
   DFLAGS         += -D__LIBXSMM
   CFLAGS         += -I$(LIBXSMM_INC)
   LIBS           += $(LIBXSMM_LIB)/libxsmmf.a \
                     $(LIBXSMM_LIB)/libxsmm.a
endif

# SIRIUS together with SpLA, SpFFT, libvdwxc and HDF5
ifneq "$(USE_SIRIUS)" ""
   # Normalise all version strings before any path is derived from them
   USE_SIRIUS     := $(strip $(USE_SIRIUS))
   HDF5_VER       := $(strip $(HDF5_VER))
   LIBVDWXC_VER   := $(strip $(LIBVDWXC_VER))
   SPFFT_VER      := $(strip $(SPFFT_VER))
   SPLA_VER       := $(strip $(SPLA_VER))
   # Paths that do not depend on the acceleration setting
   HDF5_LIB       := $(INSTALL_PATH)/hdf5-$(HDF5_VER)/lib
   LIBVDWXC_INC   := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/include
   LIBVDWXC_LIB   := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/lib
   SPFFT_INC      := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/include
   SPLA_INC       := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/include/spla
   # With acceleration the cuda sub-directories are used and GEMM offload is enabled
   ifeq "$(USE_ACC)" "yes"
      DFLAGS         += -D__OFFLOAD_GEMM
      SIRIUS_INC     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include/cuda
      SIRIUS_LIB     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib/cuda
      SPFFT_LIB      := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib/cuda
      SPLA_LIB       := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib/cuda
   else
      SIRIUS_INC     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include
      SIRIUS_LIB     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib
      SPFFT_LIB      := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib
      SPLA_LIB       := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib
   endif
   CFLAGS         += -I$(LIBVDWXC_INC) \
                     -I$(SPFFT_INC) \
                     -I$(SPLA_INC) \
                     -I$(SIRIUS_INC)
   DFLAGS         += -D__HDF5 \
                     -D__LIBVDWXC \
                     -D__SPFFT \
                     -D__SPLA \
                     -D__SIRIUS
   LIBS           += $(SIRIUS_LIB)/libsirius.a \
                     $(SPLA_LIB)/libspla.a \
                     $(SPFFT_LIB)/libspfft.a \
                     $(LIBVDWXC_LIB)/libvdwxc.a \
                     $(HDF5_LIB)/libhdf5.a
endif

# COSMA; with acceleration the installation directory carries a -cuda suffix
ifneq "$(USE_COSMA)" ""
   USE_COSMA      := $(strip $(USE_COSMA))
   ifeq "$(USE_ACC)" "yes"
      USE_COSMA      := $(USE_COSMA)-cuda
   endif
   COSMA_LIB      := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/lib
   COSMA_INC      := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/include
   DFLAGS         += -D__COSMA
   CFLAGS         += -I$(COSMA_INC)
   LIBS           += $(COSMA_LIB)/libcosma_prefixed_pxgemm.a \
                     $(COSMA_LIB)/libcosma.a \
                     $(COSMA_LIB)/libcosta_prefixed_scalapack.a \
                     $(COSMA_LIB)/libcosta.a \
                     $(COSMA_LIB)/libTiled-MM.a
endif

# GSL (USE_GSL is set by the PLUMED section of this file)
ifneq "$(USE_GSL)" ""
   USE_GSL        := $(strip $(USE_GSL))
   GSL_LIB        := $(INSTALL_PATH)/gsl-$(USE_GSL)/lib
   GSL_INC        := $(INSTALL_PATH)/gsl-$(USE_GSL)/include
   DFLAGS         += -D__GSL
   CFLAGS         += -I$(GSL_INC)
   LIBS           += $(GSL_LIB)/libgsl.a
endif

# Final assembly of the compiler and linker flags.
# All preprocessor definitions collected above are appended to the C flags.
CFLAGS         += $(DFLAGS)

# The C++ flags are the C flags plus the language standard
CXXFLAGS       := $(CFLAGS) -std=c++11

# Flags for the offload compiler: only the preprocessor definitions are reused.
# NOTE(review): -arch sm_60 is hard-coded here; presumably it has to match GPUVER - confirm
OFFLOAD_FLAGS  := $(DFLAGS) -O3 -Xcompiler="-fopenmp" -arch sm_60 --std=c++11

# The Fortran flags start from the C flags
FCFLAGS        := $(CFLAGS)
# -fallow-argument-mismatch is added only when the major version printed by
# "gcc -dumpversion" is greater than 9
ifeq ($(shell [ $(shell gcc -dumpversion | cut -d. -f1) -gt 9 ] && echo yes), yes)
   FCFLAGS        += -fallow-argument-mismatch
endif
FCFLAGS        += -fbacktrace
FCFLAGS        += -ffree-form
FCFLAGS        += -ffree-line-length-none
FCFLAGS        += -fno-omit-frame-pointer
FCFLAGS        += -std=f2008

# The linker flags reuse the Fortran flags; if CUDA_HOME is set, its lib64
# directory is added to the library search path and to the rpath
ifneq ($(CUDA_HOME),)
   CUDA_LIB       := $(CUDA_HOME)/lib64
   LDFLAGS        := $(FCFLAGS) -L$(CUDA_LIB) -Wl,-rpath=$(CUDA_LIB)
else
   LDFLAGS        := $(FCFLAGS)
endif

# CUDA and system libraries; appended unconditionally, i.e. independent of USE_ACC
LIBS           += -lcusolver -lcudart -lnvrtc -lcuda -lcufft -lcublas -lrt
LIBS           += -lz -ldl -lpthread -lstdc++

# End
############### END ARCHITECTURE FILE ################


===== TESTS (description) =====
 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 RI-RPA/RI-MP2 correlation energy
 input file: benchmarks/QS_mp2_rpa/32-H2O/RI-RPA.inp
 required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-dRPA-TZ.inp']
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 15
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/01
 job id: 43511705
 --- Point ---
 name: 10
 plot: h2o_32_ri_rpa_mp2
 regex: Total RI-RPA Time= 
 label: RI-RPA (8n/2r/6t)
 --- Point ---
 name: 11
 plot: h2o_32_ri_rpa_mp2_mem
 regex: Estimated peak process memory 
 label: RI-RPA (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 RI-RPA/RI-MP2 correlation energy
 input file: benchmarks/QS_mp2_rpa/32-H2O/RI-MP2.inp
 required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-HF-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-MP2-TZ.inp']
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 15
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/02
 job id: 43511706
 --- Point ---
 name: 20
 plot: h2o_32_ri_rpa_mp2
 regex: Total MP2 Time= 
 label: RI-MP2 (8n/6r/2t)
 --- Point ---
 name: 21
 plot: h2o_32_ri_rpa_mp2_mem
 regex: Estimated peak process memory 
 label: RI-MP2 (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/03
 job id: 43511707
 --- Point ---
 name: 100
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 101
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/04
 job id: 43511708
 --- Point ---
 name: 102
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 103
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/05
 job id: 43511709
 --- Point ---
 name: 104
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 105
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/06
 job id: 43511710
 --- Point ---
 name: 106
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 107
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/07
 job id: 43511713
 --- Point ---
 name: 108
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 109
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/08
 job id: 43511715
 --- Point ---
 name: 110
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 111
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/09
 job id: 43511716
 --- Point ---
 name: 200
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 201
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/10
 job id: 43511718
 --- Point ---
 name: 202
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 203
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/11
 job id: 43511721
 --- Point ---
 name: 204
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 205
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/12
 job id: 43511723
 --- Point ---
 name: 206
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 207
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/13
 job id: 43511725
 --- Point ---
 name: 208
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 209
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/14
 job id: 43511728
 --- Point ---
 name: 210
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 211
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/15
 job id: 43511730
 --- Point ---
 name: 400
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 401
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/16
 job id: 43511731
 --- Point ---
 name: 402
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 403
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/17
 job id: 43511745
 --- Point ---
 name: 404
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 405
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/18
 job id: 43511746
 --- Point ---
 name: 406
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 407
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/19
 job id: 43511747
 --- Point ---
 name: 408
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 409
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/20
 job id: 43511748
 --- Point ---
 name: 410
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 411
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/21
 job id: 43511749
 --- Point ---
 name: 500
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 501
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/22
 job id: 43511750
 --- Point ---
 name: 502
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 503
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/23
 job id: 43511751
 --- Point ---
 name: 504
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 505
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/24
 job id: 43511752
 --- Point ---
 name: 506
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 507
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/25
 job id: 43511753
 --- Point ---
 name: 508
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 509
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/26
 job id: 43511754
 --- Point ---
 name: 510
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 511
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

=== END TESTS (description) ===


===== PLOTS (description) =====
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_ri_rpa_mp2", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_ri_rpa_mp2_mem", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_64_md", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_64_md_mem", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_128_md", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_128_md_mem", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_256_md", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_256_md_mem", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_nrep3_ls", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_nrep3_ls_mem", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
=== END PLOTS (description) ===


============ RESULTS ============
 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/01/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         0.000000E+00       0.0%      0.0%      0.0%
 flops max/rank                      0.000000E+00       0.0%      0.0%      0.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                                  0       0.0%      0.0%      0.0%
 number of processed stacks                     0       0.0%      0.0%      0.0%
 average stack size                                     0.0       0.0       0.0
 marketing flops                     0.000000E+00
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                1                     12.
 MP_Allreduce           19                     21.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               15                 177869.
 MP_Allreduce          344                      9.
 MP_Sync                 3
 MP_comm_split           1
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.088    0.089  135.538  135.538
 farming_run                          1  2.0  134.613  134.615  135.402  135.420
 -------------------------------------------------------------------------------


 @@@@@@@@@@ Run number: 2 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32              4194304       0.0%      0.0%    100.0%
 flops    14 x    32 x    32            154140672       0.0%      0.0%    100.0%
 flops    29 x    32 x    32            159645696       0.0%      0.0%    100.0%
 flops    14 x    14 x    32            208732160       0.0%      0.0%    100.0%
 flops    29 x    14 x    32            212860928       0.0%      0.0%    100.0%
 flops    14 x    29 x    32            212860928       0.0%      0.0%    100.0%
 flops    29 x    29 x    32            227352576       0.0%      0.0%    100.0%
 flops    14 x    32 x    14         896801644032       0.0%      0.0%    100.0%
 flops    29 x    32 x    14         928925089792       0.0%      0.0%    100.0%
 flops    14 x    32 x    29         928925089792       0.0%      0.0%    100.0%
 flops    29 x    32 x    29         962100985856       0.0%      0.0%    100.0%
 flops    32 x    32 x    14        1693169221632       0.0%      0.0%    100.0%
 flops    32 x    32 x    29        1753639550976       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         7.164741E+12       0.0%      0.0%    100.0%
 flops max/rank                    447.801317E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          249492158       0.0%      0.0%    100.0%
 number of processed stacks                164328       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1518.3
 marketing flops                     7.165779E+12
 -------------------------------------------------------------------------------
 # multiplications                           1160
 max memory usage/rank               1.463718E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                    2592
 MPI messages size (bytes):
  total size                         1.140326E+09
  min size                           0.000000E+00
  max size                           1.663488E+06
  average size                     439.940750E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                 132                        0
       128 < size <=     8192                 348                  2850816
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                1536                179306496
    131072 < size <=  4194304                 576                958169088
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         2308                     54.
 MP_Alltoall          4670                 822215.
 MP_ISend             2604                  90577.
 MP_IRecv             2604                  90574.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group               12
 MP_Bcast              228                1113141.
 MP_Allreduce          485                2282278.
 MP_Sync                27
 MP_Alltoall            38                9316958.
 MP_SendRecv           120                 384007.
 MP_ISendRecv           45                 235435.
 MP_Wait               191
 MP_comm_split           8
 MP_ISend              127                3867574.
 MP_IRecv              127                3866554.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.010    0.024  115.854  115.855
 qs_energies                          1  2.0    0.000    0.000  115.571  115.572
 mp2_main                             1  3.0    0.000    0.000  113.291  113.293
 mp2_gpw_main                         1  4.0    0.030    0.037  112.307  112.309
 mp2_ri_gpw_compute_in                1  5.0    0.188    0.206   93.386   93.571
 mp2_ri_gpw_compute_in_loop           1  6.0    0.004    0.005   55.350   55.535
 mp2_eri_3c_integrate_gpw           272  7.0    0.153    0.169   41.647   46.693
 get_2c_integrals                     1  6.0    0.001    0.014   37.585   37.848
 integrate_v_rspace                 273  8.0    0.435    0.455   25.043   29.716
 pw_transfer                       6555 10.6    0.377    0.395   27.430   28.095
 fft_wrap_pw1pw2                   5465 11.4    0.045    0.048   26.072   26.582
 grid_integrate_task_list           273  9.0   20.867   25.976   20.867   25.976
 fft_wrap_pw1pw2_100               2178 12.4    1.228    1.473   23.633   24.165
 compute_2c_integrals                 1  7.0    0.013    0.014   19.890   19.890
 compute_2c_integrals_loop_lm         1  8.0    0.024    0.024   19.029   19.525
 mp2_eri_2c_integrate_gpw             1  9.0    2.374    2.447   19.005   19.501
 rpa_ri_compute_en                    1  5.0    0.011    0.013   18.804   18.881
 cp_fm_cholesky_decompose            12  8.2   17.769   18.047   17.769   18.047
 cholesky_decomp                      1  7.0    0.000    0.000   16.537   16.810
 fft3d_s                           5443 13.4   16.163   16.446   16.185   16.468
 ao_to_mo_and_store_B_mult_1        272  7.0   10.860   15.583   10.860   15.583
 calculate_wavefunction             272  8.0    5.402    5.472   12.525   13.146
 rpa_num_int                          1  6.0    0.000    0.003   10.800   10.810
 rpa_num_int_RPA_matrix_operati       8  7.0    0.000    0.000   10.660   10.713
 calc_mat_Q                           8  8.0    0.000    0.000    9.389    9.510
 contract_S_to_Q                      8  9.0    0.000    0.000    8.811    8.929
 calc_potential_gpw                 544  9.5    0.005    0.006    8.302    8.680
 mp2_eri_2c_integrate_gpw_pot_l     272 10.0    0.001    0.002    8.286    8.626
 parallel_gemm_fm                    14  9.1    0.000    0.000    8.390    8.511
 parallel_gemm_fm_cosma              14 10.1    8.390    8.511    8.390    8.511
 potential_pw2rs                    545 10.0    0.106    0.108    7.700    8.394
 collocate_single_gaussian          272 10.0    0.040    0.043    7.518    7.766
 create_integ_mat                     1  6.0    0.014    0.027    7.711    7.720
 array2fm                             1  7.0    0.000    0.000    6.719    7.145
 pw_scatter_s                      2720 13.7    4.439    4.569    4.439    4.569
 pw_gather_s                       2722 13.2    3.873    4.235    3.873    4.235
 array2fm_buffer_send                 1  8.0    3.009    3.181    3.009    3.181
 pw_poisson_solve                   545 10.5    1.123    1.172    2.186    2.393
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="10", plot="h2o_32_ri_rpa_mp2", label="RI-RPA (8n/2r/6t)", y=112.308869, yerr=0.000000
PlotPoint: name="11", plot="h2o_32_ri_rpa_mp2_mem", label="RI-RPA (8n/2r/6t)", y=2735.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/02/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         0.000000E+00       0.0%      0.0%      0.0%
 flops max/rank                      0.000000E+00       0.0%      0.0%      0.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                                  0       0.0%      0.0%      0.0%
 number of processed stacks                     0       0.0%      0.0%      0.0%
 average stack size                                     0.0       0.0       0.0
 marketing flops                     0.000000E+00
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                1                     12.
 MP_Allreduce           19                     21.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               22                 205321.
 MP_Allreduce          344                     10.
 MP_Sync                 4
 MP_comm_split           1
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.059    0.083  404.395  404.402
 farming_run                          1  2.0  402.896  402.906  404.266  404.311
 -------------------------------------------------------------------------------


 @@@@@@@@@@ Run number: 2 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32             16777216       0.0%      0.0%    100.0%
 flops    14 x    32 x    32            565182464       0.0%      0.0%    100.0%
 flops    29 x    32 x    32            585367552       0.0%      0.0%    100.0%
 flops    14 x    14 x    32            626196480       0.0%      0.0%    100.0%
 flops    29 x    14 x    32            638582784       0.0%      0.0%    100.0%
 flops    14 x    29 x    32            638582784       0.0%      0.0%    100.0%
 flops    29 x    29 x    32            682057728       0.0%      0.0%    100.0%
 flops    14 x    32 x    14         897827141120       0.0%      0.0%    100.0%
 flops    29 x    32 x    14         929989394432       0.0%      0.0%    100.0%
 flops    14 x    32 x    29         929989394432       0.0%      0.0%    100.0%
 flops    29 x    32 x    29         963203301376       0.0%      0.0%    100.0%
 flops    32 x    32 x    14        1693481172992       0.0%      0.0%    100.0%
 flops    32 x    32 x    29        1753962643456       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         7.172206E+12       0.0%      0.0%    100.0%
 flops max/rank                    150.696064E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          249788822       0.0%      0.0%    100.0%
 number of processed stacks                 98736       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    2529.9
 marketing flops                     7.174951E+12
 -------------------------------------------------------------------------------
 # multiplications                           1140
 max memory usage/rank               1.219482E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                   61440
 MPI messages size (bytes):
  total size                         6.073508E+09
  min size                           0.000000E+00
  max size                         642.960000E+03
  average size                      98.852664E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               32004                        0
       128 < size <=     8192                1820                 14909440
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072               18640               1081442304
    131072 < size <=  4194304                8976               4977156096
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         1003                     44.
 MP_Alltoall          1797                 713538.
 MP_ISend             3686                  54943.
 MP_IRecv             3622                  54292.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group               12
 MP_Bcast              703                 408373.
 MP_Allreduce         1821                  23730.
 MP_Sync                38
 MP_Alltoall            77                5929977.
 MP_SendRecv          2876                2171486.
 MP_ISendRecv         1034                 172620.
 MP_Wait              1346
 MP_comm_split           7
 MP_ISend              264                 362227.
 MP_IRecv              264                 362718.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.014    0.050  211.764  211.765
 qs_energies                          1  2.0    0.000    0.000  211.514  211.529
 scf_env_do_scf                       1  3.0    0.000    0.000  107.257  107.257
 qs_ks_update_qs_env                  5  5.0    0.000    0.000  105.784  105.795
 rebuild_ks_matrix                    4  6.0    0.000    0.000  105.783  105.793
 qs_ks_build_kohn_sham_matrix         4  7.0    0.056    0.067  105.783  105.793
 hfx_ks_matrix                        4  8.0    0.001    0.001  105.388  105.391
 integrate_four_center                4  9.0    0.143    0.452  105.387  105.390
 mp2_main                             1  3.0    0.003    0.026  103.966  103.981
 mp2_gpw_main                         1  4.0    0.038    0.074  102.461  102.481
 integrate_four_center_main           4 10.0    0.106    0.502   96.740   99.396
 integrate_four_center_bin          262 11.0   96.634   99.357   96.634   99.357
 init_scf_loop                        1  4.0    0.000    0.000   92.478   92.478
 mp2_ri_gpw_compute_in                1  5.0    0.070    0.113   74.873   76.001
 mp2_ri_gpw_compute_in_loop           1  6.0    0.003    0.039   54.545   55.670
 mp2_eri_3c_integrate_gpw            91  7.0    0.145    0.163   42.261   47.172
 integrate_v_rspace                  95  8.0    0.397    0.563   28.641   33.344
 pw_transfer                       2240 10.6    0.146    0.162   29.938   30.465
 fft_wrap_pw1pw2                   1868 11.4    0.018    0.020   28.955   29.516
 mp2_ri_gpw_compute_en                1  5.0    0.086    0.115   27.381   29.156
 grid_integrate_task_list            95  9.0   23.910   28.834   23.910   28.834
 fft_wrap_pw1pw2_100                730 12.4    1.263    1.416   26.647   27.059
 ao_to_mo_and_store_B_mult_1         91  7.0   10.600   25.868   10.600   25.868
 mp2_ri_gpw_compute_en_RI_loop        1  6.0    1.856    1.910   25.539   25.550
 get_2c_integrals                     1  6.0    0.001    0.009   20.208   20.265
 compute_2c_integrals                 1  7.0    0.011    0.041   19.187   19.197
 compute_2c_integrals_loop_lm         1  8.0    0.003    0.007   18.848   19.033
 mp2_eri_2c_integrate_gpw             1  9.0    1.738    1.875   18.845   19.032
 fft3d_s                           1823 13.4   18.410   18.831   18.423   18.844
 scf_env_do_scf_inner_loop            4  4.0    0.000    0.000   14.775   14.775
 calculate_wavefunction              91  8.0    2.028    2.061    9.751    9.983
 potential_pw2rs                    186 10.0    0.033    0.035    8.633    9.231
 mp2_ri_gpw_compute_en_expansio     172  7.0    0.554    0.575    8.758    9.078
 mp2_ri_gpw_compute_en_comm          22  7.0    0.493    0.507    8.532    8.831
 mp2_eri_2c_integrate_gpw_pot_l      91 10.0    0.001    0.001    8.227    8.620
 local_gemm                         172  8.0    8.204    8.514    8.204    8.514
 calc_potential_gpw                 182  9.5    0.002    0.002    7.922    8.164
 collocate_single_gaussian           91 10.0    0.017    0.021    7.873    8.126
 mp_sendrecv_dm3                   2068  8.0    6.590    6.881    6.590    6.881
 mp2_ri_gpw_compute_en_ener         172  7.0    6.348    6.448    6.348    6.448
 pw_gather_s                        912 13.2    4.928    5.431    4.928    5.431
 mp_sync                             38 10.4    3.306    5.356    3.306    5.356
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="20", plot="h2o_32_ri_rpa_mp2", label="RI-MP2 (8n/6r/2t)", y=102.436254, yerr=0.000000
PlotPoint: name="21", plot="h2o_32_ri_rpa_mp2_mem", label="RI-MP2 (8n/6r/2t)", y=1506.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/03/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     29.277748E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               5055360       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      29.1
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             451.588096E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 9436608
 MPI messages size (bytes):
  total size                       333.233553E+09
  min size                           0.000000E+00
  max size                         315.840000E+03
  average size                      35.312852E+03
 MPI breakdown and total messages size (bytes):
             size <=      128             4913240                        0
       128 < size <=     8192             1155432               9465298944
      8192 < size <=    32768             1984512              54190407680
     32768 < size <=   131072              551296              42776657920
    131072 < size <=  4194304              832128             226802306368
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3683                  62385.
 MP_Allreduce        10249                    271.
 MP_Sync               580
 MP_Alltoall          2083                 465423.
 MP_SendRecv         22610                   5520.
 MP_ISendRecv        22610                   5520.
 MP_Wait             37876
 MP_comm_split          50
 MP_ISend            20771                  42672.
 MP_IRecv            20771                  42672.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.093    0.200   59.120   59.122
 qs_mol_dyn_low                       1  2.0    0.003    0.005   58.551   58.558
 qs_forces                           11  3.9    0.002    0.004   58.487   58.488
 qs_energies                         11  4.9    0.002    0.009   56.820   56.838
 scf_env_do_scf                      11  5.9    0.002    0.011   49.442   49.442
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.011   46.726   46.727
 dbcsr_multiply_generic            2286 12.5    0.094    0.102   34.950   35.403
 qs_scf_new_mos                     108  7.5    0.000    0.001   34.456   34.707
 qs_scf_loop_do_ot                  108  8.5    0.000    0.001   34.456   34.706
 ot_scf_mini                        108  9.5    0.002    0.003   32.714   32.892
 velocity_verlet                     10  3.0    0.002    0.002   28.851   28.852
 multiply_cannon                   2286 13.5    0.194    0.207   26.179   27.707
 multiply_cannon_loop              2286 14.5    1.489    1.564   25.070   26.588
 ot_mini                            108 10.5    0.001    0.001   20.335   20.566
 qs_ot_get_derivative               108 11.5    0.001    0.001   17.318   17.501
 mp_waitall_1                    245248 16.5    9.226   15.165    9.226   15.165
 multiply_cannon_metrocomm3       54864 15.5    0.066    0.072    5.818   13.152
 multiply_cannon_multrec          54864 15.5    4.191    6.558    7.459   10.788
 rebuild_ks_matrix                  119  8.3    0.000    0.000    9.137    9.251
 qs_ks_build_kohn_sham_matrix       119  9.3    0.010    0.013    9.136    9.251
 qs_ks_update_qs_env                119  7.6    0.001    0.001    8.035    8.142
 qs_ot_get_p                        119 10.4    0.003    0.018    7.642    7.928
 mp_sum_l                          7207 12.9    5.649    7.334    5.649    7.334
 multiply_cannon_sync_h2d         54864 15.5    5.789    7.118    5.789    7.118
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    5.778    6.288
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    5.994    6.097
 qs_rho_update_rho_low              119  7.7    0.000    0.001    5.718    5.833
 calculate_rho_elec                 119  8.7    0.011    0.016    5.718    5.832
 init_scf_run                        11  5.9    0.000    0.001    5.808    5.809
 scf_env_initial_rho_setup           11  6.9    0.001    0.002    5.808    5.809
 sum_up_and_integrate               119 10.3    0.012    0.015    5.461    5.480
 integrate_v_rspace                 119 11.3    0.002    0.002    5.449    5.469
 dbcsr_mm_accdrv_process          76910 16.1    1.168    1.781    3.190    4.459
 rs_pw_transfer                     974 11.9    0.011    0.012    4.179    4.331
 qs_ot_p2m_diag                      50 11.0    0.004    0.007    4.123    4.174
 density_rs2pw                      119  9.7    0.004    0.004    3.591    3.736
 multiply_cannon_metrocomm1       54864 15.5    0.051    0.056    2.020    3.390
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    3.041    3.241
 apply_single                       119 13.6    0.000    0.000    3.041    3.241
 cp_dbcsr_syevd                      50 12.0    0.002    0.003    3.225    3.226
 calculate_dm_sparse                119  9.5    0.000    0.001    3.011    3.130
 make_m2s                          4572 13.5    0.055    0.057    2.875    2.949
 wfi_extrapolate                     11  7.9    0.001    0.001    2.896    2.896
 make_images                       4572 14.5    0.134    0.140    2.790    2.863
 pw_transfer                       1439 11.6    0.051    0.056    2.712    2.838
 calculate_first_density_matrix       1  7.0    0.000    0.003    2.801    2.804
 fft_wrap_pw1pw2                   1201 12.6    0.006    0.007    2.637    2.765
 ot_diis_step                       108 11.5    0.006    0.006    2.750    2.752
 potential_pw2rs                    119 12.3    0.004    0.004    2.697    2.742
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    2.723    2.725
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.707    2.710
 cp_fm_redistribute_end              50 14.0    2.461    2.678    2.468    2.682
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    2.584    2.654
 init_scf_loop                       11  6.9    0.001    0.004    2.636    2.640
 jit_kernel_multiply                 13 15.8    1.959    2.619    1.959    2.619
 cp_fm_diag_elpa_base                50 14.0    0.211    2.547    0.212    2.556
 fft3d_ps                          1201 14.6    0.362    0.467    2.420    2.544
 mp_alltoall_d11v                  2130 13.8    2.145    2.306    2.145    2.306
 acc_transpose_blocks             54864 15.5    0.230    0.250    1.802    2.288
 mp_sum_d                          4125 12.0    1.575    2.234    1.575    2.234
 fft_wrap_pw1pw2_140                487 13.2    0.076    0.091    2.052    2.180
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.121    2.175
 grid_integrate_task_list           119 12.3    2.007    2.117    2.007    2.117
 make_images_sizes                 4572 15.5    0.004    0.005    1.446    1.764
 mp_alltoall_i44                   4572 16.5    1.442    1.760    1.442    1.760
 mp_waitany                       12084 13.8    1.431    1.612    1.431    1.612
 mp_alltoall_z22v                  1201 16.6    1.507    1.606    1.507    1.606
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.452    1.473
 grid_collocate_task_list           119  9.7    1.290    1.374    1.290    1.374
 arnoldi_extremal                   119 11.4    0.002    0.009    1.153    1.287
 arnoldi_normal_ev                  119 12.4    0.006    0.035    1.151    1.286
 parallel_gemm_fm                    81  9.0    0.000    0.000    1.260    1.265
 parallel_gemm_fm_cosma              81 10.0    1.260    1.265    1.260    1.265
 prepare_preconditioner              11  7.9    0.000    0.000    1.169    1.197
 make_preconditioner                 11  8.9    0.000    0.002    1.169    1.197
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="100", plot="h2o_64_md", label="(8n/12r/1t)", y=59.122000, yerr=0.000000
PlotPoint: name="101", plot="h2o_64_md_mem", label="(8n/12r/1t)", y=430.454545, yerr=0.782030
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/04/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     57.173320E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3066240       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      47.9
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             487.583744E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 2194560
 MPI messages size (bytes):
  total size                       310.646604E+09
  min size                           0.000000E+00
  max size                           1.145520E+06
  average size                     141.553031E+03
 MPI breakdown and total messages size (bytes):
             size <=      128              724648                        0
       128 < size <=     8192              253512               2076770304
      8192 < size <=    32768              281952               4619501568
     32768 < size <=   131072              494448              39143342080
    131072 < size <=  4194304              440000             264807943488
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3672                  62664.
 MP_Allreduce        10226                    305.
 MP_Sync               104
 MP_Alltoall          2060                1063860.
 MP_SendRecv         16779                  37093.
 MP_ISendRecv        16779                  37093.
 MP_Wait             23539
 MP_comm_split          50
 MP_ISend             5720                 128509.
 MP_IRecv             5720                 128509.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.027    0.039   43.095   43.097
 qs_mol_dyn_low                       1  2.0    0.003    0.004   42.756   42.764
 qs_forces                           11  3.9    0.002    0.004   42.689   42.690
 qs_energies                         11  4.9    0.002    0.008   40.860   40.864
 scf_env_do_scf                      11  5.9    0.003    0.023   34.940   34.941
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.006   31.978   31.979
 dbcsr_multiply_generic            2286 12.5    0.100    0.103   23.081   23.538
 qs_scf_new_mos                     108  7.5    0.001    0.001   22.171   22.411
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   22.171   22.411
 ot_scf_mini                        108  9.5    0.003    0.004   21.182   21.358
 velocity_verlet                     10  3.0    0.002    0.002   20.149   20.150
 multiply_cannon                   2286 13.5    0.211    0.219   17.069   19.017
 multiply_cannon_loop              2286 14.5    0.900    0.970   15.690   17.552
 ot_mini                            108 10.5    0.001    0.001   12.902   13.150
 mp_waitall_1                    200699 16.5    6.639   11.855    6.639   11.855
 qs_ot_get_derivative               108 11.5    0.001    0.001   10.255   10.431
 multiply_cannon_metrocomm3       27432 15.5    0.068    0.070    4.586    9.879
 multiply_cannon_multrec          27432 15.5    1.945    4.350    5.950    8.740
 rebuild_ks_matrix                  119  8.3    0.000    0.000    7.654    7.849
 qs_ks_build_kohn_sham_matrix       119  9.3    0.026    0.071    7.653    7.849
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.728    6.904
 dbcsr_mm_accdrv_process          47894 16.0    3.087    5.380    3.935    5.870
 qs_ot_get_p                        119 10.4    0.001    0.001    5.146    5.371
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    3.860    4.758
 sum_up_and_integrate               119 10.3    0.024    0.027    4.591    4.596
 integrate_v_rspace                 119 11.3    0.002    0.002    4.566    4.574
 mp_sum_l                          7207 12.9    2.483    4.464    2.483    4.464
 init_scf_run                        11  5.9    0.000    0.001    4.449    4.450
 scf_env_initial_rho_setup           11  6.9    0.002    0.004    4.449    4.450
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    3.208    4.269
 apply_single                       119 13.6    0.000    0.000    3.208    4.269
 qs_rho_update_rho_low              119  7.7    0.001    0.001    4.131    4.167
 calculate_rho_elec                 119  8.7    0.021    0.024    4.131    4.167
 make_m2s                          4572 13.5    0.054    0.056    3.091    3.388
 qs_ot_p2m_diag                      50 11.0    0.009    0.013    3.323    3.344
 rs_pw_transfer                     974 11.9    0.010    0.011    3.261    3.340
 make_images                       4572 14.5    0.201    0.238    3.001    3.295
 init_scf_loop                       11  6.9    0.001    0.006    2.908    2.908
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    2.838    2.838
 calculate_first_density_matrix       1  7.0    0.001    0.004    2.759    2.762
 multiply_cannon_sync_h2d         27432 15.5    2.134    2.735    2.134    2.735
 ot_diis_step                       108 11.5    0.010    0.011    2.578    2.579
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    2.481    2.569
 density_rs2pw                      119  9.7    0.004    0.004    2.459    2.529
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    2.387    2.388
 cp_fm_redistribute_end              50 14.0    1.964    2.345    1.970    2.347
 potential_pw2rs                    119 12.3    0.006    0.006    2.294    2.305
 cp_fm_diag_elpa_base                50 14.0    0.362    2.211    0.375    2.267
 calculate_dm_sparse                119  9.5    0.000    0.001    2.174    2.260
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.208    2.211
 pw_transfer                       1439 11.6    0.063    0.067    2.017    2.049
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.928    1.973
 jit_kernel_multiply                 10 16.1    0.795    1.967    0.795    1.967
 fft_wrap_pw1pw2                   1201 12.6    0.007    0.008    1.928    1.961
 grid_integrate_task_list           119 12.3    1.825    1.941    1.825    1.941
 prepare_preconditioner              11  7.9    0.000    0.000    1.897    1.936
 make_preconditioner                 11  8.9    0.007    0.037    1.897    1.936
 make_images_data                  4572 15.5    0.045    0.051    1.334    1.863
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.776    1.855
 hybrid_alltoall_any               4725 16.4    0.051    0.112    1.157    1.702
 fft3d_ps                          1201 14.6    0.508    0.563    1.636    1.665
 mp_allgather_i34                  2286 14.5    0.819    1.660    0.819    1.660
 mp_alltoall_d11v                  2130 13.8    1.474    1.651    1.474    1.651
 wfi_extrapolate                     11  7.9    0.001    0.001    1.617    1.617
 acc_transpose_blocks             27432 15.5    0.108    0.113    1.186    1.548
 fft_wrap_pw1pw2_140                487 13.2    0.075    0.081    1.353    1.384
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.325    1.369
 grid_collocate_task_list           119  9.7    1.242    1.342    1.242    1.342
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.304    1.314
 make_images_sizes                 4572 15.5    0.005    0.005    0.920    1.260
 mp_alltoall_i44                   4572 16.5    0.916    1.256    0.916    1.256
 mp_sum_d                          4125 12.0    0.804    1.254    0.804    1.254
 qs_energies_init_hamiltonians       11  5.9    0.022    0.082    1.135    1.137
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.989    1.007
 acc_transpose_blocks_kernels     27432 16.5    0.180    0.270    0.658    0.934
 rs_pw_transfer_PW2RS_50            119 14.3    0.584    0.603    0.888    0.923
 mp_alltoall_z22v                  1201 16.6    0.831    0.901    0.831    0.901
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="102", plot="h2o_64_md", label="(8n/6r/2t)", y=43.097000, yerr=0.000000
PlotPoint: name="103", plot="h2o_64_md_mem", label="(8n/6r/2t)", y=463.636364, yerr=1.966664
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/05/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     59.051995E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3143552       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      46.8
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             523.100160E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                  950976
 MPI messages size (bytes):
  total size                       203.844256E+09
  min size                           0.000000E+00
  max size                           1.638400E+06
  average size                     214.352688E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192              253512               2076770304
      8192 < size <=    32768              179424               2939682816
     32768 < size <=   131072              181440              14863564800
    131072 < size <=  4194304              330176             183964913216
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3672                  62660.
 MP_Allreduce        10225                    303.
 MP_Sync               104
 MP_Alltoall          1821                1607811.
 MP_SendRecv         11067                  57667.
 MP_ISendRecv        11067                  57667.
 MP_Wait             21987
 MP_comm_split          50
 MP_ISend             9880                  92618.
 MP_IRecv             9880                  92618.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.165    0.427   37.288   37.292
 qs_mol_dyn_low                       1  2.0    0.003    0.005   36.319   36.328
 qs_forces                           11  3.9    0.021    0.063   36.118   36.119
 qs_energies                         11  4.9    0.002    0.007   34.333   34.356
 scf_env_do_scf                      11  5.9    0.001    0.004   29.108   29.108
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.008   25.491   25.492
 dbcsr_multiply_generic            2286 12.5    0.094    0.098   17.899   18.141
 velocity_verlet                     10  3.0    0.001    0.002   17.901   17.907
 qs_scf_new_mos                     108  7.5    0.001    0.001   16.815   16.833
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   16.815   16.833
 ot_scf_mini                        108  9.5    0.004    0.017   15.999   16.014
 multiply_cannon                   2286 13.5    0.197    0.205   13.654   14.612
 multiply_cannon_loop              2286 14.5    0.636    0.665   12.671   13.765
 ot_mini                            108 10.5    0.001    0.001    9.825    9.853
 qs_ot_get_derivative               108 11.5    0.001    0.001    8.222    8.240
 multiply_cannon_multrec          18288 15.5    1.929    2.890    6.754    7.066
 rebuild_ks_matrix                  119  8.3    0.000    0.000    6.903    6.945
 qs_ks_build_kohn_sham_matrix       119  9.3    0.013    0.015    6.902    6.945
 qs_ks_update_qs_env                119  7.6    0.001    0.001    5.996    6.034
 dbcsr_mm_accdrv_process          38222 16.0    4.107    5.369    4.741    5.576
 mp_waitall_1                    158411 16.6    3.692    5.112    3.692    5.112
 sum_up_and_integrate               119 10.3    0.030    0.031    4.411    4.422
 integrate_v_rspace                 119 11.3    0.002    0.003    4.380    4.395
 init_scf_run                        11  5.9    0.000    0.001    3.964    3.965
 scf_env_initial_rho_setup           11  6.9    0.001    0.002    3.964    3.964
 qs_ot_get_p                        119 10.4    0.001    0.001    3.732    3.755
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    3.082    3.711
 qs_rho_update_rho_low              119  7.7    0.001    0.001    3.641    3.662
 calculate_rho_elec                 119  8.7    0.030    0.031    3.640    3.661
 init_scf_loop                       11  6.9    0.001    0.005    3.595    3.596
 rs_pw_transfer                     974 11.9    0.009    0.010    2.872    3.034
 multiply_cannon_metrocomm3       18288 15.5    0.045    0.046    1.759    2.974
 prepare_preconditioner              11  7.9    0.000    0.000    2.674    2.677
 make_preconditioner                 11  8.9    0.000    0.001    2.674    2.677
 make_full_inverse_cholesky          11  9.9    0.000    0.000    2.471    2.575
 calculate_first_density_matrix       1  7.0    0.001    0.004    2.470    2.471
 make_m2s                          4572 13.5    0.046    0.047    2.293    2.459
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.094    2.428
 apply_single                       119 13.6    0.000    0.000    2.094    2.428
 qs_ot_p2m_diag                      50 11.0    0.012    0.013    2.372    2.382
 make_images                       4572 14.5    0.193    0.205    2.206    2.372
 density_rs2pw                      119  9.7    0.004    0.004    2.171    2.293
 mp_sum_l                          7207 12.9    1.726    2.230    1.726    2.230
 potential_pw2rs                    119 12.3    0.007    0.007    2.154    2.177
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    2.051    2.052
 pw_transfer                       1439 11.6    0.064    0.066    1.981    2.017
 calculate_dm_sparse                119  9.5    0.001    0.002    1.945    1.956
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    1.890    1.928
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    1.915    1.923
 jit_kernel_multiply                 10 16.1    0.582    1.898    0.582    1.898
 grid_integrate_task_list           119 12.3    1.796    1.873    1.796    1.873
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    1.799    1.802
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.691    1.695
 cp_fm_redistribute_end              50 14.0    1.256    1.659    1.258    1.660
 fft3d_ps                          1201 14.6    0.508    0.527    1.578    1.616
 cp_fm_diag_elpa_base                50 14.0    0.383    1.555    0.397    1.597
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.551    1.576
 ot_diis_step                       108 11.5    0.011    0.011    1.561    1.562
 multiply_cannon_sync_h2d         18288 15.5    1.339    1.535    1.339    1.535
 wfi_extrapolate                     11  7.9    0.001    0.001    1.446    1.446
 fft_wrap_pw1pw2_140                487 13.2    0.086    0.089    1.390    1.422
 grid_collocate_task_list           119  9.7    1.204    1.285    1.204    1.285
 cp_fm_cholesky_invert               11 10.9    1.259    1.268    1.259    1.268
 make_images_data                  4572 15.5    0.045    0.049    1.053    1.265
 acc_transpose_blocks             18288 15.5    0.075    0.077    1.241    1.263
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.224    1.233
 mp_alltoall_d11v                  2130 13.8    1.035    1.167    1.035    1.167
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.121    1.156
 hybrid_alltoall_any               4725 16.4    0.055    0.113    0.915    1.092
 multiply_cannon_metrocomm1       18288 15.5    0.028    0.029    0.540    1.069
 qs_energies_init_hamiltonians       11  5.9    0.000    0.001    1.009    1.030
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.985    0.995
 mp_alltoall_z22v                  1201 16.6    0.885    0.987    0.885    0.987
 dbcsr_complete_redistribute        329 12.2    0.090    0.097    0.804    0.931
 mp_allgather_i34                  2286 14.5    0.439    0.916    0.439    0.916
 acc_transpose_blocks_kernels     18288 16.5    0.209    0.219    0.801    0.811
 make_images_sizes                 4572 15.5    0.005    0.005    0.550    0.762
 mp_alltoall_i44                   4572 16.5    0.545    0.757    0.545    0.757
 mp_waitany                        9880 13.7    0.607    0.751    0.607    0.751
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="104", plot="h2o_64_md", label="(8n/4r/3t)", y=37.292000, yerr=0.000000
PlotPoint: name="105", plot="h2o_64_md_mem", label="(8n/4r/3t)", y=496.545455, yerr=2.387986
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/06/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    114.044384E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3805952       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      38.6
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             549.756928E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1042416
 MPI messages size (bytes):
  total size                       150.443262E+09
  min size                           0.000000E+00
  max size                           1.188816E+06
  average size                     144.321719E+03
 MPI breakdown and total messages size (bytes):
             size <=      128              228256                        0
       128 < size <=     8192              126888               1039466496
      8192 < size <=    32768              191472               3137077248
     32768 < size <=   131072              295800              25899827200
    131072 < size <=  4194304              200000             120367247040
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3672                  62659.
 MP_Allreduce        10224                    344.
 MP_Sync               104
 MP_Alltoall          1582                2412273.
 MP_SendRecv          8211                  74133.
 MP_ISendRecv         8211                  74133.
 MP_Wait             16271
 MP_comm_split          50
 MP_ISend             7280                 135929.
 MP_IRecv             7280                 135929.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.108    0.181   37.477   37.495
 qs_mol_dyn_low                       1  2.0    0.005    0.048   36.626   36.633
 qs_forces                           11  3.9    0.018    0.035   36.564   36.568
 qs_energies                         11  4.9    0.034    0.115   34.785   34.799
 scf_env_do_scf                      11  5.9    0.011    0.035   29.078   29.081
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.008   25.419   25.420
 dbcsr_multiply_generic            2286 12.5    0.099    0.101   18.713   18.836
 velocity_verlet                     10  3.0    0.002    0.002   18.311   18.313
 qs_scf_new_mos                     108  7.5    0.001    0.001   16.830   16.889
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   16.830   16.888
 ot_scf_mini                        108  9.5    0.003    0.004   15.879   15.931
 multiply_cannon                   2286 13.5    0.236    0.281   14.722   15.158
 multiply_cannon_loop              2286 14.5    0.936    0.972   13.702   14.062
 ot_mini                            108 10.5    0.001    0.001    9.797    9.855
 multiply_cannon_multrec          27432 15.5    2.417    3.102    8.737    9.149
 qs_ot_get_derivative               108 11.5    0.001    0.001    7.963    8.019
 dbcsr_mm_accdrv_process          47916 15.9    5.314    7.415    6.225    7.550
 rebuild_ks_matrix                  119  8.3    0.000    0.000    6.764    6.815
 qs_ks_build_kohn_sham_matrix       119  9.3    0.013    0.016    6.764    6.814
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.020    6.064
 init_scf_run                        11  5.9    0.000    0.001    4.034    4.043
 scf_env_initial_rho_setup           11  6.9    0.025    0.039    4.034    4.043
 sum_up_and_integrate               119 10.3    0.035    0.037    3.978    3.986
 integrate_v_rspace                 119 11.3    0.002    0.003    3.943    3.952
 init_scf_loop                       11  6.9    0.003    0.013    3.611    3.613
 qs_rho_update_rho_low              119  7.7    0.001    0.001    3.515    3.548
 calculate_rho_elec                 119  8.7    0.040    0.046    3.515    3.548
 qs_ot_get_p                        119 10.4    0.001    0.001    3.442    3.511
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    2.898    3.347
 mp_waitall_1                    137007 16.6    2.242    2.834    2.242    2.834
 prepare_preconditioner              11  7.9    0.000    0.000    2.700    2.708
 make_preconditioner                 11  8.9    0.003    0.014    2.700    2.708
 make_full_inverse_cholesky          11  9.9    0.000    0.000    2.303    2.630
 calculate_first_density_matrix       1  7.0    0.005    0.028    2.581    2.614
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.117    2.563
 apply_single                       119 13.6    0.000    0.000    2.116    2.563
 rs_pw_transfer                     974 11.9    0.009    0.009    2.373    2.520
 make_m2s                          4572 13.5    0.055    0.057    2.409    2.490
 make_images                       4572 14.5    0.274    0.334    2.300    2.380
 calculate_dm_sparse                119  9.5    0.000    0.000    2.042    2.093
 density_rs2pw                      119  9.7    0.004    0.004    1.968    2.073
 qs_ot_p2m_diag                      50 11.0    0.015    0.023    2.051    2.062
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    2.025    2.053
 jit_kernel_multiply                 10 15.7    0.850    1.916    0.850    1.916
 pw_transfer                       1439 11.6    0.063    0.066    1.872    1.906
 grid_integrate_task_list           119 12.3    1.821    1.890    1.821    1.890
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    1.851    1.852
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    1.782    1.818
 mp_sum_l                          7207 12.9    1.223    1.805    1.223    1.805
 ot_diis_step                       108 11.5    0.012    0.012    1.790    1.790
 potential_pw2rs                    119 12.3    0.008    0.009    1.748    1.759
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    1.724    1.725
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.643    1.654
 fft3d_ps                          1201 14.6    0.536    0.585    1.464    1.490
 acc_transpose_blocks             27432 15.5    0.112    0.115    1.456    1.487
 multiply_cannon_metrocomm3       27432 15.5    0.038    0.039    0.845    1.457
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.453    1.455
 fft_wrap_pw1pw2_140                487 13.2    0.084    0.092    1.414    1.453
 cp_fm_redistribute_end              50 14.0    0.962    1.426    0.964    1.427
 cp_fm_diag_elpa_base                50 14.0    0.441    1.349    0.460    1.387
 wfi_extrapolate                     11  7.9    0.001    0.001    1.379    1.379
 grid_collocate_task_list           119  9.7    1.222    1.290    1.222    1.290
 qs_energies_init_hamiltonians       11  5.9    0.003    0.010    1.238    1.257
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.211    1.221
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.142    1.157
 cp_fm_upper_to_full                 72 13.5    0.805    1.114    0.805    1.114
 dbcsr_complete_redistribute        329 12.2    0.127    0.156    0.833    1.107
 make_images_data                  4572 15.5    0.045    0.049    0.980    1.090
 multiply_cannon_sync_h2d         27432 15.5    0.980    1.079    0.980    1.079
 hybrid_alltoall_any               4725 16.4    0.061    0.151    0.847    0.996
 mp_alltoall_d11v                  2130 13.8    0.849    0.925    0.849    0.925
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.000    0.795    0.874
 copy_fm_to_dbcsr                   176 11.2    0.001    0.001    0.602    0.869
 cp_fm_cholesky_invert               11 10.9    0.853    0.856    0.853    0.856
 acc_transpose_blocks_kernels     27432 16.5    0.266    0.274    0.833    0.848
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.837    0.842
 mp_alltoall_z22v                  1201 16.6    0.810    0.841    0.810    0.841
 mp_alltoall_i22                    627 13.8    0.466    0.770    0.466    0.770
 mp_sum_d                          4123 12.0    0.576    0.758    0.576    0.758
 qs_env_update_s_mstruct             11  6.9    0.001    0.009    0.705    0.755
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="106", plot="h2o_64_md", label="(8n/3r/4t)", y=37.495000, yerr=0.000000
PlotPoint: name="107", plot="h2o_64_md_mem", label="(8n/3r/4t)", y=522.545455, yerr=2.641062
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/07/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    117.977176E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               1384136       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     106.2
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             608.665600E+06
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  219456
 MPI messages size (bytes):
  total size                        97.042514E+09
  min size                           0.000000E+00
  max size                           3.276800E+06
  average size                     442.195750E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768              101892               3336634368
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304              116112              93705670464
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         8156                     20.
 MP_Alltoall          8655                  64935.
 MP_ISend            36532                 168375.
 MP_IRecv            36532                 168349.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3672                  62658.
 MP_Allreduce        10224                    344.
 MP_Sync               104
 MP_Alltoall          1582                3682667.
 MP_SendRecv          5355                  94533.
 MP_ISendRecv         5355                  94533.
 MP_Wait             11335
 MP_comm_split          50
 MP_ISend             5200                 225425.
 MP_IRecv             5200                 225425.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.039    0.058   29.244   29.245
 qs_mol_dyn_low                       1  2.0    0.003    0.003   28.977   28.984
 qs_forces                           11  3.9    0.004    0.005   28.897   28.898
 qs_energies                         11  4.9    0.001    0.001   27.174   27.177
 scf_env_do_scf                      11  5.9    0.000    0.001   22.262   22.263
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.006   19.591   19.592
 velocity_verlet                     10  3.0    0.009    0.011   14.954   14.958
 dbcsr_multiply_generic            2286 12.5    0.091    0.093   12.581   12.662
 qs_scf_new_mos                     108  7.5    0.001    0.001   11.681   11.706
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   11.680   11.705
 ot_scf_mini                        108  9.5    0.002    0.002   10.972   11.003
 multiply_cannon                   2286 13.5    0.231    0.239    9.715   10.236
 multiply_cannon_loop              2286 14.5    0.329    0.342    8.737    8.955
 ot_mini                            108 10.5    0.001    0.001    5.993    6.032
 rebuild_ks_matrix                  119  8.3    0.000    0.000    5.995    6.021
 qs_ks_build_kohn_sham_matrix       119  9.3    0.012    0.013    5.995    6.021
 multiply_cannon_multrec           9144 15.5    1.586    1.822    5.757    5.982
 qs_ks_update_qs_env                119  7.6    0.001    0.001    5.359    5.382
 qs_ot_get_derivative               108 11.5    0.001    0.001    4.669    4.703
 dbcsr_mm_accdrv_process          12550 15.8    3.053    3.601    4.068    4.156
 sum_up_and_integrate               119 10.3    0.037    0.041    3.695    3.699
 integrate_v_rspace                 119 11.3    0.002    0.003    3.658    3.662
 init_scf_run                        11  5.9    0.000    0.001    3.468    3.468
 scf_env_initial_rho_setup           11  6.9    0.001    0.002    3.468    3.468
 qs_rho_update_rho_low              119  7.7    0.001    0.001    3.427    3.437
 calculate_rho_elec                 119  8.7    0.059    0.061    3.426    3.436
 qs_ot_get_p                        119 10.4    0.001    0.001    3.118    3.158
 init_scf_loop                       11  6.9    0.000    0.000    2.647    2.648
 mp_waitall_1                    115863 16.7    1.821    2.465    1.821    2.465
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.218    2.219
 make_m2s                          4572 13.5    0.036    0.036    1.972    2.127
 qs_ot_p2m_diag                      50 11.0    0.022    0.023    2.070    2.072
 make_images                       4572 14.5    0.270    0.308    1.881    2.034
 rs_pw_transfer                     974 11.9    0.008    0.008    1.907    1.989
 grid_integrate_task_list           119 12.3    1.845    1.927    1.845    1.927
 prepare_preconditioner              11  7.9    0.000    0.000    1.887    1.892
 make_preconditioner                 11  8.9    0.000    0.000    1.887    1.892
 density_rs2pw                      119  9.7    0.003    0.004    1.817    1.891
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    1.853    1.853
 pw_transfer                       1439 11.6    0.064    0.065    1.818    1.826
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.771    1.806
 calculate_dm_sparse                119  9.5    0.000    0.000    1.763    1.782
 jit_kernel_multiply                 10 15.8    0.976    1.777    0.976    1.777
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    1.727    1.735
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    1.613    1.631
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    1.558    1.561
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.507    1.508
 cp_fm_redistribute_end              50 14.0    0.752    1.480    0.753    1.481
 potential_pw2rs                    119 12.3    0.010    0.010    1.469    1.474
 cp_fm_diag_elpa_base                50 14.0    0.683    1.406    0.726    1.460
 fft3d_ps                          1201 14.6    0.543    0.555    1.395    1.403
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.373    1.382
 grid_collocate_task_list           119  9.7    1.271    1.377    1.271    1.377
 fft_wrap_pw1pw2_140                487 13.2    0.082    0.085    1.365    1.374
 ot_diis_step                       108 11.5    0.012    0.013    1.308    1.309
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    1.245    1.261
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    1.226    1.228
 wfi_extrapolate                     11  7.9    0.001    0.001    1.201    1.202
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    1.180    1.202
 apply_single                       119 13.6    0.000    0.000    1.179    1.201
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.153    1.158
 hybrid_alltoall_any               4725 16.4    0.062    0.176    0.889    1.142
 make_images_data                  4572 15.5    0.039    0.043    0.928    1.131
 mp_alltoall_d11v                  2130 13.8    0.907    1.035    0.907    1.035
 cp_fm_cholesky_invert               11 10.9    0.966    0.970    0.966    0.970
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.000    0.869    0.922
 acc_transpose_blocks              9144 15.5    0.038    0.039    0.888    0.895
 multiply_cannon_sync_h2d          9144 15.5    0.712    0.799    0.712    0.799
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    0.776    0.786
 mp_allgather_i34                  2286 14.5    0.313    0.763    0.313    0.763
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.759    0.763
 mp_alltoall_z22v                  1201 16.6    0.726    0.761    0.726    0.761
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    0.679    0.721
 mp_sum_l                          7207 12.9    0.494    0.689    0.494    0.689
 multiply_cannon_metrocomm3        9144 15.5    0.019    0.019    0.346    0.685
 acc_transpose_blocks_kernels      9144 16.5    0.115    0.118    0.649    0.652
 dbcsr_complete_redistribute        329 12.2    0.160    0.170    0.594    0.626
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="108", plot="h2o_64_md", label="(8n/2r/6t)", y=29.245000, yerr=0.000000
PlotPoint: name="109", plot="h2o_64_md_mem", label="(8n/2r/6t)", y=573.909091, yerr=8.050461
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/08/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    235.585836E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               1388964       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     105.8
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             753.713152E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   91440
 MPI messages size (bytes):
  total size                        85.748679E+09
  min size                           0.000000E+00
  max size                           6.553600E+06
  average size                     937.758938E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768               21148                692256768
     32768 < size <=   131072               19224               1259864064
    131072 < size <=  4194304               41040              21941452800
   4194304 < size <= 16777216                9456              61855174464
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63729.
 MP_Allreduce        10074                    433.
 MP_Sync                54
 MP_Alltoall          1582                7383731.
 MP_SendRecv          2499                 189067.
 MP_ISendRecv         2499                 189067.
 MP_Wait              6399
 MP_ISend             3120                 546875.
 MP_IRecv             3120                 546875.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.127    0.143   45.488   45.489
 qs_mol_dyn_low                       1  2.0    0.003    0.003   44.537   44.546
 qs_forces                           11  3.9    0.002    0.002   44.473   44.474
 qs_energies                         11  4.9    0.001    0.001   42.410   42.414
 scf_env_do_scf                      11  5.9    0.001    0.001   35.618   35.618
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.006   27.360   27.362
 velocity_verlet                     10  3.0    0.002    0.002   24.739   24.745
 dbcsr_multiply_generic            2286 12.5    0.107    0.112   19.135   19.421
 qs_scf_new_mos                     108  7.5    0.001    0.001   17.495   17.586
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   17.494   17.585
 ot_scf_mini                        108  9.5    0.002    0.002   16.336   16.434
 multiply_cannon                   2286 13.5    0.304    0.312   14.445   15.559
 multiply_cannon_loop              2286 14.5    0.343    0.351   13.021   14.089
 ot_mini                            108 10.5    0.001    0.001    9.481    9.599
 multiply_cannon_multrec           9144 15.5    3.301    4.717    8.743    8.907
 init_scf_loop                       11  6.9    0.000    0.000    8.222    8.226
 rebuild_ks_matrix                  119  8.3    0.000    0.000    7.449    7.596
 qs_ks_build_kohn_sham_matrix       119  9.3    0.013    0.013    7.449    7.596
 qs_ot_get_derivative               108 11.5    0.001    0.001    7.379    7.473
 prepare_preconditioner              11  7.9    0.000    0.000    7.175    7.189
 make_preconditioner                 11  8.9    0.000    0.000    7.175    7.189
 make_full_inverse_cholesky          11  9.9    0.000    0.000    5.679    7.040
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.705    6.840
 dbcsr_mm_accdrv_process          12550 15.8    4.294    5.698    5.317    6.630
 cp_fm_upper_to_full                 72 14.2    3.173    4.586    3.173    4.586
 init_scf_run                        11  5.9    0.000    0.001    4.494    4.494
 scf_env_initial_rho_setup           11  6.9    0.002    0.003    4.494    4.494
 mp_waitall_1                     94719 16.7    3.270    4.405    3.270    4.405
 qs_rho_update_rho_low              119  7.7    0.001    0.001    4.225    4.264
 calculate_rho_elec                 119  8.7    0.118    0.121    4.224    4.264
 sum_up_and_integrate               119 10.3    0.064    0.065    4.064    4.071
 integrate_v_rspace                 119 11.3    0.003    0.003    4.000    4.007
 qs_ot_get_p                        119 10.4    0.001    0.001    3.858    4.003
 make_m2s                          4572 13.5    0.039    0.040    2.887    3.134
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    2.696    3.111
 make_images                       4572 14.5    0.355    0.383    2.764    3.011
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.984    2.991
 dbcsr_complete_redistribute        329 12.2    0.292    0.304    2.147    2.988
 multiply_cannon_metrocomm3        9144 15.5    0.020    0.020    1.659    2.640
 copy_fm_to_dbcsr                   176 11.2    0.001    0.001    1.792    2.622
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.252    2.531
 apply_single                       119 13.6    0.000    0.000    2.252    2.531
 calculate_dm_sparse                119  9.5    0.000    0.000    2.453    2.479
 pw_transfer                       1439 11.6    0.066    0.067    2.405    2.412
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    2.310    2.316
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    1.491    2.315
 mp_alltoall_i22                    627 13.8    1.462    2.308    1.462    2.308
 qs_ot_p2m_diag                      50 11.0    0.043    0.044    2.288    2.289
 density_rs2pw                      119  9.7    0.003    0.003    2.242    2.263
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.190    2.191
 ot_diis_step                       108 11.5    0.014    0.015    2.062    2.063
 grid_integrate_task_list           119 12.3    2.031    2.056    2.031    2.056
 mp_sum_l                          7207 12.9    1.277    2.025    1.277    2.025
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    1.989    1.989
 qs_energies_init_hamiltonians       11  5.9    0.020    0.026    1.987    1.988
 rs_pw_transfer                     974 11.9    0.009    0.009    1.914    1.953
 fft3d_ps                          1201 14.6    0.581    0.591    1.944    1.952
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.877    1.928
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    1.857    1.906
 fft_wrap_pw1pw2_140                487 13.2    0.088    0.091    1.857    1.866
 make_images_data                  4572 15.5    0.043    0.046    1.439    1.772
 hybrid_alltoall_any               4725 16.4    0.087    0.149    1.424    1.730
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.659    1.659
 cp_fm_diag_elpa_base                50 14.0    1.513    1.567    1.657    1.657
 cp_fm_cholesky_invert               11 10.9    1.631    1.634    1.631    1.634
 potential_pw2rs                    119 12.3    0.014    0.015    1.497    1.503
 grid_collocate_task_list           119  9.7    1.468    1.489    1.468    1.489
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.465    1.479
 mp_alltoall_d11v                  2130 13.8    1.415    1.460    1.415    1.460
 wfi_extrapolate                     11  7.9    0.001    0.001    1.433    1.434
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.283    1.308
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    1.282    1.300
 mp_alltoall_z22v                  1201 16.6    1.226    1.248    1.226    1.248
 jit_kernel_multiply                  6 15.7    0.995    1.160    0.995    1.160
 multiply_cannon_sync_h2d          9144 15.5    1.044    1.049    1.044    1.049
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.000    0.988    1.042
 acc_transpose_blocks              9144 15.5    0.038    0.038    1.019    1.025
 qs_create_task_list                 11  7.9    0.049    0.067    1.011    1.023
 generate_qs_task_list               11  8.9    0.374    0.393    0.961    0.991
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.961    0.974
 mp_allgather_i34                  2286 14.5    0.385    0.930    0.385    0.930
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="110", plot="h2o_64_md", label="(8n/1r/12t)", y=45.489000, yerr=0.000000
PlotPoint: name="111", plot="h2o_64_md_mem", label="(8n/1r/12t)", y=703.636364, yerr=17.379574
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/09/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    198.287135E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               8410880       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     117.0
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             500.137984E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 8483040
 MPI messages size (bytes):
  total size                         1.160510E+12
  min size                           0.000000E+00
  max size                           1.161504E+06
  average size                     136.803609E+03
 MPI breakdown and total messages size (bytes):
             size <=      128             1836752                        0
       128 < size <=     8192             1040592               8524529664
      8192 < size <=    32768             1486976              24362614784
     32768 < size <=   131072             2491776             216971345920
    131072 < size <=  4194304             1626944             910632720448
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3521                  65372.
 MP_Allreduce         9840                    486.
 MP_Sync               100
 MP_Alltoall          1938                1383689.
 MP_SendRecv         20900                   9096.
 MP_ISendRecv        20900                   9096.
 MP_Wait             37268
 MP_comm_split          48
 MP_ISend            14300                  82312.
 MP_IRecv            14300                  82312.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.058    0.098   95.723   95.729
 qs_mol_dyn_low                       1  2.0    0.003    0.003   94.691   94.711
 qs_forces                           11  3.9    0.019    0.033   94.584   94.586
 qs_energies                         11  4.9    0.001    0.002   90.805   90.842
 scf_env_do_scf                      11  5.9    0.000    0.001   80.485   80.487
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.006   74.035   74.036
 dbcsr_multiply_generic            2055 12.4    0.107    0.112   56.564   56.841
 qs_scf_new_mos                      99  7.5    0.000    0.001   54.730   54.842
 qs_scf_loop_do_ot                   99  8.5    0.000    0.001   54.730   54.842
 ot_scf_mini                         99  9.5    0.002    0.002   52.087   52.181
 velocity_verlet                     10  3.0    0.001    0.002   48.094   48.096
 multiply_cannon                   2055 13.4    0.182    0.192   44.371   45.060
 multiply_cannon_loop              2055 14.4    1.541    1.583   42.740   43.339
 ot_mini                             99 10.5    0.001    0.001   31.157   31.251
 qs_ot_get_derivative                99 11.5    0.001    0.001   24.204   24.292
 multiply_cannon_multrec          49320 15.4   11.914   12.472   16.960   17.723
 rebuild_ks_matrix                  110  8.3    0.000    0.001   15.662   15.763
 qs_ks_build_kohn_sham_matrix       110  9.3    0.011    0.012   15.661   15.762
 mp_waitall_1                    220248 16.4   14.245   15.262   14.245   15.262
 qs_ks_update_qs_env                110  7.6    0.001    0.001   13.371   13.467
 qs_ot_get_p                        110 10.4    0.001    0.001   11.648   11.807
 multiply_cannon_sync_h2d         49320 15.4    9.652   10.253    9.652   10.253
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    8.081    8.620
 multiply_cannon_metrocomm3       49320 15.4    0.077    0.081    7.364    8.422
 qs_rho_update_rho_low              110  7.6    0.000    0.001    7.999    8.154
 calculate_rho_elec                 110  8.6    0.020    0.024    7.999    8.154
 sum_up_and_integrate               110 10.3    0.036    0.043    7.909    7.926
 integrate_v_rspace                 110 11.3    0.002    0.003    7.873    7.899
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    7.803    7.851
 init_scf_run                        11  5.9    0.000    0.001    7.764    7.764
 scf_env_initial_rho_setup           11  6.9    0.009    0.025    7.764    7.764
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    7.258    7.723
 apply_single                       110 13.6    0.000    0.000    7.258    7.723
 qs_ot_p2m_diag                      48 11.0    0.012    0.023    7.199    7.254
 mp_sum_l                          6514 12.8    6.091    6.845    6.091    6.845
 ot_diis_step                        99 11.5    0.005    0.006    6.662    6.662
 init_scf_loop                       11  6.9    0.000    0.000    6.417    6.417
 cp_dbcsr_syevd                      48 12.0    0.002    0.003    5.970    5.974
 make_m2s                          4110 13.4    0.063    0.067    5.682    5.821
 make_images                       4110 14.4    0.179    0.192    5.584    5.725
 rs_pw_transfer                     902 11.9    0.011    0.013    5.278    5.555
 multiply_cannon_metrocomm1       49320 15.4    0.059    0.063    3.971    5.237
 dbcsr_mm_accdrv_process          87628 16.1    2.067    2.149    4.928    5.222
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    5.089    5.094
 cp_fm_redistribute_end              48 14.0    4.420    5.036    4.426    5.037
 density_rs2pw                      110  9.6    0.004    0.004    4.838    4.982
 cp_fm_diag_elpa_base                48 14.0    0.601    4.839    0.605    4.861
 wfi_extrapolate                     11  7.9    0.001    0.001    4.657    4.657
 pw_transfer                       1331 11.6    0.054    0.063    4.167    4.319
 calculate_dm_sparse                110  9.5    0.021    0.083    4.196    4.310
 prepare_preconditioner              11  7.9    0.000    0.000    4.224    4.236
 make_preconditioner                 11  8.9    0.000    0.000    4.224    4.236
 fft_wrap_pw1pw2                   1111 12.6    0.007    0.008    4.079    4.235
 make_full_inverse_cholesky          11  9.9    0.000    0.000    3.991    4.032
 qs_ot_get_orbitals                  99 10.5    0.001    0.001    3.895    3.954
 fft3d_ps                          1111 14.6    0.754    0.841    3.621    3.761
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    3.755    3.759
 fft_wrap_pw1pw2_140                451 13.1    0.166    0.186    3.362    3.518
 mp_alltoall_d11v                  2046 13.8    3.036    3.512    3.036    3.512
 potential_pw2rs                    110 12.3    0.006    0.007    3.383    3.424
 grid_integrate_task_list           110 12.3    3.233    3.396    3.233    3.396
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    3.238    3.289
 calculate_first_density_matrix       1  7.0    0.009    0.037    3.012    3.026
 make_images_data                  4110 15.4    0.042    0.045    2.431    2.729
 mp_alltoall_z22v                  1111 16.6    2.452    2.656    2.452    2.656
 jit_kernel_multiply                 13 15.9    2.573    2.597    2.573    2.597
 hybrid_alltoall_any               4261 16.3    0.080    0.469    2.157    2.456
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    2.436    2.449
 mp_waitany                       14300 13.8    2.075    2.446    2.075    2.446
 acc_transpose_blocks             49320 15.4    0.225    0.237    2.183    2.271
 make_images_sizes                 4110 15.4    0.004    0.004    1.678    2.253
 mp_alltoall_i44                   4110 16.4    1.674    2.249    1.674    2.249
 mp_sum_d                          3879 11.9    1.764    2.228    1.764    2.228
 grid_collocate_task_list           110  9.6    2.088    2.208    2.088    2.208
 cp_fm_cholesky_invert               11 10.9    2.005    2.009    2.005    2.009
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="200", plot="h2o_128_md", label="(8n/12r/1t)", y=95.729000, yerr=0.000000
PlotPoint: name="201", plot="h2o_128_md_mem", label="(8n/12r/1t)", y=475.545455, yerr=2.061052
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/10/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    390.715586E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               5019072       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     196.1
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             585.560064E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1972800
 MPI messages size (bytes):
  total size                         1.077520E+12
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     546.188250E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               14916                        0
       128 < size <=     8192              222984               1826684928
      8192 < size <=    32768              520356              13399818240
     32768 < size <=   131072              372336              35386294272
    131072 < size <=  4194304              787758             788321309808
   4194304 < size <= 16777216               54450             238588003280
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3521                  65587.
 MP_Allreduce         9839                    562.
 MP_Sync               100
 MP_Alltoall          1717                1995690.
 MP_SendRecv         10340                  26400.
 MP_ISendRecv        10340                  26400.
 MP_Wait             22352
 MP_comm_split          48
 MP_ISend            10164                 155761.
 MP_IRecv            10164                 155761.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.106    0.171   79.254   79.255
 qs_mol_dyn_low                       1  2.0    0.003    0.012   78.225   78.238
 qs_forces                           11  3.9    0.021    0.034   78.146   78.147
 qs_energies                         11  4.9    0.002    0.009   74.686   74.702
 scf_env_do_scf                      11  5.9    0.001    0.002   63.968   63.971
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.007   54.389   54.389
 velocity_verlet                     10  3.0    0.001    0.002   40.961   40.962
 dbcsr_multiply_generic            2055 12.4    0.114    0.120   40.485   40.905
 qs_scf_new_mos                      99  7.5    0.001    0.001   35.979   36.085
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   35.979   36.084
 ot_scf_mini                         99  9.5    0.003    0.004   34.239   34.347
 multiply_cannon                   2055 13.4    0.221    0.243   32.361   33.623
 multiply_cannon_loop              2055 14.4    0.924    0.949   30.766   31.976
 ot_mini                             99 10.5    0.001    0.001   20.011   20.130
 rebuild_ks_matrix                  110  8.3    0.000    0.001   15.758   15.914
 qs_ks_build_kohn_sham_matrix       110  9.3    0.012    0.015   15.758   15.913
 multiply_cannon_multrec          24660 15.4    7.499    9.306   13.674   15.451
 qs_ks_update_qs_env                110  7.6    0.001    0.001   14.030   14.166
 qs_ot_get_derivative                99 11.5    0.001    0.001   13.826   13.931
 mp_waitall_1                    176588 16.5   10.326   13.247   10.326   13.247
 init_scf_loop                       11  6.9    0.001    0.005    9.541    9.541
 multiply_cannon_metrocomm3       24660 15.4    0.070    0.073    6.206    9.415
 sum_up_and_integrate               110 10.3    0.053    0.059    8.337    8.348
 integrate_v_rspace                 110 11.3    0.002    0.002    8.284    8.296
 multiply_cannon_sync_h2d         24660 15.4    6.937    7.935    6.937    7.935
 init_scf_run                        11  5.9    0.000    0.001    7.856    7.857
 scf_env_initial_rho_setup           11  6.9    0.011    0.014    7.856    7.856
 prepare_preconditioner              11  7.9    0.000    0.000    7.446    7.465
 make_preconditioner                 11  8.9    0.000    0.002    7.446    7.465
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    6.702    7.443
 apply_single                       110 13.6    0.000    0.001    6.702    7.443
 make_full_inverse_cholesky          11  9.9    0.000    0.000    7.020    7.186
 qs_ot_get_p                        110 10.4    0.001    0.001    6.984    7.102
 qs_rho_update_rho_low              110  7.6    0.001    0.001    6.382    6.400
 calculate_rho_elec                 110  8.6    0.039    0.047    6.381    6.399
 dbcsr_mm_accdrv_process          52282 16.1    4.717    5.391    6.017    6.293
 ot_diis_step                        99 11.5    0.010    0.010    6.080    6.081
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    4.904    5.649
 make_m2s                          4110 13.4    0.058    0.061    5.046    5.440
 make_images                       4110 14.4    0.401    0.448    4.935    5.329
 qs_ot_p2m_diag                      48 11.0    0.028    0.044    4.835    4.876
 rs_pw_transfer                     902 11.9    0.012    0.013    4.455    4.636
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.190    4.193
 pw_transfer                       1331 11.6    0.066    0.070    3.972    4.146
 wfi_extrapolate                     11  7.9    0.001    0.001    4.115    4.115
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.008    3.865    4.041
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    3.900    3.957
 potential_pw2rs                    110 12.3    0.008    0.008    3.899    3.947
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    3.793    3.796
 density_rs2pw                      110  9.6    0.004    0.004    3.585    3.742
 calculate_first_density_matrix       1  7.0    0.001    0.003    3.636    3.650
 cp_fm_cholesky_invert               11 10.9    3.533    3.542    3.533    3.542
 fft3d_ps                          1111 14.6    1.078    1.272    3.223    3.388
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.335    3.340
 mp_sum_l                          6514 12.8    2.509    3.311    2.509    3.311
 cp_fm_redistribute_end              48 14.0    2.504    3.307    2.508    3.309
 grid_integrate_task_list           110 12.3    3.118    3.291    3.118    3.291
 make_images_data                  4110 15.4    0.047    0.050    2.826    3.269
 cp_fm_diag_elpa_base                48 14.0    0.763    3.106    0.794    3.192
 hybrid_alltoall_any               4261 16.3    0.101    0.439    2.486    3.180
 fft_wrap_pw1pw2_140                451 13.1    0.199    0.217    3.007    3.175
 calculate_dm_sparse                110  9.5    0.001    0.001    3.069    3.111
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.993    3.068
 mp_alltoall_d11v                  2046 13.8    2.636    2.931    2.636    2.931
 qs_ot_get_orbitals                  99 10.5    0.001    0.001    2.363    2.399
 grid_collocate_task_list           110  9.6    2.126    2.287    2.126    2.287
 mp_allgather_i34                  2055 14.4    0.922    2.189    0.922    2.189
 cp_fm_cholesky_decompose            22 10.9    2.147    2.157    2.147    2.157
 qs_energies_init_hamiltonians       11  5.9    0.006    0.011    2.085    2.105
 dbcsr_complete_redistribute        325 12.2    0.278    0.404    1.808    2.102
 parallel_gemm_fm                    81  9.0    0.000    0.000    1.986    1.997
 parallel_gemm_fm_cosma              81 10.0    1.986    1.997    1.986    1.997
 make_basis_sm                       11  9.8    0.000    0.001    1.966    1.968
 jit_kernel_multiply                 10 16.4    0.944    1.943    0.944    1.943
 mp_alltoall_z22v                  1111 16.6    1.859    1.941    1.859    1.941
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.897    1.914
 mp_sum_d                          3879 11.9    1.281    1.776    1.281    1.776
 multiply_cannon_metrocomm4       22605 15.4    0.072    0.077    0.807    1.675
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.643    1.669
 mp_waitany                       10164 13.8    1.471    1.642    1.471    1.642
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="202", plot="h2o_128_md", label="(8n/6r/2t)", y=79.255000, yerr=0.000000
PlotPoint: name="203", plot="h2o_128_md_mem", label="(8n/6r/2t)", y=554.272727, yerr=7.097899
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/11/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    404.681598E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               3346752       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     294.1
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             662.646784E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                  854880
 MPI messages size (bytes):
  total size                       708.322787E+09
  min size                           0.000000E+00
  max size                           6.553600E+06
  average size                     828.564000E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768              222984               7302414336
     32768 < size <=   131072              153888              10085203968
    131072 < size <=  4194304              389376             200257044480
   4194304 < size <= 16777216               82208             490679162176
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3521                  65578.
 MP_Allreduce         9838                    559.
 MP_Sync               100
 MP_Alltoall          1496                4511006.
 MP_SendRecv          6820                  27424.
 MP_ISendRecv         6820                  27424.
 MP_Wait             25498
 MP_comm_split          48
 MP_ISend            17072                 115022.
 MP_IRecv            17072                 115022.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.031    0.072   64.376   64.377
 qs_mol_dyn_low                       1  2.0    0.003    0.006   63.985   63.998
 qs_forces                           11  3.9    0.002    0.005   63.664   63.665
 qs_energies                         11  4.9    0.004    0.023   60.422   60.430
 scf_env_do_scf                      11  5.9    0.001    0.001   51.949   51.949
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.006   42.502   42.502
 velocity_verlet                     10  3.0    0.011    0.037   34.881   34.890
 dbcsr_multiply_generic            2055 12.4    0.109    0.111   30.105   30.493
 qs_scf_new_mos                      99  7.5    0.001    0.001   27.356   27.452
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   27.356   27.451
 ot_scf_mini                         99  9.5    0.002    0.003   26.070   26.179
 multiply_cannon                   2055 13.4    0.211    0.222   22.868   24.275
 multiply_cannon_loop              2055 14.4    0.615    0.628   21.493   22.556
 ot_mini                             99 10.5    0.001    0.001   14.685   14.804
 rebuild_ks_matrix                  110  8.3    0.000    0.000   12.662   12.828
 qs_ks_build_kohn_sham_matrix       110  9.3    0.016    0.043   12.662   12.827
 mp_waitall_1                    139946 16.5    8.272   11.537    8.272   11.537
 qs_ks_update_qs_env                110  7.6    0.001    0.001   11.194   11.337
 multiply_cannon_multrec          16440 15.4    3.719    4.500    9.459   10.144
 qs_ot_get_derivative                99 11.5    0.001    0.001   10.031   10.139
 init_scf_loop                       11  6.9    0.001    0.006    9.403    9.404
 multiply_cannon_metrocomm3       16440 15.4    0.044    0.046    5.041    8.034
 prepare_preconditioner              11  7.9    0.000    0.000    7.521    7.541
 make_preconditioner                 11  8.9    0.000    0.003    7.521    7.541
 make_full_inverse_cholesky          11  9.9    0.000    0.000    6.819    7.195
 sum_up_and_integrate               110 10.3    0.060    0.061    6.757    6.770
 integrate_v_rspace                 110 11.3    0.002    0.003    6.696    6.709
 qs_ot_get_p                        110 10.4    0.001    0.001    6.326    6.476
 init_scf_run                        11  5.9    0.000    0.001    5.874    5.874
 scf_env_initial_rho_setup           11  6.9    0.001    0.002    5.874    5.874
 qs_rho_update_rho_low              110  7.6    0.001    0.001    5.811    5.817
 calculate_rho_elec                 110  8.6    0.058    0.058    5.810    5.817
 dbcsr_mm_accdrv_process          34862 16.1    4.636    5.272    5.597    5.694
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    5.123    5.579
 apply_single                       110 13.6    0.000    0.000    5.123    5.578
 make_m2s                          4110 13.4    0.051    0.053    4.731    5.194
 make_images                       4110 14.4    0.395    0.517    4.614    5.077
 ot_diis_step                        99 11.5    0.010    0.011    4.602    4.603
 qs_ot_p2m_diag                      48 11.0    0.042    0.044    4.406    4.413
 multiply_cannon_sync_h2d         16440 15.4    3.656    4.184    3.656    4.184
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    3.424    4.045
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.015    4.015
 grid_integrate_task_list           110 12.3    3.176    3.465    3.176    3.465
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.385    3.390
 pw_transfer                       1331 11.6    0.065    0.071    3.362    3.374
 cp_fm_redistribute_end              48 14.0    2.105    3.339    2.108    3.341
 rs_pw_transfer                     902 11.9    0.010    0.011    3.068    3.325
 make_images_data                  4110 15.4    0.043    0.048    2.777    3.312
 density_rs2pw                      110  9.6    0.004    0.004    3.066    3.284
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.008    3.256    3.270
 cp_fm_diag_elpa_base                48 14.0    1.160    3.158    1.224    3.269
 hybrid_alltoall_any               4261 16.3    0.105    0.374    2.433    3.143
 wfi_extrapolate                     11  7.9    0.001    0.001    3.137    3.137
 cp_fm_cholesky_invert               11 10.9    3.073    3.083    3.073    3.083
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.913    2.915
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    2.768    2.839
 mp_sum_l                          6514 12.8    1.911    2.755    1.911    2.755
 fft_wrap_pw1pw2_140                451 13.1    0.209    0.214    2.692    2.706
 calculate_first_density_matrix       1  7.0    0.001    0.004    2.649    2.651
 calculate_dm_sparse                110  9.5    0.001    0.001    2.568    2.606
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.534    2.593
 fft3d_ps                          1111 14.6    1.068    1.077    2.572    2.585
 potential_pw2rs                    110 12.3    0.010    0.010    2.496    2.504
 multiply_cannon_metrocomm4       14385 15.4    0.045    0.049    0.929    2.466
 mp_alltoall_d11v                  2046 13.8    2.090    2.453    2.090    2.453
 mp_irecv_dv                      48980 15.7    0.858    2.338    0.858    2.338
 grid_collocate_task_list           110  9.6    2.170    2.336    2.170    2.336
 dbcsr_complete_redistribute        325 12.2    0.317    0.331    1.590    2.056
 qs_energies_init_hamiltonians       11  5.9    0.000    0.002    2.053    2.055
 mp_allgather_i34                  2055 14.4    0.700    1.956    0.700    1.956
 cp_fm_upper_to_full                 70 13.6    1.426    1.899    1.426    1.899
 cp_fm_cholesky_decompose            22 10.9    1.824    1.843    1.824    1.843
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.672    1.694
 mp_waitany                       17072 13.8    1.290    1.587    1.290    1.587
 copy_fm_to_dbcsr                   174 11.2    0.001    0.001    1.068    1.535
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.488    1.504
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.001    1.370    1.494
 qs_ot_get_orbitals                  99 10.5    0.001    0.001    1.386    1.397
 rs_gather_matrices                 110 12.3    0.138    0.151    0.968    1.335
 mp_alltoall_z22v                  1111 16.6    1.290    1.313    1.290    1.313
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="204", plot="h2o_128_md", label="(8n/4r/3t)", y=64.377000, yerr=0.000000
PlotPoint: name="205", plot="h2o_128_md_mem", label="(8n/4r/3t)", y=625.363636, yerr=8.844852
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/12/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    601.317074E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               4916280       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     200.2
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             736.083968E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  937080
 MPI messages size (bytes):
  total size                       523.723932E+09
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     558.889250E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6996                        0
       128 < size <=     8192                 264                  2162688
      8192 < size <=    32768              304932               8165326848
     32768 < size <=   131072              110640               6338641920
    131072 < size <=  4194304              489498             400769458320
   4194304 < size <= 16777216               24750             108449092400
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3521                  65576.
 MP_Allreduce         9838                    600.
 MP_Sync               100
 MP_Alltoall          1496                5863162.
 MP_SendRecv          5060                  43184.
 MP_ISendRecv         5060                  43184.
 MP_Wait             20042
 MP_comm_split          48
 MP_ISend            13376                 163145.
 MP_IRecv            13376                 163145.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.033    0.096   67.200   67.202
 qs_mol_dyn_low                       1  2.0    0.004    0.005   66.760   66.769
 qs_forces                           11  3.9    0.006    0.032   66.685   66.687
 qs_energies                         11  4.9    0.003    0.008   63.224   63.233
 scf_env_do_scf                      11  5.9    0.001    0.001   54.129   54.133
 scf_env_do_scf_inner_loop           99  6.5    0.003    0.006   41.921   41.921
 velocity_verlet                     10  3.0    0.002    0.002   37.719   37.721
 dbcsr_multiply_generic            2055 12.4    0.118    0.129   30.352   30.578
 qs_scf_new_mos                      99  7.5    0.001    0.001   27.205   27.300
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   27.205   27.300
 ot_scf_mini                         99  9.5    0.003    0.004   25.549   25.632
 multiply_cannon                   2055 13.4    0.242    0.262   22.913   24.112
 multiply_cannon_loop              2055 14.4    0.884    0.902   21.485   22.126
 ot_mini                             99 10.5    0.001    0.001   14.540   14.654
 multiply_cannon_multrec          24660 15.4    4.154    6.420   12.824   13.894
 rebuild_ks_matrix                  110  8.3    0.000    0.000   12.104   12.223
 qs_ks_build_kohn_sham_matrix       110  9.3    0.013    0.026   12.104   12.223
 init_scf_loop                       11  6.9    0.001    0.006   12.167   12.168
 qs_ks_update_qs_env                110  7.6    0.001    0.001   10.707   10.812
 prepare_preconditioner              11  7.9    0.000    0.000   10.398   10.412
 make_preconditioner                 11  8.9    0.000    0.001   10.398   10.412
 qs_ot_get_derivative                99 11.5    0.001    0.001   10.283   10.374
 make_full_inverse_cholesky          11  9.9    0.000    0.000    8.605   10.056
 dbcsr_mm_accdrv_process          52304 16.0    6.799    8.008    8.524    9.545
 mp_waitall_1                    121746 16.5    4.811    6.952    4.811    6.952
 sum_up_and_integrate               110 10.3    0.068    0.071    6.463    6.476
 integrate_v_rspace                 110 11.3    0.002    0.003    6.396    6.410
 init_scf_run                        11  5.9    0.000    0.001    6.149    6.150
 scf_env_initial_rho_setup           11  6.9    0.002    0.005    6.149    6.150
 qs_ot_get_p                        110 10.4    0.001    0.001    5.958    6.113
 make_m2s                          4110 13.4    0.061    0.063    5.704    6.017
 make_images                       4110 14.4    0.577    0.698    5.561    5.871
 qs_rho_update_rho_low              110  7.6    0.001    0.001    5.811    5.819
 calculate_rho_elec                 110  8.6    0.077    0.081    5.810    5.818
 cp_fm_upper_to_full                 70 13.8    3.265    4.644    3.265    4.644
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    4.090    4.217
 apply_single                       110 13.6    0.000    0.000    4.090    4.217
 ot_diis_step                        99 11.5    0.011    0.012    4.212    4.212
 qs_ot_p2m_diag                      48 11.0    0.055    0.064    4.074    4.089
 dbcsr_complete_redistribute        325 12.2    0.420    0.474    2.749    3.875
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    3.576    3.577
 multiply_cannon_metrocomm3       24660 15.4    0.036    0.038    1.638    3.544
 grid_integrate_task_list           110 12.3    3.266    3.474    3.266    3.474
 multiply_cannon_sync_h2d         24660 15.4    3.181    3.453    3.181    3.453
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    3.410    3.450
 pw_transfer                       1331 11.6    0.065    0.072    3.379    3.408
 copy_fm_to_dbcsr                   174 11.2    0.001    0.001    2.200    3.318
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.008    3.273    3.303
 make_images_data                  4110 15.4    0.047    0.050    2.890    3.267
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    3.171    3.225
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    3.200    3.202
 hybrid_alltoall_any               4261 16.3    0.119    0.457    2.616    3.193
 density_rs2pw                      110  9.6    0.004    0.004    2.877    3.092
 calculate_dm_sparse                110  9.5    0.001    0.001    3.042    3.078
 cp_fm_cholesky_invert               11 10.9    3.064    3.072    3.064    3.072
 wfi_extrapolate                     11  7.9    0.001    0.001    3.041    3.041
 calculate_first_density_matrix       1  7.0    0.001    0.003    3.017    3.020
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    2.952    2.954
 cp_fm_redistribute_end              48 14.0    1.478    2.923    1.479    2.924
 cp_fm_diag_elpa_base                48 14.0    1.362    2.778    1.441    2.891
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    1.784    2.882
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.793    2.829
 mp_alltoall_i22                    605 13.7    1.664    2.826    1.664    2.826
 rs_pw_transfer                     902 11.9    0.010    0.011    2.531    2.819
 fft_wrap_pw1pw2_140                451 13.1    0.201    0.214    2.750    2.783
 fft3d_ps                          1111 14.6    1.063    1.095    2.578    2.596
 grid_collocate_task_list           110  9.6    2.225    2.401    2.225    2.401
 qs_energies_init_hamiltonians       11  5.9    0.001    0.004    2.354    2.361
 mp_alltoall_d11v                  2046 13.8    2.120    2.353    2.120    2.353
 jit_kernel_multiply                 10 15.6    1.390    2.222    1.390    2.222
 potential_pw2rs                    110 12.3    0.012    0.013    2.178    2.191
 cp_fm_cholesky_decompose            22 10.9    1.799    1.854    1.799    1.854
 qs_ot_get_orbitals                  99 10.5    0.001    0.001    1.758    1.788
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.001    1.612    1.722
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.646    1.658
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.626    1.645
 acc_transpose_blocks             24660 15.4    0.106    0.112    1.518    1.540
 mp_sum_l                          6514 12.8    1.079    1.495    1.079    1.495
 multiply_cannon_metrocomm4       20550 15.4    0.057    0.061    0.876    1.461
 mp_waitany                       13376 13.8    1.107    1.445    1.107    1.445
 qs_env_update_s_mstruct             11  6.9    0.003    0.017    1.282    1.412
 mp_allgather_i34                  2055 14.4    0.530    1.379    0.530    1.379
 mp_alltoall_z22v                  1111 16.6    1.299    1.357    1.299    1.357
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="206", plot="h2o_128_md", label="(8n/3r/4t)", y=67.202000, yerr=0.000000
PlotPoint: name="207", plot="h2o_128_md_mem", label="(8n/3r/4t)", y=696.363636, yerr=11.088673
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/13/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    807.299199E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               1438408       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     684.2
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             844.828672E+06
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  197280
 MPI messages size (bytes):
  total size                       339.125567E+09
  min size                           0.000000E+00
  max size                          13.107200E+06
  average size                       1.719006E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 132                  4325376
     32768 < size <=   131072               88656              11620319232
    131072 < size <=  4194304               89424             117209825280
   4194304 < size <= 16777216               17616             210291069504
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         7346                     33.
 MP_Alltoall          8043                 263767.
 MP_ISend            32836                 654203.
 MP_IRecv            32836                 654587.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3521                  65574.
 MP_Allreduce         9838                    640.
 MP_Sync               100
 MP_Alltoall          1496                8504061.
 MP_SendRecv          3300                  54848.
 MP_ISendRecv         3300                  54848.
 MP_Wait             13926
 MP_comm_split          48
 MP_ISend             9240                 278857.
 MP_IRecv             9240                 278857.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.019    0.047   57.028   57.028
 qs_mol_dyn_low                       1  2.0    0.003    0.003   56.701   56.710
 qs_forces                           11  3.9    0.002    0.002   56.634   56.635
 qs_energies                         11  4.9    0.001    0.001   52.964   52.969
 scf_env_do_scf                      11  5.9    0.000    0.001   44.423   44.423
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.007   36.257   36.257
 velocity_verlet                     10  3.0    0.002    0.002   31.736   31.739
 dbcsr_multiply_generic            2055 12.4    0.106    0.108   23.749   23.897
 qs_scf_new_mos                      99  7.5    0.001    0.001   21.641   21.700
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   21.640   21.700
 ot_scf_mini                         99  9.5    0.002    0.002   20.375   20.420
 multiply_cannon                   2055 13.4    0.247    0.271   17.902   19.352
 multiply_cannon_loop              2055 14.4    0.319    0.330   16.475   16.882
 rebuild_ks_matrix                  110  8.3    0.000    0.000   11.754   11.788
 qs_ks_build_kohn_sham_matrix       110  9.3    0.012    0.013   11.754   11.787
 ot_mini                             99 10.5    0.001    0.001   10.947   10.993
 qs_ks_update_qs_env                110  7.6    0.001    0.001   10.458   10.490
 multiply_cannon_multrec           8220 15.4    3.234    4.722    7.570    8.765
 mp_waitall_1                    103326 16.6    6.718    8.417    6.718    8.417
 init_scf_loop                       11  6.9    0.000    0.000    8.118    8.119
 qs_ot_get_derivative                99 11.5    0.001    0.001    7.028    7.069
 prepare_preconditioner              11  7.9    0.000    0.000    6.472    6.479
 make_preconditioner                 11  8.9    0.000    0.000    6.472    6.478
 sum_up_and_integrate               110 10.3    0.080    0.081    6.406    6.417
 integrate_v_rspace                 110 11.3    0.003    0.003    6.326    6.337
 make_full_inverse_cholesky          11  9.9    0.000    0.000    6.026    6.110
 qs_rho_update_rho_low              110  7.6    0.001    0.001    5.890    5.906
 calculate_rho_elec                 110  8.6    0.115    0.115    5.890    5.906
 qs_ot_get_p                        110 10.4    0.001    0.001    5.405    5.457
 init_scf_run                        11  5.9    0.000    0.001    5.347    5.347
 scf_env_initial_rho_setup           11  6.9    0.001    0.002    5.347    5.347
 dbcsr_mm_accdrv_process          17442 15.9    2.802    3.919    4.206    5.110
 make_m2s                          4110 13.4    0.040    0.041    4.450    4.689
 multiply_cannon_metrocomm3        8220 15.4    0.018    0.018    3.184    4.657
 make_images                       4110 14.4    0.638    0.698    4.318    4.556
 qs_ot_p2m_diag                      48 11.0    0.081    0.084    3.933    3.937
 ot_diis_step                        99 11.5    0.012    0.012    3.889    3.890
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    3.781    3.834
 apply_single                       110 13.6    0.000    0.000    3.780    3.834
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    3.605    3.606
 grid_integrate_task_list           110 12.3    3.363    3.523    3.363    3.523
 pw_transfer                       1331 11.6    0.065    0.073    3.446    3.472
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.009    3.338    3.367
 cp_fm_cholesky_invert               11 10.9    3.214    3.218    3.214    3.218
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.062    3.063
 cp_fm_redistribute_end              48 14.0    0.779    3.032    0.783    3.033
 multiply_cannon_sync_h2d          8220 15.4    2.893    3.013    2.893    3.013
 cp_fm_diag_elpa_base                48 14.0    2.063    2.830    2.241    2.998
 make_images_data                  4110 15.4    0.039    0.044    2.553    2.987
 density_rs2pw                      110  9.6    0.004    0.004    2.876    2.957
 hybrid_alltoall_any               4261 16.3    0.199    0.864    2.437    2.879
 fft_wrap_pw1pw2_140                451 13.1    0.212    0.215    2.843    2.877
 wfi_extrapolate                     11  7.9    0.001    0.001    2.733    2.733
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.718    2.719
 qs_energies_init_hamiltonians       11  5.9    0.006    0.009    2.669    2.670
 fft3d_ps                          1111 14.6    1.119    1.176    2.597    2.627
 calculate_dm_sparse                110  9.5    0.001    0.001    2.492    2.536
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.528    2.529
 rs_pw_transfer                     902 11.9    0.010    0.011    2.364    2.481
 grid_collocate_task_list           110  9.6    2.329    2.447    2.329    2.447
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.344    2.360
 mp_alltoall_d11v                  2046 13.8    1.894    2.075    1.894    2.075
 potential_pw2rs                    110 12.3    0.015    0.015    2.048    2.060
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    2.029    2.051
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    1.976    1.998
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    1.773    1.994
 mp_allgather_i34                  2055 14.4    0.613    1.788    0.613    1.788
 cp_fm_cholesky_decompose            22 10.9    1.760    1.770    1.760    1.770
 jit_kernel_multiply                  9 15.5    1.089    1.724    1.089    1.724
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.630    1.633
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    1.504    1.623
 dbcsr_complete_redistribute        325 12.2    0.555    0.576    1.533    1.620
 multiply_cannon_metrocomm1        8220 15.4    0.021    0.022    1.055    1.495
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.477    1.490
 qs_create_task_list                 11  7.9    0.001    0.001    1.229    1.326
 generate_qs_task_list               11  8.9    0.379    0.449    1.228    1.325
 mp_alltoall_z22v                  1111 16.6    1.245    1.268    1.245    1.268
 mp_waitany                        9240 13.8    1.108    1.232    1.108    1.232
 multiply_cannon_metrocomm4        6165 15.4    0.018    0.019    0.485    1.214
 copy_dbcsr_to_fm                   151 11.3    0.003    0.003    1.177    1.210
 mp_irecv_dv                      24056 15.7    0.460    1.173    0.460    1.173
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="208", plot="h2o_128_md", label="(8n/2r/6t)", y=57.028000, yerr=0.000000
PlotPoint: name="209", plot="h2o_128_md_mem", label="(8n/2r/6t)", y=791.181818, yerr=14.788313
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/14/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                      1.612391E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               1464624       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     672.0
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank               1.339625E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   82200
 MPI messages size (bytes):
  total size                       297.640985E+09
  min size                           0.000000E+00
  max size                          26.214400E+06
  average size                       3.620936E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                  44                  1441792
     32768 < size <=   131072               18560               2432696320
    131072 < size <=  4194304               54216              84915781632
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            8808             210291069504
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3462                  67104.
 MP_Allreduce         9672                    819.
 MP_Sync                52
 MP_Alltoall          1474               16505187.
 MP_SendRecv          2310                 360267.
 MP_ISendRecv         2310                 360267.
 MP_Wait              5214
 MP_ISend             2420                1187840.
 MP_IRecv             2420                1187840.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.084    0.119   91.801   91.802
 qs_mol_dyn_low                       1  2.0    0.003    0.004   91.326   91.336
 qs_forces                           11  3.9    0.017    0.033   91.225   91.226
 qs_energies                         11  4.9    0.001    0.001   87.089   87.106
 scf_env_do_scf                      11  5.9    0.001    0.001   76.287   76.288
 velocity_verlet                     10  3.0    0.012    0.015   58.730   58.739
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.007   46.261   46.261
 dbcsr_multiply_generic            2055 12.4    0.127    0.135   30.746   30.958
 init_scf_loop                       11  6.9    0.000    0.000   29.946   29.947
 qs_scf_new_mos                      99  7.5    0.001    0.001   28.856   28.942
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   28.855   28.942
 prepare_preconditioner              11  7.9    0.000    0.000   27.886   27.897
 make_preconditioner                 11  8.9    0.000    0.000   27.885   27.897
 make_full_inverse_cholesky          11  9.9    0.000    0.000   21.989   27.200
 ot_scf_mini                         99  9.5    0.002    0.002   27.022   27.094
 multiply_cannon                   2055 13.4    0.349    0.370   22.718   23.479
 multiply_cannon_loop              2055 14.4    0.340    0.343   20.733   21.236
 cp_fm_upper_to_full                 70 14.2   12.706   18.158   12.706   18.158
 ot_mini                             99 10.5    0.001    0.001   15.027   15.101
 rebuild_ks_matrix                  110  8.3    0.001    0.001   13.837   13.924
 qs_ks_build_kohn_sham_matrix       110  9.3    0.013    0.014   13.837   13.924
 qs_ks_update_qs_env                110  7.6    0.001    0.001   12.612   12.693
 dbcsr_complete_redistribute        325 12.2    1.030    1.064    7.673   10.900
 mp_waitall_1                     84994 16.7    9.107   10.465    9.107   10.465
 qs_ot_get_derivative                99 11.5    0.001    0.001    9.847    9.924
 copy_fm_to_dbcsr                   174 11.2    0.001    0.001    6.592    9.814
 multiply_cannon_multrec           8220 15.4    4.390    4.604    9.624    9.718
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    5.881    9.073
 mp_alltoall_i22                    605 13.7    5.402    8.654    5.402    8.654
 sum_up_and_integrate               110 10.3    0.150    0.151    7.051    7.065
 qs_rho_update_rho_low              110  7.6    0.001    0.001    6.939    6.972
 calculate_rho_elec                 110  8.6    0.227    0.227    6.939    6.972
 integrate_v_rspace                 110 11.3    0.003    0.003    6.900    6.914
 make_m2s                          4110 13.4    0.044    0.045    5.937    6.568
 qs_ot_get_p                        110 10.4    0.001    0.001    6.428    6.553
 cp_fm_cholesky_invert               11 10.9    6.463    6.468    6.463    6.468
 make_images                       4110 14.4    0.880    0.934    5.747    6.377
 init_scf_run                        11  5.9    0.000    0.001    6.314    6.314
 scf_env_initial_rho_setup           11  6.9    0.009    0.017    6.314    6.314
 multiply_cannon_metrocomm3        8220 15.4    0.018    0.019    5.735    6.233
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    5.089    5.533
 apply_single                       110 13.6    0.000    0.000    5.088    5.533
 dbcsr_mm_accdrv_process          11614 15.7    3.243    3.666    5.092    5.361
 ot_diis_step                        99 11.5    0.015    0.016    5.141    5.143
 qs_ot_p2m_diag                      48 11.0    0.151    0.156    4.585    4.591
 pw_transfer                       1331 11.6    0.074    0.074    4.172    4.175
 make_images_data                  4110 15.4    0.042    0.045    3.301    4.121
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.098    4.099
 hybrid_alltoall_any               4261 16.3    0.255    0.551    3.186    4.062
 fft_wrap_pw1pw2                   1111 12.6    0.009    0.009    4.055    4.059
 multiply_cannon_sync_h2d          8220 15.4    3.956    3.961    3.956    3.961
 qs_energies_init_hamiltonians       11  5.9    0.001    0.002    3.804    3.822
 grid_integrate_task_list           110 12.3    3.673    3.727    3.673    3.727
 wfi_extrapolate                     11  7.9    0.001    0.001    3.694    3.694
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    3.081    3.535
 fft_wrap_pw1pw2_140                451 13.1    0.215    0.217    3.431    3.435
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.426    3.427
 cp_fm_diag_elpa_base                48 14.0    2.891    3.090    3.425    3.425
 density_rs2pw                      110  9.6    0.004    0.004    3.335    3.363
 fft3d_ps                          1111 14.6    1.269    1.279    3.266    3.271
 calculate_dm_sparse                110  9.5    0.001    0.001    3.234    3.259
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    3.146    3.148
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    2.663    2.699
 grid_collocate_task_list           110  9.6    2.640    2.678    2.640    2.678
 rs_pw_transfer                     902 11.9    0.010    0.011    2.533    2.635
 mp_alltoall_d11v                  2046 13.8    2.533    2.585    2.533    2.585
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.514    2.544
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.503    2.511
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    2.327    2.437
 potential_pw2rs                    110 12.3    0.021    0.022    2.360    2.375
 cp_fm_cholesky_decompose            22 10.9    2.317    2.346    2.317    2.346
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    2.094    2.184
 qs_create_task_list                 11  7.9    0.001    0.001    1.935    1.980
 generate_qs_task_list               11  8.9    0.739    0.793    1.935    1.980
 mp_allgather_i34                  2055 14.4    0.766    1.967    0.766    1.967
 copy_dbcsr_to_fm                   151 11.3    0.003    0.003    1.856    1.914
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.835    1.842
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="210", plot="h2o_128_md", label="(8n/1r/12t)", y=91.802000, yerr=0.000000
PlotPoint: name="211", plot="h2o_128_md_mem", label="(8n/1r/12t)", y=1201.818182, yerr=62.026121
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/15/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420239992832       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528891191296       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514751E+12       0.0%      0.0%    100.0%
 flops max/rank                      1.094965E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755938624       0.0%      0.0%    100.0%
 number of processed stacks              11950464       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     565.3
 marketing flops                   144.580175E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank             631.545856E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                10348896
 MPI messages size (bytes):
  total size                         4.491514E+12
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     434.009000E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               65736                        0
       128 < size <=     8192                1232                 10092544
      8192 < size <=    32768             3576680              95640223744
     32768 < size <=   131072             1294784              74079797248
    131072 < size <=  4194304             5148576            3175954870160
   4194304 < size <= 16777216              261888            1145794321408
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4075                  56898.
 MP_Allreduce        11228                    786.
 MP_Sync               170
 MP_Alltoall          2226                2529110.
 MP_SendRecv         24320                  18752.
 MP_ISendRecv        24320                  18752.
 MP_Wait             42476
 MP_comm_split          83
 MP_ISend            16020                 108028.
 MP_IRecv            16020                 108028.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.015    0.031  216.117  216.118
 qs_mol_dyn_low                       1  2.0    0.003    0.004  215.483  215.496
 qs_forces                           11  3.9    0.008    0.014  215.388  215.388
 qs_energies                         11  4.9    0.001    0.001  209.468  209.485
 scf_env_do_scf                      11  5.9    0.001    0.001  192.060  192.064
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.007  169.072  169.075
 velocity_verlet                     10  3.0    0.008    0.020  130.079  130.080
 dbcsr_multiply_generic            2507 12.6    0.178    0.184  127.816  128.534
 qs_scf_new_mos                     117  7.6    0.001    0.001  127.374  127.696
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001  127.373  127.695
 ot_scf_mini                        117  9.6    0.003    0.003  120.641  120.971
 multiply_cannon                   2507 13.6    0.237    0.245  102.439  104.869
 multiply_cannon_loop              2507 14.6    2.085    2.179   99.753  102.301
 ot_mini                            117 10.6    0.001    0.001   68.676   69.023
 qs_ot_get_derivative               117 11.6    0.001    0.001   43.177   43.485
 multiply_cannon_multrec          60168 15.6   33.366   34.815   41.627   43.002
 rebuild_ks_matrix                  128  8.3    0.001    0.001   35.278   35.616
 qs_ks_build_kohn_sham_matrix       128  9.3    0.015    0.017   35.278   35.616
 mp_waitall_1                    267128 16.5   30.908   33.879   30.908   33.879
 qs_ks_update_qs_env                128  7.6    0.001    0.001   31.595   31.889
 qs_ot_get_p                        128 10.4    0.001    0.001   29.601   29.920
 multiply_cannon_sync_h2d         60168 15.6   27.355   29.149   27.355   29.149
 apply_preconditioner_dbcsr         128 12.6    0.000    0.001   24.703   25.529
 apply_single                       128 13.6    0.001    0.001   24.703   25.529
 ot_diis_step                       117 11.6    0.007    0.008   25.055   25.058
 init_scf_loop                       11  6.9    0.000    0.000   22.910   22.912
 qs_ot_p2m_diag                      83 11.4    0.077    0.091   21.284   21.360
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002   20.255   20.499
 cp_dbcsr_syevd                      83 12.4    0.004    0.005   18.651   18.658
 prepare_preconditioner              11  7.9    0.000    0.000   18.066   18.115
 make_preconditioner                 11  8.9    0.000    0.000   18.066   18.115
 multiply_cannon_metrocomm3       60168 15.6    0.116    0.121   16.018   18.011
 make_full_inverse_cholesky          11  9.9    0.000    0.000   17.257   17.450
 make_m2s                          5014 13.6    0.106    0.115   15.527   15.925
 sum_up_and_integrate               128 10.3    0.089    0.107   15.733   15.751
 make_images                       5014 14.6    0.403    0.422   15.343   15.750
 integrate_v_rspace                 128 11.3    0.003    0.004   15.644   15.666
 cp_fm_diag_elpa                     83 13.4    0.000    0.000   14.760   14.804
 cp_fm_redistribute_end              83 14.4   11.650   14.672   11.673   14.689
 cp_fm_diag_elpa_base                83 14.4    2.938   14.196    2.969   14.301
 qs_rho_update_rho_low              128  7.7    0.001    0.001   14.107   14.197
 calculate_rho_elec                 128  8.7    0.045    0.063   14.107   14.196
 init_scf_run                        11  5.9    0.000    0.001   12.713   12.713
 scf_env_initial_rho_setup           11  6.9    0.002    0.002   12.712   12.713
 cp_fm_cholesky_invert               11 10.9   10.310   10.322   10.310   10.322
 mp_sum_l                          7870 13.0    8.587    9.978    8.587    9.978
 wfi_extrapolate                     11  7.9    0.001    0.001    9.083    9.083
 multiply_cannon_metrocomm1       60168 15.6    0.088    0.093    6.856    8.792
 calculate_dm_sparse                128  9.5    0.001    0.001    8.529    8.647
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    8.482    8.613
 make_images_data                  5014 15.6    0.066    0.070    7.518    8.413
 pw_transfer                       1547 11.6    0.075    0.106    8.036    8.378
 dbcsr_mm_accdrv_process         124484 16.2    3.167    3.403    7.824    8.364
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.013    7.832    8.151
 density_rs2pw                      128  9.7    0.006    0.007    7.671    8.143
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    7.910    8.005
 rs_pw_transfer                    1046 11.9    0.016    0.018    7.244    7.764
 hybrid_alltoall_any               5200 16.5    0.287    2.244    6.384    7.555
 grid_integrate_task_list           128 12.3    7.021    7.494    7.021    7.494
 fft3d_ps                          1291 14.7    2.093    2.579    6.628    6.894
 cp_dbcsr_sm_fm_multiply             37  9.5    0.003    0.003    6.754    6.763
 mp_alltoall_d11v                  2415 14.1    5.203    6.480    5.203    6.480
 fft_wrap_pw1pw2_140                523 13.2    0.443    0.506    6.133    6.335
 potential_pw2rs                    128 12.3    0.009    0.010    5.863    5.932
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    5.801    5.872
 cp_fm_cholesky_decompose            22 10.9    5.457    5.472    5.457    5.472
 grid_collocate_task_list           128  9.7    4.723    5.066    4.723    5.066
 mp_sum_d                          4455 12.2    4.033    4.880    4.033    4.880
 make_images_sizes                 5014 15.6    0.006    0.007    2.445    4.356
 mp_alltoall_i44                   5014 16.6    2.439    4.350    2.439    4.350
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="400", plot="h2o_256_md", label="(8n/12r/1t)", y=216.118000, yerr=0.000000
PlotPoint: name="401", plot="h2o_256_md_mem", label="(8n/12r/1t)", y=595.909091, yerr=8.140299
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/16/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1430460020736       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1958505086976       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1986244964352       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1992000282624       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2753956716544       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4454954827776       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5444944789504       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5492290093056       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6712799002624       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11613089636352       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15239146475520       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15239146475520       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19911124992000       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        94.228663E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.199914E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6806316384       0.0%      0.0%    100.0%
 number of processed stacks               6022464       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1130.2
 marketing flops                   145.647559E+12
 -------------------------------------------------------------------------------
 # multiplications                           2527
 max memory usage/rank             833.388544E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 2425920
 MPI messages size (bytes):
  total size                         4.132350E+12
  min size                           0.000000E+00
  max size                          17.653760E+06
  average size                       1.703416E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               14916                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768               71436               2336489472
     32768 < size <=   131072              728832              55956209664
    131072 < size <=  4194304             1386864            1409906900992
   4194304 < size <= 16777216              155760            1473826772352
  16777216 < size                           68112            1190343475200
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4107                  56904.
 MP_Allreduce        11307                    944.
 MP_Sync               170
 MP_Alltoall          1983                5140744.
 MP_SendRecv         12126                  47072.
 MP_ISendRecv        12126                  47072.
 MP_Wait             26114
 MP_comm_split          83
 MP_ISend            11836                 212447.
 MP_IRecv            11836                 212447.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.095    0.253  207.772  207.781
 qs_mol_dyn_low                       1  2.0    0.004    0.021  206.508  206.521
 qs_forces                           11  3.9    0.006    0.008  206.251  206.252
 qs_energies                         11  4.9    0.011    0.034  199.365  199.383
 scf_env_do_scf                      11  5.9    0.004    0.012  180.754  180.765
 scf_env_do_scf_inner_loop          118  6.6    0.003    0.008  144.459  144.462
 velocity_verlet                     10  3.0    0.019    0.044  129.547  129.552
 qs_scf_new_mos                     118  7.6    0.001    0.001  103.267  103.727
 qs_scf_loop_do_ot                  118  8.6    0.001    0.001  103.266  103.727
 dbcsr_multiply_generic            2527 12.6    0.188    0.195  102.165  103.485
 ot_scf_mini                        118  9.6    0.015    0.035   98.238   98.678
 multiply_cannon                   2527 13.6    0.483    0.531   79.591   84.972
 multiply_cannon_loop              2527 14.6    1.263    1.298   75.580   78.543
 ot_mini                            118 10.6    0.001    0.001   53.327   53.777
 mp_waitall_1                    216438 16.6   27.700   42.787   27.700   42.787
 multiply_cannon_multrec          30324 15.6   21.709   26.121   31.387   36.170
 init_scf_loop                       11  6.9    0.003    0.008   36.145   36.146
 rebuild_ks_matrix                  129  8.3    0.001    0.001   34.510   34.954
 qs_ks_build_kohn_sham_matrix       129  9.3    0.017    0.020   34.509   34.953
 qs_ks_update_qs_env                129  7.6    0.001    0.001   31.157   31.579
 prepare_preconditioner              11  7.9    0.000    0.000   31.409   31.470
 make_preconditioner                 11  8.9    0.000    0.002   31.409   31.470
 qs_ot_get_derivative               118 11.6    0.001    0.002   30.599   31.035
 multiply_cannon_metrocomm3       30324 15.6    0.095    0.102   17.427   30.913
 make_full_inverse_cholesky          11  9.9    0.000    0.000   29.999   30.558
 qs_ot_get_p                        129 10.4    0.001    0.001   25.969   26.558
 apply_preconditioner_dbcsr         129 12.6    0.000    0.001   22.520   23.758
 apply_single                       129 13.6    0.001    0.001   22.519   23.758
 ot_diis_step                       118 11.6    0.014    0.015   22.509   22.511
 multiply_cannon_sync_h2d         30324 15.6   19.024   21.080   19.024   21.080
 qs_ot_p2m_diag                      83 11.4    0.187    0.215   20.447   20.497
 cp_dbcsr_syevd                      83 12.4    0.005    0.006   19.122   19.123
 cp_fm_cholesky_invert               11 10.9   18.385   18.401   18.385   18.401
 make_m2s                          5054 13.6    0.093    0.098   15.814   17.800
 make_images                       5054 14.6    1.169    1.353   15.600   17.588
 cp_fm_diag_elpa                     83 13.4    0.000    0.001   15.603   15.614
 sum_up_and_integrate               129 10.3    0.117    0.136   15.497   15.525
 cp_fm_redistribute_end              83 14.4    9.093   15.472    9.108   15.474
 integrate_v_rspace                 129 11.3    0.003    0.003   15.380   15.412
 cp_fm_diag_elpa_base                83 14.4    6.108   14.872    6.339   15.251
 qs_rho_update_rho_low              129  7.7    0.001    0.001   14.097   14.189
 calculate_rho_elec                 129  8.7    0.088    0.106   14.096   14.189
 init_scf_run                        11  5.9    0.000    0.001   13.193   13.195
 scf_env_initial_rho_setup           11  6.9    0.035    0.039   13.193   13.195
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002   12.219   12.556
 make_images_data                  5054 15.6    0.065    0.071    9.478   11.815
 hybrid_alltoall_any               5240 16.5    0.344    1.525    8.028   10.881
 multiply_cannon_metrocomm4       27797 15.6    0.095    0.110    3.864   10.563
 mp_irecv_dv                      70031 16.3    3.670   10.179    3.670   10.179
 dbcsr_mm_accdrv_process          62734 16.2    4.597    5.675    9.162    9.840
 wfi_extrapolate                     11  7.9    0.001    0.001    8.965    8.965
 pw_transfer                       1559 11.6    0.086    0.099    8.488    8.570
 fft_wrap_pw1pw2                   1301 12.7    0.010    0.011    8.262    8.342
 density_rs2pw                      129  9.7    0.006    0.006    7.659    8.146
 cp_fm_cholesky_decompose            22 10.9    8.002    8.080    8.002    8.080
 qs_ot_get_derivative_taylor         41 13.0    0.001    0.001    6.933    7.700
 rs_pw_transfer                    1054 12.0    0.014    0.016    6.973    7.687
 grid_integrate_task_list           129 12.3    7.183    7.605    7.183    7.605
 mp_sum_l                          7930 13.1    5.062    7.266    5.062    7.266
 fft_wrap_pw1pw2_140                527 13.2    0.469    0.515    6.973    7.059
 calculate_dm_sparse                129  9.5    0.001    0.002    6.705    6.883
 fft3d_ps                          1301 14.7    2.799    2.965    6.624    6.668
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    6.488    6.499
 mp_allgather_i34                  2527 14.6    2.570    6.044    2.570    6.044
 potential_pw2rs                    129 12.3    0.014    0.016    5.735    5.759
 qs_ot_get_orbitals                 118 10.6    0.001    0.001    5.652    5.726
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    5.401    5.539
 mp_alltoall_d11v                  2423 14.1    4.844    5.477    4.844    5.477
 grid_collocate_task_list           129  9.7    4.951    5.281    4.951    5.281
 mp_sum_d                          4489 12.2    3.618    5.248    3.618    5.248
 dbcsr_dot_sd                      1330 12.0    0.778    0.858    3.199    4.596
 dbcsr_complete_redistribute        395 12.7    0.805    0.910    3.595    4.509
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="402", plot="h2o_256_md", label="(8n/6r/2t)", y=207.781000, yerr=0.000000
PlotPoint: name="403", plot="h2o_256_md_mem", label="(8n/6r/2t)", y=793.636364, yerr=2.900556
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/17/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420239992832       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528891191296       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514751E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.928533E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755938624       0.0%      0.0%    100.0%
 number of processed stacks               3984192       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1695.7
 marketing flops                   144.579337E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank             938.401792E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                 1042912
 MPI messages size (bytes):
  total size                         2.716210E+12
  min size                           0.000000E+00
  max size                          26.214400E+06
  average size                       2.604448E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 264                  8650752
     32768 < size <=   131072              281856              36943429632
    131072 < size <=  4194304              660064             996105256960
   4194304 < size <= 16777216               65632             931530938576
  16777216 < size                           28672             751619276800
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4075                  57335.
 MP_Allreduce        11226                    986.
 MP_Sync               170
 MP_Alltoall          1712                9388896.
 MP_SendRecv          7936                  75008.
 MP_ISendRecv         7936                  75008.
 MP_Wait             21820
 MP_comm_split          83
 MP_ISend            11748                 275205.
 MP_IRecv            11748                 275205.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.074    0.099  183.735  183.738
 qs_mol_dyn_low                       1  2.0    0.003    0.005  183.066  183.082
 qs_forces                           11  3.9    0.004    0.009  182.960  182.964
 qs_energies                         11  4.9    0.006    0.021  176.287  176.298
 scf_env_do_scf                      11  5.9    0.001    0.001  159.733  159.733
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.008  122.316  122.317
 velocity_verlet                     10  3.0    0.015    0.018  117.578  117.580
 qs_scf_new_mos                     117  7.6    0.001    0.001   85.147   85.484
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   85.146   85.483
 dbcsr_multiply_generic            2507 12.6    0.191    0.208   83.291   84.480
 ot_scf_mini                        117  9.6    0.004    0.005   80.891   81.281
 multiply_cannon                   2507 13.6    0.497    0.527   62.549   67.241
 multiply_cannon_loop              2507 14.6    0.857    0.887   59.205   61.953
 ot_mini                            117 10.6    0.001    0.001   43.801   44.203
 init_scf_loop                       11  6.9    0.004    0.018   37.299   37.299
 mp_waitall_1                    170520 16.6   26.183   35.532   26.183   35.532
 prepare_preconditioner              11  7.9    0.000    0.000   33.140   33.187
 make_preconditioner                 11  8.9    0.007    0.031   33.140   33.187
 make_full_inverse_cholesky          11  9.9    0.000    0.000   30.739   32.185
 rebuild_ks_matrix                  128  8.3    0.001    0.001   30.647   31.160
 qs_ks_build_kohn_sham_matrix       128  9.3    0.017    0.029   30.647   31.160
 qs_ks_update_qs_env                128  7.6    0.001    0.001   27.615   28.092
 multiply_cannon_metrocomm3       20056 15.6    0.061    0.064   15.812   25.156
 multiply_cannon_multrec          20056 15.6   13.392   16.397   22.088   25.118
 qs_ot_get_derivative               117 11.6    0.001    0.002   23.908   24.303
 qs_ot_get_p                        128 10.4    0.001    0.001   22.160   22.591
 apply_preconditioner_dbcsr         128 12.6    0.000    0.000   19.986   20.830
 apply_single                       128 13.6    0.001    0.001   19.986   20.830
 ot_diis_step                       117 11.6    0.018    0.019   19.767   19.768
 qs_ot_p2m_diag                      83 11.4    0.265    0.274   17.712   17.725
 cp_dbcsr_syevd                      83 12.4    0.005    0.005   16.603   16.604
 make_m2s                          5014 13.6    0.084    0.089   15.370   16.457
 make_images                       5014 14.6    1.185    1.279   15.134   16.217
 cp_fm_cholesky_invert               11 10.9   15.691   15.700   15.691   15.700
 multiply_cannon_sync_h2d         20056 15.6   14.324   15.652   14.324   15.652
 sum_up_and_integrate               128 10.3    0.133    0.146   14.507   14.529
 integrate_v_rspace                 128 11.3    0.003    0.004   14.373   14.398
 qs_rho_update_rho_low              128  7.7    0.001    0.001   13.335   13.374
 calculate_rho_elec                 128  8.7    0.132    0.146   13.335   13.373
 cp_fm_diag_elpa                     83 13.4    0.000    0.001   13.314   13.317
 cp_fm_redistribute_end              83 14.4    5.024   13.214    5.044   13.216
 cp_fm_diag_elpa_base                83 14.4    7.711   12.564    8.140   13.052
 init_scf_run                        11  5.9    0.000    0.001   11.185   11.186
 scf_env_initial_rho_setup           11  6.9    0.001    0.003   11.185   11.185
 make_images_data                  5014 15.6    0.061    0.069    9.405   10.843
 hybrid_alltoall_any               5200 16.5    0.432    1.976    8.110    9.849
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002    9.471    9.755
 multiply_cannon_metrocomm4       17549 15.6    0.061    0.071    3.415    9.253
 mp_irecv_dv                      50230 16.2    3.293    9.012    3.293    9.012
 dbcsr_mm_accdrv_process          41502 16.2    4.381    5.298    8.173    8.373
 cp_fm_cholesky_decompose            22 10.9    8.133    8.168    8.133    8.168
 pw_transfer                       1547 11.6    0.085    0.105    7.894    7.995
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.011    7.670    7.779
 grid_integrate_task_list           128 12.3    7.341    7.762    7.341    7.762
 cp_fm_upper_to_full                105 14.5    5.841    7.695    5.841    7.695
 wfi_extrapolate                     11  7.9    0.001    0.001    7.685    7.686
 density_rs2pw                      128  9.7    0.006    0.006    6.806    7.225
 dbcsr_complete_redistribute        395 12.7    1.241    1.292    4.894    6.734
 fft_wrap_pw1pw2_140                523 13.2    0.475    0.525    6.558    6.672
 rs_pw_transfer                    1046 11.9    0.014    0.014    5.769    6.193
 calculate_dm_sparse                128  9.5    0.001    0.003    5.941    6.026
 fft3d_ps                          1291 14.7    2.692    2.904    5.939    5.993
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    5.733    5.737
 grid_collocate_task_list           128  9.7    5.081    5.451    5.081    5.451
 copy_fm_to_dbcsr                   209 11.7    0.002    0.002    3.550    5.393
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    4.679    5.346
 mp_allgather_i34                  2507 14.6    1.868    5.034    1.868    5.034
 mp_sum_l                          7870 13.0    3.571    4.961    3.571    4.961
 potential_pw2rs                    128 12.3    0.020    0.023    4.855    4.878
 mp_alltoall_d11v                  2415 14.1    4.316    4.826    4.316    4.826
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    4.616    4.770
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    4.202    4.234
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    2.374    4.191
 mp_alltoall_i22                    716 14.1    2.075    4.093    2.075    4.093
 qs_energies_init_hamiltonians       11  5.9    0.009    0.022    4.061    4.064
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="404", plot="h2o_256_md", label="(8n/4r/3t)", y=183.738000, yerr=0.000000
PlotPoint: name="405", plot="h2o_256_md_mem", label="(8n/4r/3t)", y=884.818182, yerr=12.582948
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/18/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420242647040       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528903135232       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514766E+12       0.0%      0.0%    100.0%
 flops max/rank                      4.353791E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755941440       0.0%      0.0%    100.0%
 number of processed stacks               5977344       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1130.3
 marketing flops                   144.580175E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank               1.145086E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1143192
 MPI messages size (bytes):
  total size                         2.023815E+12
  min size                           0.000000E+00
  max size                          17.653760E+06
  average size                       1.770320E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                6996                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 396                  8650752
     32768 < size <=   131072              319024              36042702848
    131072 < size <=  4194304              715736             785529176064
   4194304 < size <= 16777216               70320             665379475120
  16777216 < size                           30720             536870912000
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4085                  57194.
 MP_Allreduce        11251                   1067.
 MP_Sync               170
 MP_Alltoall          1712               12503107.
 MP_SendRecv          5888                  75008.
 MP_ISendRecv         5888                  75008.
 MP_Wait             22442
 MP_comm_split          83
 MP_ISend            14952                 244818.
 MP_IRecv            14952                 244818.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.052    0.108  190.428  190.428
 qs_mol_dyn_low                       1  2.0    0.003    0.013  189.678  189.691
 qs_forces                           11  3.9    0.004    0.005  189.513  189.522
 qs_energies                         11  4.9    0.001    0.001  182.464  182.475
 scf_env_do_scf                      11  5.9    0.001    0.001  165.087  165.098
 velocity_verlet                     10  3.0    0.009    0.009  126.087  126.092
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.008  117.350  117.352
 qs_scf_new_mos                     117  7.6    0.001    0.001   81.121   81.395
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   81.121   81.395
 dbcsr_multiply_generic            2507 12.6    0.190    0.195   80.166   80.751
 ot_scf_mini                        117  9.6    0.003    0.003   76.590   76.846
 multiply_cannon                   2507 13.6    0.554    0.593   54.961   58.040
 multiply_cannon_loop              2507 14.6    1.181    1.208   51.268   52.568
 init_scf_loop                       11  6.9    0.000    0.000   47.597   47.598
 prepare_preconditioner              11  7.9    0.000    0.000   43.502   43.530
 make_preconditioner                 11  8.9    0.000    0.000   43.502   43.529
 ot_mini                            117 10.6    0.001    0.001   42.737   43.008
 make_full_inverse_cholesky          11  9.9    0.000    0.000   37.123   42.077
 multiply_cannon_multrec          30084 15.6   14.097   19.040   26.058   30.750
 rebuild_ks_matrix                  128  8.3    0.001    0.001   29.430   29.629
 qs_ks_build_kohn_sham_matrix       128  9.3    0.016    0.018   29.430   29.629
 mp_waitall_1                    147882 16.7   18.073   28.256   18.073   28.256
 qs_ks_update_qs_env                128  7.6    0.001    0.001   26.617   26.785
 qs_ot_get_derivative               117 11.6    0.001    0.002   23.065   23.319
 make_m2s                          5014 13.6    0.099    0.103   20.814   21.822
 make_images                       5014 14.6    1.971    2.233   20.503   21.509
 qs_ot_get_p                        128 10.4    0.001    0.001   19.938   20.241
 ot_diis_step                       117 11.6    0.017    0.018   19.549   19.551
 apply_preconditioner_dbcsr         128 12.6    0.000    0.001   19.092   19.550
 apply_single                       128 13.6    0.001    0.001   19.091   19.549
 cp_fm_cholesky_invert               11 10.9   17.032   17.042   17.032   17.042
 cp_fm_upper_to_full                105 14.7   10.964   16.255   10.964   16.255
 qs_ot_p2m_diag                      83 11.4    0.342    0.389   15.716   15.768
 multiply_cannon_metrocomm3       30084 15.6    0.047    0.049    6.674   15.306
 cp_dbcsr_syevd                      83 12.4    0.005    0.005   14.406   14.408
 sum_up_and_integrate               128 10.3    0.140    0.152   14.349   14.372
 integrate_v_rspace                 128 11.3    0.003    0.004   14.209   14.236
 qs_rho_update_rho_low              128  7.7    0.001    0.001   13.448   13.486
 calculate_rho_elec                 128  8.7    0.175    0.190   13.448   13.485
 dbcsr_complete_redistribute        395 12.7    1.564    1.741    9.252   12.996
 make_images_data                  5014 15.6    0.064    0.067   11.302   12.957
 multiply_cannon_sync_h2d         30084 15.6   11.768   12.511   11.768   12.511
 dbcsr_mm_accdrv_process          62264 16.2    7.273    8.354   11.538   12.045
 hybrid_alltoall_any               5200 16.5    0.523    2.210   10.149   11.942
 copy_fm_to_dbcsr                   209 11.7    0.002    0.002    7.794   11.542
 init_scf_run                        11  5.9    0.000    0.001   11.346   11.348
 scf_env_initial_rho_setup           11  6.9    0.011    0.012   11.346   11.347
 cp_fm_diag_elpa                     83 13.4    0.000    0.001   11.195   11.198
 cp_fm_redistribute_end              83 14.4    1.931   11.096    1.948   11.102
 cp_fm_diag_elpa_base                83 14.4    8.522   10.487    9.117   10.955
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    6.356    9.951
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002    9.631    9.828
 mp_alltoall_i22                    716 14.1    5.552    9.283    5.552    9.283
 pw_transfer                       1547 11.6    0.085    0.101    8.119    8.218
 cp_fm_cholesky_decompose            22 10.9    8.014    8.110    8.014    8.110
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.011    7.894    7.999
 grid_integrate_task_list           128 12.3    7.526    7.957    7.526    7.957
 wfi_extrapolate                     11  7.9    0.001    0.001    7.785    7.785
 fft_wrap_pw1pw2_140                523 13.2    0.478    0.485    6.911    7.035
 multiply_cannon_metrocomm4       25070 15.6    0.075    0.085    2.765    7.014
 density_rs2pw                      128  9.7    0.006    0.006    6.526    6.847
 mp_irecv_dv                      76098 16.2    2.619    6.752    2.619    6.752
 calculate_dm_sparse                128  9.5    0.001    0.001    6.195    6.275
 fft3d_ps                          1291 14.7    2.796    2.892    6.132    6.209
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    5.726    5.775
 grid_collocate_task_list           128  9.7    5.200    5.478    5.200    5.478
 mp_alltoall_d11v                  2415 14.1    5.092    5.432    5.092    5.432
 rs_pw_transfer                    1046 11.9    0.013    0.015    4.896    5.219
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    4.480    4.562
 potential_pw2rs                    128 12.3    0.022    0.023    4.506    4.519
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    4.437    4.510
 qs_energies_init_hamiltonians       11  5.9    0.001    0.002    4.492    4.494
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    4.229    4.291
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="406", plot="h2o_256_md", label="(8n/3r/4t)", y=190.428000, yerr=0.000000
PlotPoint: name="407", plot="h2o_256_md_mem", label="(8n/3r/4t)", y=1071.363636, yerr=19.648145
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/19/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420239992832       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528891191296       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514751E+12       0.0%      0.0%    100.0%
 flops max/rank                      5.865088E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755938624       0.0%      0.0%    100.0%
 number of processed stacks               1960712       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    3445.7
 marketing flops                   144.579337E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank               1.510957E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  240672
 MPI messages size (bytes):
  total size                         1.331455E+12
  min size                           0.000000E+00
  max size                          52.428800E+06
  average size                       5.532237E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                 132                  8650752
    131072 < size <=  4194304              113904              59718500352
   4194304 < size <= 16777216              104976             550376570880
  16777216 < size                           20208             721350092304
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         8931                     51.
 MP_Alltoall          9654                 799394.
 MP_ISend            40068                2102572.
 MP_IRecv            40068                2101675.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3992                  58352.
 MP_Allreduce        10977                   1175.
 MP_Sync                87
 MP_Alltoall          1712               18838210.
 MP_SendRecv          3840                 122880.
 MP_ISendRecv         3840                 122880.
 MP_Wait             16122
 MP_ISend            10680                 423556.
 MP_IRecv            10680                 423556.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.030    0.083  176.541  176.541
 qs_mol_dyn_low                       1  2.0    0.003    0.004  175.913  175.938
 qs_forces                           11  3.9    0.027    0.051  175.783  175.785
 qs_energies                         11  4.9    0.001    0.001  168.367  168.383
 scf_env_do_scf                      11  5.9    0.001    0.001  150.777  150.791
 velocity_verlet                     10  3.0    0.028    0.030  115.022  115.026
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.008  113.874  113.875
 qs_scf_new_mos                     117  7.6    0.001    0.001   77.820   77.883
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   77.819   77.882
 dbcsr_multiply_generic            2507 12.6    0.183    0.188   75.485   75.921
 ot_scf_mini                        117  9.6    0.003    0.004   73.367   73.423
 multiply_cannon                   2507 13.6    0.587    0.625   55.139   60.143
 multiply_cannon_loop              2507 14.6    0.444    0.458   50.378   51.277
 ot_mini                            117 10.6    0.001    0.001   40.608   40.667
 init_scf_loop                       11  6.9    0.000    0.000   36.742   36.744
 mp_waitall_1                    125778 16.7   26.569   34.147   26.569   34.147
 prepare_preconditioner              11  7.9    0.000    0.000   32.770   32.792
 make_preconditioner                 11  8.9    0.000    0.000   32.770   32.792
 make_full_inverse_cholesky          11  9.9    0.000    0.000   30.619   30.884
 rebuild_ks_matrix                  128  8.3    0.001    0.001   28.899   28.956
 qs_ks_build_kohn_sham_matrix       128  9.3    0.017    0.018   28.898   28.956
 qs_ks_update_qs_env                128  7.6    0.001    0.001   26.357   26.408
 multiply_cannon_multrec          10028 15.6   10.543   14.884   18.008   20.741
 qs_ot_get_derivative               117 11.6    0.001    0.002   20.469   20.527
 ot_diis_step                       117 11.6    0.020    0.020   20.066   20.067
 apply_preconditioner_dbcsr         128 12.6    0.000    0.000   19.646   19.925
 apply_single                       128 13.6    0.001    0.001   19.646   19.925
 qs_ot_get_p                        128 10.4    0.001    0.001   19.681   19.740
 multiply_cannon_metrocomm3       10028 15.6    0.023    0.025   12.560   19.485
 make_m2s                          5014 13.6    0.068    0.074   16.600   19.015
 cp_fm_cholesky_invert               11 10.9   18.799   18.805   18.799   18.805
 make_images                       5014 14.6    2.348    2.684   16.291   18.706
 qs_ot_p2m_diag                      83 11.4    0.495    0.501   15.749   15.765
 cp_dbcsr_syevd                      83 12.4    0.005    0.005   14.566   14.568
 sum_up_and_integrate               128 10.3    0.181    0.190   14.263   14.317
 integrate_v_rspace                 128 11.3    0.003    0.004   14.082   14.144
 qs_rho_update_rho_low              128  7.7    0.001    0.001   13.429   13.481
 calculate_rho_elec                 128  8.7    0.258    0.269   13.429   13.481
 make_images_data                  5014 15.6    0.053    0.061   10.041   12.636
 hybrid_alltoall_any               5200 16.5    0.784    3.629    9.800   12.459
 multiply_cannon_sync_h2d         10028 15.6   11.604   11.977   11.604   11.977
 cp_fm_diag_elpa                     83 13.4    0.000    0.000   11.366   11.375
 cp_fm_diag_elpa_base                83 14.4   11.103   11.187   11.352   11.361
 init_scf_run                        11  5.9    0.000    0.001   10.644   10.644
 scf_env_initial_rho_setup           11  6.9    0.001    0.001   10.644   10.644
 cp_fm_cholesky_decompose            22 10.9    8.231    8.353    8.231    8.353
 qs_ot_get_derivative_diag           77 12.4    0.002    0.003    8.175    8.221
 grid_integrate_task_list           128 12.3    7.736    8.141    7.736    8.141
 mp_allgather_i34                  2507 14.6    2.964    8.043    2.964    8.043
 pw_transfer                       1547 11.6    0.084    0.090    8.010    8.037
 dbcsr_mm_accdrv_process          20762 16.1    2.759    3.702    7.093    7.847
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.010    7.790    7.811
 multiply_cannon_metrocomm1       10028 15.6    0.029    0.029    4.582    7.515
 wfi_extrapolate                     11  7.9    0.001    0.001    7.505    7.505
 fft_wrap_pw1pw2_140                523 13.2    0.502    0.523    6.789    6.809
 density_rs2pw                      128  9.7    0.005    0.006    6.184    6.583
 calculate_dm_sparse                128  9.5    0.001    0.001    6.115    6.176
 fft3d_ps                          1291 14.7    2.738    2.835    5.949    5.997
 dbcsr_complete_redistribute        395 12.7    2.134    2.199    5.411    5.832
 grid_collocate_task_list           128  9.7    5.482    5.794    5.482    5.794
 mp_alltoall_d11v                  2415 14.1    5.001    5.733    5.001    5.733
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    5.333    5.340
 qs_energies_init_hamiltonians       11  5.9    0.009    0.020    5.283    5.292
 rs_pw_transfer                    1046 11.9    0.012    0.013    4.332    4.741
 potential_pw2rs                    128 12.3    0.027    0.027    4.249    4.264
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    4.137    4.175
 multiply_cannon_metrocomm4        7521 15.6    0.024    0.027    1.862    4.154
 mp_irecv_dv                      28860 15.9    1.824    4.068    1.824    4.068
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    3.572    3.878
 copy_fm_to_dbcsr                   209 11.7    0.002    0.002    3.527    3.875
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    3.754    3.784
 copy_dbcsr_to_fm                   186 11.8    0.004    0.004    3.679    3.750
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="408", plot="h2o_256_md", label="(8n/2r/6t)", y=176.541000, yerr=0.000000
PlotPoint: name="409", plot="h2o_256_md_mem", label="(8n/2r/6t)", y=1412.272727, yerr=51.352774
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/20/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1430456039424       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1962800054272       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1986255912960       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1992003932160       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2753958699008       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4454954827776       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5444944789504       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5492290093056       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6712799002624       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11613072052224       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15239176077312       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15239176077312       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19911132921856       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        94.233020E+12       0.0%      0.0%    100.0%
 flops max/rank                     11.786061E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6806383904       0.0%      0.0%    100.0%
 number of processed stacks               1980288       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    3437.1
 marketing flops                   145.650931E+12
 -------------------------------------------------------------------------------
 # multiplications                           2529
 max memory usage/rank               3.037155E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                  101160
 MPI messages size (bytes):
  total size                         1.144970E+12
  min size                           0.000000E+00
  max size                         104.857600E+06
  average size                      11.318403E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                  44                  2883584
    131072 < size <=  4194304               45648              35433480192
   4194304 < size <= 16777216               44720             382939955200
  16777216 < size                           10176             726592466352
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4043                  58564.
 MP_Allreduce        11104                   1511.
 MP_Sync                88
 MP_Alltoall          1724               36993632.
 MP_SendRecv          1806                 218624.
 MP_ISendRecv         1806                 218624.
 MP_Wait              9876
 MP_ISend             6456                1080169.
 MP_IRecv             6456                1080169.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.037    0.067  316.454  316.455
 qs_mol_dyn_low                       1  2.0    0.005    0.018  315.503  315.515
 qs_forces                           11  3.9    0.013    0.032  315.402  315.406
 qs_energies                         11  4.9    0.001    0.001  305.724  305.742
 scf_env_do_scf                      11  5.9    0.001    0.001  281.674  281.685
 velocity_verlet                     10  3.0    0.002    0.002  226.032  226.040
 scf_env_do_scf_inner_loop          118  6.6    0.003    0.008  151.855  151.857
 init_scf_loop                       11  6.9    0.000    0.000  129.542  129.544
 prepare_preconditioner              11  7.9    0.000    0.000  124.509  124.534
 make_preconditioner                 11  8.9    0.000    0.000  124.509  124.534
 make_full_inverse_cholesky          11  9.9    0.000    0.000  100.070  121.342
 qs_scf_new_mos                     118  7.6    0.001    0.001  106.036  106.182
 qs_scf_loop_do_ot                  118  8.6    0.001    0.001  106.035  106.181
 ot_scf_mini                        118  9.6    0.004    0.004  100.887  101.059
 dbcsr_multiply_generic            2529 12.6    0.222    0.228   93.580   94.078
 cp_fm_upper_to_full                106 14.8   52.545   75.601   52.545   75.601
 multiply_cannon                   2529 13.6    0.712    0.765   65.452   67.720
 multiply_cannon_loop              2529 14.6    0.471    0.482   61.194   62.362
 ot_mini                            118 10.6    0.001    0.001   51.351   51.546
 dbcsr_complete_redistribute        397 12.7    3.993    4.019   30.606   43.381
 copy_fm_to_dbcsr                   210 11.7    0.002    0.002   26.983   39.740
 mp_waitall_1                    104580 16.8   35.492   39.503   35.492   39.503
 cp_fm_cholesky_invert               11 10.9   37.488   37.497   37.488   37.497
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000   24.394   37.069
 rebuild_ks_matrix                  129  8.3    0.001    0.001   36.614   36.910
 qs_ks_build_kohn_sham_matrix       129  9.3    0.018    0.018   36.614   36.909
 mp_alltoall_i22                    720 14.1   21.955   34.786   21.955   34.786
 qs_ks_update_qs_env                129  7.6    0.001    0.001   33.576   33.780
 qs_ot_get_p                        129 10.4    0.001    0.001   32.372   32.611
 qs_ot_get_derivative               118 11.6    0.002    0.002   27.711   27.874
 qs_ot_p2m_diag                      84 11.4    0.890    0.896   27.068   27.103
 cp_dbcsr_syevd                      84 12.4    0.006    0.006   25.181   25.186
 multiply_cannon_metrocomm3       10116 15.6    0.024    0.024   24.034   25.060
 make_m2s                          5058 13.6    0.080    0.084   22.230   23.569
 ot_diis_step                       118 11.6    0.022    0.023   23.550   23.551
 make_images                       5058 14.6    3.832    4.024   21.742   23.087
 apply_preconditioner_dbcsr         129 12.6    0.000    0.000   22.129   22.627
 apply_single                       129 13.6    0.001    0.001   22.128   22.627
 cp_fm_diag_elpa                     84 13.4    0.000    0.000   21.481   21.482
 cp_fm_diag_elpa_base                84 14.4   17.098   18.679   21.474   21.476
 multiply_cannon_multrec          10116 15.6   10.719   12.592   18.594   18.683
 sum_up_and_integrate               129 10.3    0.324    0.325   17.165   17.270
 qs_rho_update_rho_low              129  7.7    0.001    0.001   16.956   17.050
 calculate_rho_elec                 129  8.7    0.487    0.488   16.955   17.050
 integrate_v_rspace                 129 11.3    0.004    0.004   16.841   16.947
 multiply_cannon_sync_h2d         10116 15.6   15.794   15.805   15.794   15.805
 make_images_data                  5058 15.6    0.060    0.066   12.268   14.699
 hybrid_alltoall_any               5245 16.5    1.309    3.061   12.175   14.448
 init_scf_run                        11  5.9    0.000    0.001   13.716   13.717
 scf_env_initial_rho_setup           11  6.9    0.002    0.003   13.716   13.717
 qs_ot_get_derivative_diag           78 12.4    0.002    0.003   11.183   11.288
 pw_transfer                       1559 11.6    0.092    0.094    9.779    9.797
 cp_fm_cholesky_decompose            22 10.9    9.728    9.741    9.728    9.741
 wfi_extrapolate                     11  7.9    0.001    0.001    9.724    9.725
 fft_wrap_pw1pw2                   1301 12.7    0.011    0.011    9.542    9.559
 dbcsr_mm_accdrv_process          20934 16.1    3.947    5.831    7.633    9.495
 mp_alltoall_d11v                  2429 14.1    8.968    9.119    8.968    9.119
 grid_integrate_task_list           129 12.3    8.611    8.807    8.611    8.807
 qs_energies_init_hamiltonians       11  5.9    0.074    0.125    8.287    8.300
 fft_wrap_pw1pw2_140                527 13.2    0.544    0.547    8.284    8.299
 fft3d_ps                          1301 14.7    2.786    2.813    7.569    7.591
 calculate_dm_sparse                129  9.5    0.001    0.001    7.356    7.496
 density_rs2pw                      129  9.7    0.005    0.005    7.303    7.333
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    7.007    7.079
 copy_dbcsr_to_fm                   187 11.8    0.004    0.004    6.477    6.540
 grid_collocate_task_list           129  9.7    6.392    6.426    6.392    6.426
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="410", plot="h2o_256_md", label="(8n/1r/12t)", y=316.455000, yerr=0.000000
PlotPoint: name="411", plot="h2o_256_md_mem", label="(8n/1r/12t)", y=2735.454545, yerr=161.334415
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/21/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.766000E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                419739       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   22952.9
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               1.261392E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  458208
 MPI messages size (bytes):
  total size                         3.456111E+12
  min size                           0.000000E+00
  max size                          18.735064E+06
  average size                       7.542668E+06
 MPI breakdown and total messages size (bytes):
             size <=      128              112896                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 224                  5687808
     32768 < size <=   131072               10528                813356544
    131072 < size <=  4194304               36422              76284728544
   4194304 < size <= 16777216              294266            3312457683808
  16777216 < size                            3872              66548597808
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 255669.
 MP_Allreduce         3059                   6274.
 MP_Sync                 4
 MP_Alltoall            54
 MP_SendRecv           285                  19200.
 MP_ISendRecv          285                  19200.
 MP_Wait              1017
 MP_ISend              642                 197829.
 MP_IRecv              642                 197607.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.052    0.211   86.148   86.161
 qs_energies                          1  2.0    0.000    0.000   85.327   85.341
 ls_scf                               1  3.0    0.000    0.000   84.418   84.431
 dbcsr_multiply_generic             111  6.7    0.014    0.015   73.091   73.239
 multiply_cannon                    111  7.7    0.018    0.021   56.005   57.358
 multiply_cannon_loop               111  8.7    0.208    0.219   52.516   53.976
 ls_scf_main                          1  4.0    0.000    0.000   52.883   52.884
 density_matrix_trs4                  2  5.0    0.002    0.003   47.194   47.281
 ls_scf_init_scf                      1  4.0    0.000    0.000   28.476   28.478
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   27.326   27.387
 mp_waitall_1                     11031 10.9   23.179   25.720   23.179   25.720
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   25.105   25.121
 multiply_cannon_multrec           2664  9.7    8.094    8.787   15.482   17.125
 multiply_cannon_sync_h2d          2664  9.7   13.407   15.448   13.407   15.448
 make_m2s                           222  7.7    0.008    0.011   13.467   13.968
 make_images                        222  8.7    0.099    0.109   13.446   13.950
 multiply_cannon_metrocomm1        2664  9.7    0.010    0.010   10.012   12.471
 make_images_data                   222  9.7    0.004    0.005    7.999    8.603
 hybrid_alltoall_any                227 10.6    0.214    1.822    6.820    8.578
 multiply_cannon_metrocomm3        2664  9.7    0.009    0.010    5.487    8.244
 dbcsr_mm_accdrv_process           4760 10.4    0.509    0.629    7.004    7.975
 dbcsr_mm_accdrv_process_sort      4760 11.4    6.296    7.209    6.296    7.209
 calculate_norms                   4752  9.8    5.481    6.099    5.481    6.099
 apply_matrix_preconditioner          6  5.3    0.000    0.000    5.108    5.215
 mp_sum_l                           807  5.4    3.227    4.473    3.227    4.473
 make_images_sizes                  222  9.7    0.000    0.000    0.861    3.705
 mp_alltoall_i44                    222 10.7    0.861    3.705    0.861    3.705
 arnoldi_extremal                     4  6.8    0.000    0.000    3.400    3.419
 arnoldi_normal_ev                    4  7.8    0.001    0.003    3.400    3.419
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    2.288    3.366
 build_subspace                      16  8.4    0.009    0.012    3.280    3.282
 multiply_cannon_metrocomm4        2442  9.7    0.012    0.014    2.060    3.106
 mp_irecv_dv                       6231 10.9    2.042    3.081    2.042    3.081
 ls_scf_post                          1  4.0    0.000    0.000    3.058    3.072
 ls_scf_store_result                  1  5.0    0.000    0.000    2.877    2.919
 dbcsr_special_finalize             555  9.7    0.005    0.006    2.259    2.734
 dbcsr_merge_single_wm              555 10.7    0.459    0.594    2.251    2.727
 dbcsr_matrix_vector_mult           304  9.0    0.003    0.010    2.376    2.628
 make_images_pack                   222  9.7    2.203    2.620    2.205    2.621
 dbcsr_matrix_vector_mult_local     304 10.0    2.067    2.479    2.069    2.481
 dbcsr_sort_data                    658 11.4    2.051    2.458    2.051    2.458
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    2.372    2.444
 buffer_matrices_ensure_size        222  8.7    1.757    2.047    1.757    2.047
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    1.825    1.826
 rebuild_ks_matrix                    3  7.3    0.000    0.000    1.814    1.816
 qs_ks_build_kohn_sham_matrix         3  8.3    0.001    0.002    1.814    1.816
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="500", plot="h2o_32_nrep3_ls", label="(8n/12r/1t)", y=86.161000, yerr=0.000000
PlotPoint: name="501", plot="h2o_32_nrep3_ls_mem", label="(8n/12r/1t)", y=1142.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/22/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      5.588524E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                368848       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   26119.8
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               2.101662E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  106560
 MPI messages size (bytes):
  total size                         2.699093E+12
  min size                           0.000000E+00
  max size                          72.286792E+06
  average size                      25.329324E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               23040                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                3264                325830144
    131072 < size <=  4194304                5280               3328561104
   4194304 < size <= 16777216               12709             156766962056
  16777216 < size                           62267            2538670978840
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 266696.
 MP_Allreduce         3058                  10339.
 MP_Sync                 4
 MP_Alltoall            47               15335933.
 MP_SendRecv           141                  57600.
 MP_ISendRecv          141                  57600.
 MP_Wait               687
 MP_ISend              462                 414589.
 MP_IRecv              462                 413870.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.077    0.122   92.516   92.521
 qs_energies                          1  2.0    0.000    0.000   91.770   91.774
 ls_scf                               1  3.0    0.003    0.021   90.421   90.425
 dbcsr_multiply_generic             111  6.7    0.015    0.015   75.843   76.168
 multiply_cannon                    111  7.7    0.028    0.044   53.654   57.078
 ls_scf_main                          1  4.0    0.000    0.000   55.508   55.518
 multiply_cannon_loop               111  8.7    0.115    0.122   50.356   53.178
 density_matrix_trs4                  2  5.0    0.002    0.003   49.683   49.869
 mp_waitall_1                      9105 10.9   21.760   31.401   21.760   31.401
 ls_scf_init_scf                      1  4.0    0.000    0.003   31.321   31.323
 ls_scf_init_matrix_S                 1  5.0    0.000    0.002   29.863   29.967
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   27.423   27.436
 multiply_cannon_multrec           1332  9.7   13.054   16.685   22.340   27.100
 multiply_cannon_metrocomm3        1332  9.7    0.006    0.008   12.250   21.730
 make_m2s                           222  7.7    0.006    0.008   15.499   16.115
 make_images                        222  8.7    1.577    1.966   15.468   16.084
 dbcsr_mm_accdrv_process           4041 10.4    0.280    0.434    8.889   10.448
 dbcsr_mm_accdrv_process_sort      4041 11.4    8.486   10.023    8.486   10.023
 make_images_data                   222  9.7    0.004    0.004    8.929    9.826
 hybrid_alltoall_any                227 10.6    0.522    2.479    8.302    9.451
 mp_sum_l                           807  5.4    5.397    8.798    5.397    8.798
 multiply_cannon_metrocomm4        1221  9.7    0.006    0.008    3.247    7.767
 mp_irecv_dv                       3311 11.0    3.227    7.716    3.227    7.716
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    4.225    7.076
 calculate_norms                   2376  9.8    5.981    6.690    5.981    6.690
 multiply_cannon_sync_h2d          1332  9.7    4.753    5.765    4.753    5.765
 apply_matrix_preconditioner          6  5.3    0.000    0.000    5.041    5.240
 arnoldi_extremal                     4  6.8    0.000    0.000    4.833    4.855
 arnoldi_normal_ev                    4  7.8    0.001    0.004    4.833    4.855
 build_subspace                      16  8.4    0.014    0.021    4.575    4.579
 ls_scf_post                          1  4.0    0.000    0.001    3.589    3.593
 dbcsr_matrix_vector_mult           304  9.0    0.005    0.017    3.252    3.451
 ls_scf_store_result                  1  5.0    0.000    0.000    3.258    3.411
 dbcsr_matrix_vector_mult_local     304 10.0    2.746    3.224    2.748    3.225
 multiply_cannon_metrocomm1        1332  9.7    0.003    0.004    1.215    3.186
 ls_scf_dm_to_ks                      2  5.0    0.001    0.027    2.664    2.757
 make_images_pack                   222  9.7    2.022    2.414    2.024    2.416
 mp_allgather_i34                   111  8.7    0.997    2.351    0.997    2.351
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    2.116    2.118
 rebuild_ks_matrix                    3  7.3    0.000    0.000    2.103    2.105
 qs_ks_build_kohn_sham_matrix         3  8.3    0.000    0.001    2.103    2.105
 dbcsr_sort_data                    436 11.2    1.804    2.045    1.804    2.045
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="502", plot="h2o_32_nrep3_ls", label="(8n/6r/2t)", y=92.521000, yerr=0.000000
PlotPoint: name="503", plot="h2o_32_nrep3_ls_mem", label="(8n/6r/2t)", y=1748.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/23/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      8.404608E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                353133       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   27282.1
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               2.696159E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   46176
 MPI messages size (bytes):
  total size                         1.924064E+12
  min size                           0.000000E+00
  max size                         108.059888E+06
  average size                      41.668048E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                9984                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                3328               1170063360
   4194304 < size <= 16777216                1870              19378539600
  16777216 < size                           30994            1903514987232
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 265470.
 MP_Allreduce         3058                  11181.
 MP_Sync                 4
 MP_Alltoall            47               23526250.
 MP_SendRecv            93                  57600.
 MP_ISendRecv           93                  57600.
 MP_Wait               639
 MP_ISend              462                 560046.
 MP_IRecv              462                 560662.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.218    0.375   96.993   97.031
 qs_energies                          1  2.0    0.005    0.023   95.471   95.511
 ls_scf                               1  3.0    0.004    0.033   93.787   93.798
 dbcsr_multiply_generic             111  6.7    0.016    0.022   77.773   78.052
 ls_scf_main                          1  4.0    0.003    0.020   58.467   58.488
 multiply_cannon                    111  7.7    0.043    0.105   53.324   57.308
 multiply_cannon_loop               111  8.7    0.099    0.105   49.755   53.880
 density_matrix_trs4                  2  5.0    0.003    0.021   52.466   52.652
 mp_waitall_1                      7281 11.0   24.586   35.149   24.586   35.149
 ls_scf_init_scf                      1  4.0    0.008    0.023   31.663   31.706
 ls_scf_init_matrix_S                 1  5.0    0.002    0.017   30.026   30.111
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   27.490   27.509
 multiply_cannon_multrec            888  9.7   12.557   15.090   21.183   24.309
 multiply_cannon_metrocomm3         888  9.7    0.004    0.004   11.298   24.121
 make_m2s                           222  7.7    0.006    0.010   17.491   18.697
 make_images                        222  8.7    1.974    2.298   17.452   18.658
 hybrid_alltoall_any                227 10.6    0.620    2.858    9.652   11.220
 make_images_data                   222  9.7    0.004    0.004    9.981   11.009
 dbcsr_mm_accdrv_process           3754 10.4    0.238    0.438    8.143    9.375
 mp_sum_l                           807  5.4    5.464    9.271    5.464    9.271
 dbcsr_mm_accdrv_process_sort      3754 11.4    7.768    8.937    7.768    8.937
 multiply_cannon_sync_h2d           888  9.7    6.035    7.560    6.035    7.560
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    4.191    7.238
 multiply_cannon_metrocomm4         777  9.7    0.004    0.005    2.441    6.959
 mp_irecv_dv                       2335 11.1    2.426    6.915    2.426    6.915
 multiply_cannon_metrocomm1         888  9.7    0.002    0.003    3.929    6.870
 apply_matrix_preconditioner          6  5.3    0.000    0.000    5.042    5.248
 arnoldi_extremal                     4  6.8    0.000    0.000    5.202    5.223
 arnoldi_normal_ev                    4  7.8    0.001    0.005    5.202    5.223
 build_subspace                      16  8.4    0.014    0.020    4.882    4.887
 calculate_norms                   1584  9.8    4.255    4.616    4.255    4.616
 mp_allgather_i34                   111  8.7    1.410    3.901    1.410    3.901
 dbcsr_matrix_vector_mult           304  9.0    0.005    0.016    3.507    3.818
 ls_scf_post                          1  4.0    0.006    0.027    3.653    3.667
 dbcsr_matrix_vector_mult_local     304 10.0    3.032    3.613    3.034    3.615
 ls_scf_store_result                  1  5.0    0.000    0.000    3.375    3.478
 ls_scf_dm_to_ks                      2  5.0    0.001    0.016    2.750    2.846
 make_images_sizes                  222  9.7    0.000    0.000    1.161    2.413
 mp_alltoall_i44                    222 10.7    1.161    2.413    1.161    2.413
 dbcsr_sort_data                    325 11.1    1.891    2.165    1.891    2.165
 make_images_pack                   222  9.7    1.838    2.136    1.840    2.139
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    2.067    2.108
 rebuild_ks_matrix                    3  7.3    0.000    0.000    2.048    2.089
 qs_ks_build_kohn_sham_matrix         3  8.3    0.015    0.068    2.048    2.089
 dbcsr_data_release                9322 10.9    1.318    1.952    1.318    1.952
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="504", plot="h2o_32_nrep3_ls", label="(8n/4r/3t)", y=97.031000, yerr=0.000000
PlotPoint: name="505", plot="h2o_32_nrep3_ls_mem", label="(8n/4r/3t)", y=2155.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/24/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     10.747127E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                369794       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   26053.0
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               3.357680E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                   50616
 MPI messages size (bytes):
  total size                         1.536549E+12
  min size                           0.000000E+00
  max size                          72.286792E+06
  average size                      30.356986E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               10368                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                1056                104411904
    131072 < size <=  4194304                3168                831638784
   4194304 < size <= 16777216                3103              33613273640
  16777216 < size                           32921            1501999894888
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 266696.
 MP_Allreduce         3058                  13371.
 MP_Sync                 4
 MP_Alltoall            47               30278988.
 MP_SendRecv            69                  86400.
 MP_ISendRecv           69                  86400.
 MP_Wait               531
 MP_ISend              378                 823502.
 MP_IRecv              378                 823753.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.271    0.395   98.777   98.789
 qs_energies                          1  2.0    0.001    0.011   97.337   97.364
 ls_scf                               1  3.0    0.004    0.017   95.572   95.589
 dbcsr_multiply_generic             111  6.7    0.016    0.017   78.763   79.046
 ls_scf_main                          1  4.0    0.000    0.002   59.135   59.140
 multiply_cannon                    111  7.7    0.046    0.099   51.989   56.579
 density_matrix_trs4                  2  5.0    0.003    0.018   53.025   53.141
 multiply_cannon_loop               111  8.7    0.115    0.125   46.850   49.586
 ls_scf_init_scf                      1  4.0    0.005    0.023   33.154   33.156
 ls_scf_init_matrix_S                 1  5.0    0.000    0.002   31.618   31.702
 mp_waitall_1                      6369 11.0   23.107   29.325   23.107   29.325
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   29.129   29.141
 multiply_cannon_multrec           1332  9.7   14.146   16.880   22.065   24.707
 make_m2s                           222  7.7    0.006    0.008   21.143   22.600
 make_images                        222  8.7    3.138    3.592   21.092   22.552
 multiply_cannon_metrocomm3        1332  9.7    0.003    0.003    9.470   17.367
 make_images_data                   222  9.7    0.004    0.004   11.846   13.469
 hybrid_alltoall_any                227 10.6    0.796    3.825   11.234   13.170
 dbcsr_mm_accdrv_process           3641 10.4    0.216    0.403    7.561    9.107
 dbcsr_mm_accdrv_process_sort      3641 11.4    7.191    8.689    7.191    8.689
 mp_sum_l                           807  5.4    4.120    7.855    4.120    7.855
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    3.132    6.127
 multiply_cannon_sync_h2d          1332  9.7    5.497    6.083    5.497    6.083
 multiply_cannon_metrocomm4        1110  9.7    0.004    0.006    2.063    5.964
 mp_irecv_dv                       3229 10.9    2.040    5.884    2.040    5.884
 arnoldi_extremal                     4  6.8    0.000    0.000    5.441    5.468
 arnoldi_normal_ev                    4  7.8    0.001    0.004    5.441    5.468
 build_subspace                      16  8.4    0.014    0.021    5.087    5.093
 mp_allgather_i34                   111  8.7    2.247    4.770    2.247    4.770
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.589    4.766
 multiply_cannon_metrocomm1        1332  9.7    0.003    0.003    2.620    4.698
 calculate_norms                   2376  9.8    4.188    4.492    4.188    4.492
 dbcsr_matrix_vector_mult           304  9.0    0.006    0.017    3.624    3.964
 dbcsr_matrix_vector_mult_local     304 10.0    3.208    3.714    3.211    3.715
 dbcsr_sort_data                    658 11.4    3.096    3.395    3.096    3.395
 ls_scf_post                          1  4.0    0.006    0.022    3.278    3.299
 dbcsr_special_finalize             555  9.7    0.006    0.007    2.840    3.169
 dbcsr_merge_single_wm              555 10.7    0.538    0.658    2.831    3.160
 ls_scf_store_result                  1  5.0    0.000    0.000    2.976    3.067
 ls_scf_dm_to_ks                      2  5.0    0.001    0.020    2.922    2.989
 dbcsr_data_release               10477 10.7    1.584    2.432    1.584    2.432
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    2.077    2.079
 rebuild_ks_matrix                    3  7.3    0.000    0.000    2.054    2.056
 qs_ks_build_kohn_sham_matrix         3  8.3    0.004    0.016    2.054    2.056
 dbcsr_finalize                     304  7.8    0.049    0.061    1.806    1.983
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="506", plot="h2o_32_nrep3_ls", label="(8n/3r/4t)", y=98.789000, yerr=0.000000
PlotPoint: name="507", plot="h2o_32_nrep3_ls_mem", label="(8n/3r/4t)", y=2704.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/25/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     15.383312E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                336818       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   28603.7
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               4.643148E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                   10656
 MPI messages size (bytes):
  total size                         1.149035E+12
  min size                           0.000000E+00
  max size                         203.538048E+06
  average size                     107.829832E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                2304                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                 768                702038016
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            7584            1148332810224
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                2                     12.
 MP_Allreduce          705                    128.
 MP_Alltoall           310               12920694.
 MP_ISend             1776               40180424.
 MP_IRecv             1776               40465030.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 265558.
 MP_Allreduce         3049                  15663.
 MP_Sync                 4
 MP_Alltoall            47               46208988.
 MP_SendRecv            45                 115200.
 MP_ISendRecv           45                 115200.
 MP_Wait               528
 MP_ISend              420                 924980.
 MP_IRecv              420                 924528.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.086    0.112   99.221   99.228
 qs_energies                          1  2.0    0.000    0.000   98.204   98.210
 ls_scf                               1  3.0    0.000    0.000   96.283   96.288
 dbcsr_multiply_generic             111  6.7    0.017    0.018   77.858   78.076
 ls_scf_main                          1  4.0    0.000    0.000   62.042   62.043
 multiply_cannon                    111  7.7    0.070    0.170   55.275   60.886
 density_matrix_trs4                  2  5.0    0.002    0.003   54.981   55.080
 multiply_cannon_loop               111  8.7    0.069    0.077   50.682   52.517
 mp_waitall_1                      5436 11.0   26.444   32.423   26.444   32.423
 ls_scf_init_scf                      1  4.0    0.000    0.000   30.662   30.668
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   29.461   29.493
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   27.292   27.299
 multiply_cannon_multrec            444  9.7   13.925   16.194   21.035   23.654
 make_m2s                           222  7.7    0.004    0.005   17.695   20.250
 make_images                        222  8.7    3.723    4.415   17.633   20.191
 multiply_cannon_metrocomm1         444  9.7    0.002    0.002   11.160   16.206
 multiply_cannon_metrocomm3         444  9.7    0.001    0.001    6.135   15.239
 make_images_data                   222  9.7    0.003    0.004    9.949   12.435
 hybrid_alltoall_any                227 10.6    0.790    3.772    9.774   12.319
 dbcsr_mm_accdrv_process           3003 10.4    0.186    0.347    6.817    7.941
 multiply_cannon_sync_h2d           444  9.7    6.500    7.892    6.500    7.892
 dbcsr_mm_accdrv_process_sort      3003 11.4    6.503    7.597    6.503    7.597
 mp_allgather_i34                   111  8.7    2.813    6.994    2.813    6.994
 arnoldi_extremal                     4  6.8    0.000    0.000    5.936    5.944
 arnoldi_normal_ev                    4  7.8    0.001    0.005    5.936    5.944
 build_subspace                      16  8.4    0.015    0.020    5.532    5.547
 mp_sum_l                           807  5.4    2.942    5.353    2.942    5.353
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.612    4.752
 dbcsr_matrix_vector_mult           304  9.0    0.007    0.016    4.263    4.440
 dbcsr_matrix_vector_mult_local     304 10.0    3.739    4.218    3.741    4.220
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    2.017    4.029
 multiply_cannon_metrocomm4         333  9.7    0.001    0.002    1.673    3.864
 mp_irecv_dv                       1241 11.2    1.653    3.819    1.653    3.819
 calculate_norms                    792  9.8    3.540    3.684    3.540    3.684
 ls_scf_post                          1  4.0    0.000    0.000    3.578    3.584
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    3.417    3.525
 make_images_sizes                  222  9.7    0.000    0.000    1.101    3.475
 mp_alltoall_i44                    222 10.7    1.100    3.475    1.100    3.475
 ls_scf_store_result                  1  5.0    0.000    0.000    3.356    3.407
 dbcsr_finalize                     304  7.8    0.062    0.078    2.208    2.310
 dbcsr_merge_all                    275  8.9    0.473    0.531    2.054    2.142
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    1.988    1.989
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="508", plot="h2o_32_nrep3_ls", label="(8n/2r/6t)", y=99.228000, yerr=0.000000
PlotPoint: name="509", plot="h2o_32_nrep3_ls_mem", label="(8n/2r/6t)", y=3662.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3_performance_tests/26/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     30.358840E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                339931       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   28341.7
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               8.781820E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                    4440
 MPI messages size (bytes):
  total size                       770.525954E+09
  min size                           0.000000E+00
  max size                         399.069120E+06
  average size                     173.541888E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 640                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                 640                468025344
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            3160             770057961712
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 284111.
 MP_Allreduce         3043                  21950.
 MP_Sync                 4
 MP_Alltoall            47               88727262.
 MP_SendRecv            42                 732600.
 MP_ISendRecv           42                 732600.
 MP_Wait               267
 MP_ISend              180                3337386.
 MP_IRecv              180                3339494.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.101    0.191  112.205  112.249
 qs_energies                          1  2.0    0.000    0.000  110.713  110.727
 ls_scf                               1  3.0    0.000    0.000  107.823  107.836
 dbcsr_multiply_generic             111  6.7    0.023    0.026   80.937   81.055
 ls_scf_main                          1  4.0    0.000    0.000   69.235   69.236
 density_matrix_trs4                  2  5.0    0.002    0.003   59.849   59.912
 multiply_cannon                    111  7.7    0.144    0.301   52.277   53.710
 multiply_cannon_loop               111  8.7    0.067    0.069   48.968   50.276
 ls_scf_init_scf                      1  4.0    0.000    0.000   34.724   34.725
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   33.364   33.375
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   30.477   30.489
 mp_waitall_1                      4527 11.1   25.600   28.874   25.600   28.874
 make_m2s                           222  7.7    0.005    0.005   24.800   25.639
 make_images                        222  8.7    4.591    4.955   24.694   25.531
 multiply_cannon_multrec            444  9.7   17.777   18.645   22.421   23.086
 hybrid_alltoall_any                227 10.6    1.661    3.611   13.619   16.628
 make_images_data                   222  9.7    0.003    0.003   13.936   16.321
 multiply_cannon_metrocomm3         444  9.7    0.001    0.001   12.828   13.928
 multiply_cannon_sync_h2d           444  9.7    8.844    8.898    8.844    8.898
 arnoldi_extremal                     4  6.8    0.000    0.000    7.958    7.970
 arnoldi_normal_ev                    4  7.8    0.003    0.009    7.958    7.970
 build_subspace                      16  8.4    0.025    0.036    7.399    7.409
 dbcsr_matrix_vector_mult           304  9.0    0.009    0.025    5.935    6.098
 apply_matrix_preconditioner          6  5.3    0.000    0.000    5.282    5.533
 dbcsr_matrix_vector_mult_local     304 10.0    5.136    5.468    5.139    5.470
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    5.154    5.304
 dbcsr_mm_accdrv_process           1814 10.4    0.206    0.318    4.479    4.617
 dbcsr_mm_accdrv_process_sort      1814 11.4    4.182    4.320    4.182    4.320
 ls_scf_post                          1  4.0    0.000    0.000    3.863    3.877
 make_images_sizes                  222  9.7    0.000    0.000    1.536    3.734
 mp_alltoall_i44                    222 10.7    1.535    3.734    1.535    3.734
 ls_scf_store_result                  1  5.0    0.000    0.000    3.571    3.580
 calculate_norms                    792  9.8    3.247    3.283    3.247    3.283
 dbcsr_finalize                     304  7.8    0.082    0.089    3.087    3.126
 dbcsr_complete_redistribute          5  7.6    1.428    1.472    2.894    3.041
 dbcsr_merge_all                    275  8.9    0.890    0.915    2.873    2.911
 qs_energies_init_hamiltonians        1  3.0    0.008    0.014    2.860    2.860
 mp_allgather_i34                   111  8.7    0.947    2.845    0.947    2.845
 dbcsr_data_release               12724 10.6    2.323    2.830    2.323    2.830
 matrix_ls_to_qs                      2  6.0    0.000    0.000    2.531    2.695
 dbcsr_sort_data                    325 11.1    2.443    2.501    2.443    2.501
 dbcsr_new_transposed                 4  7.5    0.260    0.287    2.328    2.342
 mp_sum_l                           807  5.4    1.558    2.282    1.558    2.282
 mp_alltoall_d11v                    48  9.2    2.198    2.251    2.198    2.251
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="510", plot="h2o_32_nrep3_ls", label="(8n/1r/12t)", y=112.249000, yerr=0.000000
PlotPoint: name="511", plot="h2o_32_nrep3_ls_mem", label="(8n/1r/12t)", y=6851.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


========= END RESULTS ===========

CommitSHA: 7c2cbadb1dfa2ab43dbf397fe5b72069add6d4c3
Summary: empty
Status: OK