=== This is the CP2K Performance-Test ===


Already up to date.
Current branch master is up to date.


Already up to date.
Current branch master is up to date.

 GIT Revision: 1501a5cde42d9e664f251d02093fe2fc81c3abfc


################# ARCHITECTURE FILE ##################
#!/bin/bash
#
# CP2K arch file for Cray-XC50 (Piz Daint, CSCS, GPU partition)
#
# Tested with: GNU 9.3.0, Cray-MPICH 7.7.18, Cray-libsci 20.09.1, Cray-FFTW 3.3.8.10,
#              COSMA 2.6.6, ELPA 2022.11.001, LIBINT 2.6.0, LIBPEXSI 1.2.0,
#              LIBXC 6.2.2, LIBVORI 220621, LIBXSMM 1.17, PLUMED 2.8.2,
#              SIRIUS 7.4.3, SPGLIB 1.16.2
#
# Usage: Source this arch file and then run make as instructed.
#        A full toolchain installation is performed as default.
#        Replace or adapt the "module add" commands below if needed.
#
# Last update: 21.06.2023
#
# Dual-use layout: the "# \" line below opens a backslash-continued block.
# For make, a comment line that ends in a backslash continues on the next
# line, so everything down to the final "return" is one long comment.
# For bash, only the "# \" line itself is a comment; the block after it is
# executed when the file is sourced, and the closing "return" stops bash
# before it reaches the make syntax further down.
# Do not insert comment lines inside the continued block: every line in it
# has to end with a backslash and has to stay valid shell code.
#
# Steps performed when sourced, in order:
#   1. abort unless the file is sourced (not executed)
#   2. take the GCC version from the first argument (default 9.3.0)
#   3. load the modules and save them as collection "cp2k_gpu_gnu_psmp"
#   4. run the toolchain installer in tools/toolchain
#   5. source tools/toolchain/install/setup
#   6. print the suggested make command lines
#
# NOTE(review): ${maxtasks} is passed to the installer's -j option but is not
# set anywhere in this file; presumably it comes from the calling environment.
# NOTE(review): the "cd tools/toolchain; ...; cd ../.." steps are not guarded,
# so the file apparently has to be sourced from the CP2K top-level directory.
#
# \
   if [ "${0}" = "${BASH_SOURCE}" ]; then \
      echo "ERROR: Script ${0##*/} must be sourced"; \
      echo "Usage: source ${0##*/}"; \
      exit 1; \
   fi; \
   this_file=${BASH_SOURCE##*/}; \
   if [ -n "${1}" ]; then \
      gcc_version="${1}"; \
   else \
      gcc_version="9.3.0"; \
   fi; \
   module add daint-gpu; \
   module rm PrgEnv-cray; \
   module add PrgEnv-gnu; \
   module rm gcc; \
   module add gcc/${gcc_version}; \
   module add cray-fftw/3.3.8.10; \
   module add cudatoolkit; \
   echo "Expected setup:"; \
   echo "   cray-mpich/7.7.18"; \
   echo "   craype-haswell"; \
   echo "   daint-gpu/21.09"; \
   echo "   craype/2.7.10"; \
   echo "   cray-libsci/20.09.1"; \
   echo "   PrgEnv-gnu/6.0.10"; \
   echo "   gcc/${gcc_version}"; \
   echo "   cray-fftw/3.3.8.10"; \
   echo "   cudatoolkit/11.0.2_3.38-8.1__g5b73779"; \
   module list; \
   module -f save cp2k_gpu_gnu_psmp; \
   echo "To load the required modules in your batch job script, use:"; \
   echo "   module restore cp2k_gpu_gnu_psmp"; \
   cd tools/toolchain; \
   ./install_cp2k_toolchain.sh --enable-cuda=yes --gpu-ver=P100 -j${maxtasks} --no-arch-files --with-gcc=system --with-libvdwxc --with-pexsi --with-plumed; \
   cd ../..; \
   printf "Sourcing ${PWD}/tools/toolchain/install/setup ... "; \
   source ${PWD}/tools/toolchain/install/setup; \
   printf "done\n"; \
   echo "Check the output above for error messages and consistency!"; \
   echo; \
   echo "If everything is OK, you can build a CP2K production binary with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.}"; \
   echo; \
   echo "Alternatively, you can add further checks, e.g. for regression testing, with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.} DO_CHECKS=yes"; \
   echo "or build CP2K as a library with"; \
   echo "   make -j ARCH=${this_file%.*} VERSION=${this_file##*.} libcp2k"; \
   echo; \
   return

# Set options
# Switches take "yes" or "no". Each USE_<package> holds the version to link;
# an empty or commented-out value leaves that package out of the build.
DO_CHECKS      := no
USE_ACC        := yes

# GPU settings (not referenced again in this file; presumably read by the
# including Makefile)
GPUVER         := P100
OFFLOAD_TARGET := cuda

# LMAX selects the LIBINT build (lmax-<LMAX> in its install path);
# MAX_CONTR is passed on as -D__MAX_CONTR
LMAX           := 5
MAX_CONTR      := 4

# Package versions
USE_COSMA      := 2.6.6
USE_ELPA       := 2022.11.001
USE_LIBINT     := 2.6.0
USE_LIBPEXSI   := 1.2.0
USE_LIBVORI    := 220621
USE_LIBXC      := 6.2.2
USE_LIBXSMM    := 1.17
USE_PLUMED     := 2.8.2
#USE_QUIP       := 0.9.10
USE_SIRIUS     := 7.4.3
USE_SPGLIB     := 1.16.2

# Versions of the extra packages used only together with LIBPEXSI
SCOTCH_VER     := 6.0.0
SUPERLU_VER    := 6.1.0

# Versions of the extra packages used only together with SIRIUS
HDF5_VER       := 1.12.0
LIBVDWXC_VER   := 0.4.0
SPFFT_VER      := 1.0.6
SPLA_VER       := 1.5.5

# Compiler, linker and archiver commands. Linking uses the same command as
# Fortran compilation; OFFLOAD_CC is set to nvcc.
FC             := ftn
LD             := ftn
CC             := cc
CXX            := CC
OFFLOAD_CC     := nvcc
AR             := ar -r

# Prefix of the toolchain installation. $(PWD) is taken from the environment,
# so make presumably has to be started from the CP2K top-level directory
# (TODO confirm).
INSTALL_PATH   := $(PWD)/tools/toolchain/install

# No -march option here: the cc, CC, and ftn wrappers already supply it
CFLAGS         := -O2 -fopenmp -fopenmp-simd -ftree-vectorize -funroll-loops -g

# Preprocessor defines that are always set
DFLAGS         := -D__parallel -D__SCALAPACK -D__FFTW3 -D__MAX_CONTR=$(strip $(MAX_CONTR))

# Checked builds (DO_CHECKS=yes) additionally define __CHECK_DIAG
ifeq "$(DO_CHECKS)" "yes"
   DFLAGS         += -D__CHECK_DIAG
endif

# Defines for accelerated builds (USE_ACC=yes). __NO_OFFLOAD_PW is included
# because PW_CUDA possibly brings no performance gain at present.
ifeq "$(USE_ACC)" "yes"
   DFLAGS         += -D__DBCSR_ACC -D__OFFLOAD_CUDA -D__NO_OFFLOAD_PW
endif

# PLUMED (static library from the toolchain). Enabling it also sets USE_GSL,
# which switches on the GSL settings further down in this file.
ifneq ($(USE_PLUMED),)
   USE_GSL        := 2.7
   USE_PLUMED     := $(strip $(USE_PLUMED))
   DFLAGS         += -D__PLUMED2
   PLUMED_LIB     := $(INSTALL_PATH)/plumed-$(USE_PLUMED)/lib
   LIBS           += $(PLUMED_LIB)/libplumed.a
endif

# ELPA. Include and library paths are looked up below
# elpa-<version>/<TARGET>. The NVIDIA build (and -D__ELPA_NVIDIA_GPU) is
# selected only when USE_ACC is "yes", in line with the SIRIUS and COSMA
# settings in this file; previously TARGET was fixed to "nvidia", so a
# USE_ACC=no build still picked the GPU variant.
# NOTE(review): the non-GPU build is assumed to be installed under "cpu";
# confirm against the toolchain's ELPA install script.
ifneq ($(USE_ELPA),)
   USE_ELPA       := $(strip $(USE_ELPA))
   ifeq ($(USE_ACC), yes)
      TARGET         := nvidia
   else
      TARGET         := cpu
   endif
   ELPA_INC       := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/include/elpa-$(USE_ELPA)
   ELPA_LIB       := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/lib
   CFLAGS         += -I$(ELPA_INC)/elpa -I$(ELPA_INC)/modules
   DFLAGS         += -D__ELPA
   ifeq ($(TARGET), nvidia)
      DFLAGS         += -D__ELPA_NVIDIA_GPU
   endif
   LIBS           += $(ELPA_LIB)/libelpa.a
endif

# QUIP and the FoX libraries linked with it (inactive while USE_QUIP is unset)
ifneq ($(USE_QUIP),)
   USE_QUIP       := $(strip $(USE_QUIP))
   QUIP_LIB       := $(INSTALL_PATH)/quip-$(USE_QUIP)/lib
   QUIP_INC       := $(INSTALL_PATH)/quip-$(USE_QUIP)/include
   DFLAGS         += -D__QUIP
   CFLAGS         += -I$(QUIP_INC)
   # Link order is kept as listed
   LIBS           += $(addprefix $(QUIP_LIB)/,libquip_core.a libatoms.a \
                       libFoX_sax.a libFoX_common.a libFoX_utils.a libFoX_fsys.a)
endif

# LIBPEXSI together with the SuperLU_DIST and Scotch libraries linked with it
ifneq ($(USE_LIBPEXSI),)
   USE_LIBPEXSI   := $(strip $(USE_LIBPEXSI))
   SUPERLU_VER    := $(strip $(SUPERLU_VER))
   SCOTCH_VER     := $(strip $(SCOTCH_VER))
   # Include directories
   LIBPEXSI_INC   := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/include
   SCOTCH_INC     := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/include
   SUPERLU_INC    := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/include
   # Library directories
   LIBPEXSI_LIB   := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/lib
   SCOTCH_LIB     := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/lib
   SUPERLU_LIB    := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/lib
   DFLAGS         += -D__LIBPEXSI
   CFLAGS         += -I$(LIBPEXSI_INC)
   CFLAGS         += -I$(SCOTCH_INC)
   CFLAGS         += -I$(SUPERLU_INC)
   # Link order is kept as listed: PEXSI, SuperLU_DIST, then the Scotch set
   LIBS           += $(LIBPEXSI_LIB)/libpexsi.a $(SUPERLU_LIB)/libsuperlu_dist.a
   LIBS           += $(addprefix $(SCOTCH_LIB)/,libptscotchparmetis.a \
                       libptscotch.a libptscotcherr.a libscotchmetis.a libscotch.a)
endif

# LIBVORI: one define and one static library, no include directory
ifneq ($(USE_LIBVORI),)
   USE_LIBVORI    := $(strip $(USE_LIBVORI))
   DFLAGS         += -D__LIBVORI
   LIBVORI_LIB    := $(INSTALL_PATH)/libvori-$(USE_LIBVORI)/lib
   LIBS           += $(LIBVORI_LIB)/libvori.a
endif

# LIBXC: the Fortran 2003 interface library is listed before the core library
ifneq ($(USE_LIBXC),)
   USE_LIBXC      := $(strip $(USE_LIBXC))
   LIBXC_LIB      := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/lib
   LIBXC_INC      := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/include
   DFLAGS         += -D__LIBXC
   CFLAGS         += -I$(LIBXC_INC)
   LIBS           += $(LIBXC_LIB)/libxcf03.a $(LIBXC_LIB)/libxc.a
endif

# LIBINT: the install directory name encodes both the version and LMAX
ifneq ($(USE_LIBINT),)
   LMAX           := $(strip $(LMAX))
   USE_LIBINT     := $(strip $(USE_LIBINT))
   LIBINT_LIB     := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/lib
   LIBINT_INC     := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/include
   DFLAGS         += -D__LIBINT
   CFLAGS         += -I$(LIBINT_INC)
   LIBS           += $(LIBINT_LIB)/libint2.a
endif

# SPGLIB (the static library is named libsymspg.a)
ifneq ($(USE_SPGLIB),)
   USE_SPGLIB     := $(strip $(USE_SPGLIB))
   SPGLIB_LIB     := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/lib
   SPGLIB_INC     := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/include
   DFLAGS         += -D__SPGLIB
   CFLAGS         += -I$(SPGLIB_INC)
   LIBS           += $(SPGLIB_LIB)/libsymspg.a
endif

# LIBXSMM: the Fortran interface library is listed before the core library
ifneq ($(USE_LIBXSMM),)
   USE_LIBXSMM    := $(strip $(USE_LIBXSMM))
   LIBXSMM_LIB    := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/lib
   LIBXSMM_INC    := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/include
   DFLAGS         += -D__LIBXSMM
   CFLAGS         += -I$(LIBXSMM_INC)
   LIBS           += $(LIBXSMM_LIB)/libxsmmf.a $(LIBXSMM_LIB)/libxsmm.a
endif

# SIRIUS and the packages set up with it (HDF5, libvdwxc, SpFFT, SpLA).
# With USE_ACC=yes the SpFFT/SpLA/SIRIUS library paths and the SIRIUS include
# path point to their "cuda" sub-directories and __OFFLOAD_GEMM is defined.
ifneq ($(USE_SIRIUS),)
   # Normalise all version strings first
   USE_SIRIUS     := $(strip $(USE_SIRIUS))
   HDF5_VER       := $(strip $(HDF5_VER))
   LIBVDWXC_VER   := $(strip $(LIBVDWXC_VER))
   SPFFT_VER      := $(strip $(SPFFT_VER))
   SPLA_VER       := $(strip $(SPLA_VER))
   # Paths that do not depend on USE_ACC
   HDF5_LIB       := $(INSTALL_PATH)/hdf5-$(HDF5_VER)/lib
   LIBVDWXC_INC   := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/include
   LIBVDWXC_LIB   := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/lib
   SPFFT_INC      := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/include
   SPLA_INC       := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/include/spla
   # Start from the non-accelerated locations ...
   SPFFT_LIB      := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib
   SPLA_LIB       := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib
   SIRIUS_INC     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include
   SIRIUS_LIB     := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib
   # ... and descend into "cuda" for accelerated builds
   ifeq "$(USE_ACC)" "yes"
      DFLAGS         += -D__OFFLOAD_GEMM
      SPFFT_LIB      := $(SPFFT_LIB)/cuda
      SPLA_LIB       := $(SPLA_LIB)/cuda
      SIRIUS_INC     := $(SIRIUS_INC)/cuda
      SIRIUS_LIB     := $(SIRIUS_LIB)/cuda
   endif
   CFLAGS         += -I$(LIBVDWXC_INC) -I$(SPFFT_INC) -I$(SPLA_INC) -I$(SIRIUS_INC)
   DFLAGS         += -D__HDF5 -D__LIBVDWXC -D__SPFFT -D__SPLA -D__SIRIUS
   # Link order is kept as listed
   LIBS           += $(SIRIUS_LIB)/libsirius.a $(SPLA_LIB)/libspla.a
   LIBS           += $(SPFFT_LIB)/libspfft.a $(LIBVDWXC_LIB)/libvdwxc.a
   LIBS           += $(HDF5_LIB)/libhdf5.a
endif

# COSMA. For accelerated builds "-cuda" is appended to USE_COSMA itself, so
# the install directory becomes COSMA-<version>-cuda.
ifneq ($(USE_COSMA),)
   USE_COSMA      := $(strip $(USE_COSMA))
   ifeq "$(USE_ACC)" "yes"
      USE_COSMA      := $(USE_COSMA)-cuda
   endif
   COSMA_LIB      := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/lib
   COSMA_INC      := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/include
   DFLAGS         += -D__COSMA
   CFLAGS         += -I$(COSMA_INC)
   # Link order is kept as listed
   LIBS           += $(addprefix $(COSMA_LIB)/,libcosma_prefixed_pxgemm.a \
                       libcosma.a libcosta.a libTiled-MM.a)
endif

# GSL: active only when USE_GSL has been set (in this file the PLUMED
# settings do that)
ifneq ($(USE_GSL),)
   USE_GSL        := $(strip $(USE_GSL))
   GSL_LIB        := $(INSTALL_PATH)/gsl-$(USE_GSL)/lib
   GSL_INC        := $(INSTALL_PATH)/gsl-$(USE_GSL)/include
   DFLAGS         += -D__GSL
   CFLAGS         += -I$(GSL_INC)
   LIBS           += $(GSL_LIB)/libgsl.a
endif

# Offload compiler flags: the collected defines plus nvcc-specific options
OFFLOAD_FLAGS  := $(DFLAGS) -O3 -Xcompiler="-fopenmp" -arch sm_60 --std=c++11

# The C flags take over all defines; C++ and Fortran flags start from them
CFLAGS         += $(DFLAGS)
CXXFLAGS       := $(CFLAGS) -std=c++11
FCFLAGS        := $(CFLAGS)

# -fallow-argument-mismatch is added only when the major version reported by
# "gcc -dumpversion" is greater than 9
ifeq ($(shell [ $(shell gcc -dumpversion | cut -d. -f1) -gt 9 ] && echo yes), yes)
   FCFLAGS        += -fallow-argument-mismatch
endif

# Remaining Fortran-only options
FCFLAGS        += -fbacktrace -ffree-form -ffree-line-length-none
FCFLAGS        += -fno-omit-frame-pointer -std=f2008

# Link with the Fortran flags; when CUDA_HOME is set, its lib64 directory is
# added both as a library search path and as an rpath
LDFLAGS        := $(FCFLAGS)
ifneq ($(CUDA_HOME),)
   CUDA_LIB       := $(CUDA_HOME)/lib64
   LDFLAGS        += -L$(CUDA_LIB) -Wl,-rpath=$(CUDA_LIB)
endif

# CUDA libraries first, then the system libraries
LIBS           += -lcusolver -lcudart -lnvrtc -lcuda -lcufft -lcublas
LIBS           += -lrt -lz -ldl -lpthread -lstdc++

# End
############### END ARCHITECTURE FILE ################


===== TESTS (description) =====
 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 RI-RPA/RI-MP2 correlation energy
 input file: benchmarks/QS_mp2_rpa/32-H2O/RI-RPA.inp
 required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-dRPA-TZ.inp']
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 15
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/01
 job id: 47541074
 --- Point ---
 name: 10
 plot: h2o_32_ri_rpa_mp2
 regex: Total RI-RPA Time= 
 label: RI-RPA (8n/2r/6t)
 --- Point ---
 name: 11
 plot: h2o_32_ri_rpa_mp2_mem
 regex: Estimated peak process memory 
 label: RI-RPA (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 RI-RPA/RI-MP2 correlation energy
 input file: benchmarks/QS_mp2_rpa/32-H2O/RI-MP2.inp
 required files: ['benchmarks/QS_mp2_rpa/32-H2O/BASIS_H2O', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32.xyz', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-PBE-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-HF-TZ.inp', 'benchmarks/QS_mp2_rpa/32-H2O/H2O-32-RI-MP2-TZ.inp']
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 15
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/02
 job id: 47541076
 --- Point ---
 name: 20
 plot: h2o_32_ri_rpa_mp2
 regex: Total MP2 Time= 
 label: RI-MP2 (8n/6r/2t)
 --- Point ---
 name: 21
 plot: h2o_32_ri_rpa_mp2_mem
 regex: Estimated peak process memory 
 label: RI-MP2 (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/03
 job id: 47541078
 --- Point ---
 name: 100
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 101
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/04
 job id: 47541081
 --- Point ---
 name: 102
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 103
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/05
 job id: 47541083
 --- Point ---
 name: 104
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 105
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/06
 job id: 47541085
 --- Point ---
 name: 106
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 107
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/07
 job id: 47541087
 --- Point ---
 name: 108
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 109
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-64 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-64.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 5
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/08
 job id: 47541089
 --- Point ---
 name: 110
 plot: h2o_64_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 111
 plot: h2o_64_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/09
 job id: 47541090
 --- Point ---
 name: 200
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 201
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/10
 job id: 47541091
 --- Point ---
 name: 202
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 203
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/11
 job id: 47541092
 --- Point ---
 name: 204
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 205
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/12
 job id: 47541094
 --- Point ---
 name: 206
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 207
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/13
 job id: 47541095
 --- Point ---
 name: 208
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 209
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-128 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-128.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/14
 job id: 47541097
 --- Point ---
 name: 210
 plot: h2o_128_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 211
 plot: h2o_128_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/15
 job id: 47541099
 --- Point ---
 name: 400
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 401
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/16
 job id: 47541100
 --- Point ---
 name: 402
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 403
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/17
 job id: 47541101
 --- Point ---
 name: 404
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 405
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/18
 job id: 47541104
 --- Point ---
 name: 406
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 407
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/19
 job id: 47541105
 --- Point ---
 name: 408
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 409
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-256 test - DBCSR dominated (MPI/OMP)
 input file: benchmarks/QS/H2O-256.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 30
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/20
 job id: 47541106
 --- Point ---
 name: 410
 plot: h2o_256_md
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 411
 plot: h2o_256_md_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/21
 job id: 47541108
 --- Point ---
 name: 500
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/12r/1t)
 --- Point ---
 name: 501
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 6
 # threads/rank = 2
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/22
 job id: 47541109
 --- Point ---
 name: 502
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/6r/2t)
 --- Point ---
 name: 503
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/6r/2t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 4
 # threads/rank = 3
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/23
 job id: 47541111
 --- Point ---
 name: 504
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/4r/3t)
 --- Point ---
 name: 505
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/4r/3t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 3
 # threads/rank = 4
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/24
 job id: 47541114
 --- Point ---
 name: 506
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/3r/4t)
 --- Point ---
 name: 507
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/3r/4t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 2
 # threads/rank = 6
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/25
 job id: 47541117
 --- Point ---
 name: 508
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/2r/6t)
 --- Point ---
 name: 509
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/2r/6t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: H2O-32 (NREP 3) linear scaling test (864 H2O)
 input file: benchmarks/QS_DM_LS/H2O-dft-ls.inp
 required files: []
 output file: result.log
 # nodes = 8
 # ranks/node = 1
 # threads/rank = 12
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/26
 job id: 47541118
 --- Point ---
 name: 510
 plot: h2o_32_nrep3_ls
 regex: CP2K  
 label: (8n/1r/12t)
 --- Point ---
 name: 511
 plot: h2o_32_nrep3_ls_mem
 regex: Estimated peak process memory 
 label: (8n/1r/12t)
 ~~~~~~~ END TEST ~~~~~~~

 ~~~~~~~~~ TEST ~~~~~~~~~
 description: 512 H2O (4 NVE MD steps on 64 nodes)
 input file: benchmarks/QS/00512_H2O/H2O-512_md.inp
 required files: []
 output file: result.log
 # nodes = 64
 # ranks/node = 12
 # threads/rank = 1
 nrepeat = 1
 time[min] = 10
 run dir: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/27
 job id: 47541122
 --- Point ---
 name: 601
 plot: h2o_512_md
 regex: CP2K  
 label: (64n/12r/1t)
 --- Point ---
 name: 602
 plot: h2o_512_md_mem
 regex: Estimated peak process memory 
 label: (64n/12r/1t)
 ~~~~~~~ END TEST ~~~~~~~

=== END TESTS (description) ===


===== PLOTS (description) =====
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_ri_rpa_mp2", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_ri_rpa_mp2_mem", title="32 H2O molecules (RI-MP2, RI-RPA)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_64_md", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_64_md_mem", title="64 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_128_md", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_128_md_mem", title="128 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_256_md", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_256_md_mem", title="256 H2O molecules (10 MD steps)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_nrep3_ls", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_32_nrep3_ls_mem", title="864 H2O molecules (LS SCF)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_512_md", title="512 H2O (4 NVE MD steps on 64 nodes)", xlabel="Revision", ylabel="Time [s]"
 ~~~~~~~~~ PLOT ~~~~~~~~~
Plot: name="h2o_512_md_mem", title="512 H2O (4 NVE MD steps on 64 nodes)", xlabel="Revision", ylabel="Est. peak process memory [MiB]"
=== END PLOTS (description) ===


============ RESULTS ============
 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/01/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         0.000000E+00       0.0%      0.0%      0.0%
 flops max/rank                      0.000000E+00       0.0%      0.0%      0.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                                  0       0.0%      0.0%      0.0%
 number of processed stacks                     0       0.0%      0.0%      0.0%
 average stack size                                     0.0       0.0       0.0
 marketing flops                     0.000000E+00
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                1                     12.
 MP_Allreduce           19                     21.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               15                 177869.
 MP_Allreduce          424                      8.
 MP_Sync                 3
 MP_comm_split           1
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.021    0.036  134.533  134.534
 farming_run                          1  2.0  133.933  133.935  134.503  134.507
 -------------------------------------------------------------------------------


 @@@@@@@@@@ Run number: 2 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32              4194304       0.0%      0.0%    100.0%
 flops    14 x    32 x    32            154140672       0.0%      0.0%    100.0%
 flops    29 x    32 x    32            159645696       0.0%      0.0%    100.0%
 flops    14 x    14 x    32            208732160       0.0%      0.0%    100.0%
 flops    29 x    14 x    32            212860928       0.0%      0.0%    100.0%
 flops    14 x    29 x    32            212860928       0.0%      0.0%    100.0%
 flops    29 x    29 x    32            227352576       0.0%      0.0%    100.0%
 flops    14 x    32 x    14         896801644032       0.0%      0.0%    100.0%
 flops    29 x    32 x    14         928925089792       0.0%      0.0%    100.0%
 flops    14 x    32 x    29         928925089792       0.0%      0.0%    100.0%
 flops    29 x    32 x    29         962100985856       0.0%      0.0%    100.0%
 flops    32 x    32 x    14        1693169221632       0.0%      0.0%    100.0%
 flops    32 x    32 x    29        1753639550976       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         7.164741E+12       0.0%      0.0%    100.0%
 flops max/rank                    447.801317E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          249492158       0.0%      0.0%    100.0%
 number of processed stacks                164328       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1518.3
 marketing flops                     7.165779E+12
 -------------------------------------------------------------------------------
 # multiplications                           1160
 max memory usage/rank               1.466986E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                    2592
 MPI messages size (bytes):
  total size                         1.140326E+09
  min size                           0.000000E+00
  max size                           1.663488E+06
  average size                     439.940750E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                 132                        0
       128 < size <=     8192                 348                  2850816
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                1536                179306496
    131072 < size <=  4194304                 576                958169088
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         2308                     54.
 MP_Alltoall          4670                 822215.
 MP_ISend             2604                  90577.
 MP_IRecv             2604                  90574.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group               12
 MP_Bcast              230                1134128.
 MP_Allreduce          571                1938539.
 MP_Sync                25
 MP_Alltoall            38                9316958.
 MP_SendRecv           120                 384007.
 MP_ISendRecv           45                 235435.
 MP_Wait               191
 MP_comm_split          10
 MP_ISend              127                3867574.
 MP_IRecv              127                3866554.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.006    0.025  115.285  115.286
 qs_energies                          1  2.0    0.000    0.000  115.019  115.019
 mp2_main                             1  3.0    0.000    0.000  112.854  112.854
 mp2_gpw_main                         1  4.0    0.019    0.023  111.857  111.858
 mp2_ri_gpw_compute_in                1  5.0    0.172    0.173   92.858   93.205
 mp2_ri_gpw_compute_in_loop           1  6.0    0.004    0.005   54.868   55.215
 mp2_eri_3c_integrate_gpw           272  7.0    0.154    0.169   41.317   46.420
 get_2c_integrals                     1  6.0    0.008    0.009   37.179   37.816
 integrate_v_rspace                 273  8.0    0.436    0.450   24.736   29.659
 pw_transfer                       6555 10.6    0.373    0.389   27.456   28.012
 fft_wrap_pw1pw2                   5465 11.4    0.046    0.048   26.088   26.480
 grid_integrate_task_list           273  9.0   20.549   25.938   20.549   25.938
 fft_wrap_pw1pw2_100               2178 12.4    1.231    1.445   23.607   24.026
 compute_2c_integrals                 1  7.0    0.002    0.002   19.732   19.733
 compute_2c_integrals_loop_lm         1  8.0    0.002    0.003   18.976   19.457
 mp2_eri_2c_integrate_gpw             1  9.0    2.371    2.446   18.974   19.454
 rpa_ri_compute_en                    1  5.0    0.019    0.019   18.893   19.025
 cp_fm_cholesky_decompose            12  8.2   17.378   18.001   17.378   18.001
 cholesky_decomp                      1  7.0    0.000    0.000   16.282   16.912
 fft3d_s                           5443 13.4   16.144   16.492   16.166   16.514
 ao_to_mo_and_store_B_mult_1        272  7.0   10.718   15.293   10.718   15.293
 calculate_wavefunction             272  8.0    5.401    5.543   12.512   13.142
 rpa_num_int                          1  6.0    0.000    0.000   10.634   10.635
 rpa_num_int_RPA_matrix_operati       8  7.0    0.000    0.000   10.594   10.627
 calc_mat_Q                           8  8.0    0.000    0.000    9.467    9.571
 contract_S_to_Q                      8  9.0    0.000    0.000    8.887    8.990
 calc_potential_gpw                 544  9.5    0.005    0.005    8.278    8.681
 mp2_eri_2c_integrate_gpw_pot_l     272 10.0    0.001    0.001    8.275    8.579
 parallel_gemm_fm                    14  9.1    0.000    0.000    8.483    8.569
 parallel_gemm_fm_cosma              14 10.1    8.483    8.569    8.483    8.569
 potential_pw2rs                    545 10.0    0.108    0.109    7.716    8.348
 create_integ_mat                     1  6.0    0.006    0.008    7.816    7.816
 collocate_single_gaussian          272 10.0    0.039    0.042    7.516    7.777
 array2fm                             1  7.0    0.000    0.000    6.912    7.245
 pw_scatter_s                      2720 13.7    4.420    4.567    4.420    4.567
 pw_gather_s                       2722 13.2    3.882    4.266    3.882    4.266
 array2fm_buffer_send                 1  8.0    2.988    3.146    2.988    3.146
 pw_poisson_solve                   545 10.5    1.103    1.146    2.179    2.377
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="10", plot="h2o_32_ri_rpa_mp2", label="RI-RPA (8n/2r/6t)", y=111.857932, yerr=0.000000
PlotPoint: name="11", plot="h2o_32_ri_rpa_mp2_mem", label="RI-RPA (8n/2r/6t)", y=2809.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/02/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         0.000000E+00       0.0%      0.0%      0.0%
 flops max/rank                      0.000000E+00       0.0%      0.0%      0.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                                  0       0.0%      0.0%      0.0%
 number of processed stacks                     0       0.0%      0.0%      0.0%
 average stack size                                     0.0       0.0       0.0
 marketing flops                     0.000000E+00
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                1                     12.
 MP_Allreduce           19                     21.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               22                 205321.
 MP_Allreduce          424                      9.
 MP_Sync                 4
 MP_comm_split           1
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.030    0.039  401.026  401.027
 farming_run                          1  2.0  399.672  399.685  400.975  400.980
 -------------------------------------------------------------------------------


 @@@@@@@@@@ Run number: 2 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32             16777216       0.0%      0.0%    100.0%
 flops    14 x    32 x    32            565182464       0.0%      0.0%    100.0%
 flops    29 x    32 x    32            585367552       0.0%      0.0%    100.0%
 flops    14 x    14 x    32            626196480       0.0%      0.0%    100.0%
 flops    29 x    14 x    32            638582784       0.0%      0.0%    100.0%
 flops    14 x    29 x    32            638582784       0.0%      0.0%    100.0%
 flops    29 x    29 x    32            682057728       0.0%      0.0%    100.0%
 flops    14 x    32 x    14         897827128576       0.0%      0.0%    100.0%
 flops    29 x    32 x    14         929989394432       0.0%      0.0%    100.0%
 flops    14 x    32 x    29         929989394432       0.0%      0.0%    100.0%
 flops    29 x    32 x    29         963203301376       0.0%      0.0%    100.0%
 flops    32 x    32 x    14        1693481172992       0.0%      0.0%    100.0%
 flops    32 x    32 x    29        1753962643456       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         7.172206E+12       0.0%      0.0%    100.0%
 flops max/rank                    150.696064E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          249788821       0.0%      0.0%    100.0%
 number of processed stacks                 98736       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    2529.9
 marketing flops                     7.174951E+12
 -------------------------------------------------------------------------------
 # multiplications                           1140
 max memory usage/rank               1.227526E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                   61440
 MPI messages size (bytes):
  total size                         6.073508E+09
  min size                           0.000000E+00
  max size                         642.960000E+03
  average size                      98.852664E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               32004                        0
       128 < size <=     8192                1820                 14909440
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072               18640               1081442304
    131072 < size <=  4194304                8976               4977156096
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         1003                     44.
 MP_Alltoall          1797                 713538.
 MP_ISend             3686                  54943.
 MP_IRecv             3622                  54292.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group               12
 MP_Bcast              757                 478553.
 MP_Allreduce         2021                  21391.
 MP_Sync                37
 MP_Alltoall            77               28402590.
 MP_SendRecv          2876                2171486.
 MP_ISendRecv         1034                 172620.
 MP_Wait              1346
 MP_comm_split           7
 MP_ISend              264                 362227.
 MP_IRecv              264                 362718.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.011    0.031  209.809  209.810
 qs_energies                          1  2.0    0.000    0.000  209.551  209.573
 scf_env_do_scf                       1  3.0    0.000    0.000  106.505  106.505
 qs_ks_update_qs_env                  5  5.0    0.000    0.000  105.464  105.473
 rebuild_ks_matrix                    4  6.0    0.000    0.000  105.463  105.472
 qs_ks_build_kohn_sham_matrix         4  7.0    0.059    0.068  105.463  105.472
 hfx_ks_matrix                        4  8.0    0.001    0.001  105.014  105.018
 integrate_four_center                4  9.0    0.144    0.461  105.013  105.018
 mp2_main                             1  3.0    0.000    0.000  102.717  102.739
 mp2_gpw_main                         1  4.0    0.031    0.044  101.681  101.704
 integrate_four_center_main           4 10.0    0.095    0.512   96.574   99.164
 integrate_four_center_bin          265 11.0   96.480   99.159   96.480   99.159
 init_scf_loop                        1  4.0    0.000    0.000   92.304   92.305
 mp2_ri_gpw_compute_in                1  5.0    0.064    0.064   74.575   75.706
 mp2_ri_gpw_compute_in_loop           1  6.0    0.002    0.002   54.099   55.230
 mp2_eri_3c_integrate_gpw            91  7.0    0.144    0.159   41.652   46.809
 integrate_v_rspace                  95  8.0    0.398    0.570   28.087   33.089
 pw_transfer                       2240 10.6    0.144    0.173   29.982   30.443
 fft_wrap_pw1pw2                   1868 11.4    0.018    0.022   28.985   29.386
 ao_to_mo_and_store_B_mult_1         91  7.0   10.773   28.995   10.773   28.995
 mp2_ri_gpw_compute_en                1  5.0    0.057    0.075   26.958   28.797
 grid_integrate_task_list            95  9.0   23.313   28.506   23.313   28.506
 fft_wrap_pw1pw2_100                730 12.4    1.289    1.474   26.672   27.038
 mp2_ri_gpw_compute_en_RI_loop        1  6.0    1.834    1.907   25.184   25.193
 get_2c_integrals                     1  6.0    0.000    0.000   20.389   20.413
 compute_2c_integrals                 1  7.0    0.002    0.003   19.377   19.379
 compute_2c_integrals_loop_lm         1  8.0    0.001    0.001   18.917   19.216
 mp2_eri_2c_integrate_gpw             1  9.0    1.738    1.893   18.916   19.215
 fft3d_s                           1823 13.4   18.426   18.930   18.439   18.943
 scf_env_do_scf_inner_loop            4  4.0    0.000    0.000   14.199   14.199
 calculate_wavefunction              91  8.0    2.013    2.042    9.745   10.002
 mp2_ri_gpw_compute_en_expansio     172  7.0    0.558    0.594    8.823    9.463
 potential_pw2rs                    186 10.0    0.033    0.035    8.711    9.325
 local_gemm                         172  8.0    8.265    8.890    8.265    8.890
 mp2_ri_gpw_compute_en_comm          22  7.0    0.496    0.516    8.133    8.838
 mp2_eri_2c_integrate_gpw_pot_l      91 10.0    0.001    0.001    8.274    8.631
 calc_potential_gpw                 182  9.5    0.002    0.002    7.931    8.155
 collocate_single_gaussian           91 10.0    0.017    0.022    7.902    8.102
 mp_sendrecv_dm3                   2068  8.0    6.173    6.850    6.173    6.850
 mp2_ri_gpw_compute_en_ener         172  7.0    6.349    6.433    6.349    6.433
 pw_gather_s                        912 13.2    4.928    5.568    4.928    5.568
 mp_sync                             37 10.5    3.073    5.158    3.073    5.158
 pw_scatter_s                       910 13.7    3.935    4.287    3.935    4.287
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="20", plot="h2o_32_ri_rpa_mp2", label="RI-MP2 (8n/6r/2t)", y=101.676428, yerr=0.000000
PlotPoint: name="21", plot="h2o_32_ri_rpa_mp2_mem", label="RI-MP2 (8n/6r/2t)", y=1514.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/03/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     29.277748E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               5055360       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      29.1
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             451.518464E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 9436608
 MPI messages size (bytes):
  total size                       333.233553E+09
  min size                           0.000000E+00
  max size                         315.840000E+03
  average size                      35.312852E+03
 MPI breakdown and total messages size (bytes):
             size <=      128             4913240                        0
       128 < size <=     8192             1155432               9465298944
      8192 < size <=    32768             1984512              54190407680
     32768 < size <=   131072              551296              42776657920
    131072 < size <=  4194304              832128             226802306368
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3683                  62379.
 MP_Allreduce        10329                    270.
 MP_Sync               530
 MP_Alltoall          2083
 MP_SendRecv         22610                   5520.
 MP_ISendRecv        22610                   5520.
 MP_Wait             37876
 MP_comm_split          50
 MP_ISend            20771                  42672.
 MP_IRecv            20771                  42672.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.013    0.040   57.673   57.674
 qs_mol_dyn_low                       1  2.0    0.004    0.012   56.895   57.188
 qs_forces                           11  3.9    0.003    0.004   56.463   56.464
 qs_energies                         11  4.9    0.001    0.002   54.856   54.871
 scf_env_do_scf                      11  5.9    0.001    0.004   48.083   48.083
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.007   45.685   45.685
 qs_scf_new_mos                     108  7.5    0.000    0.001   34.504   34.812
 qs_scf_loop_do_ot                  108  8.5    0.000    0.001   34.503   34.812
 dbcsr_multiply_generic            2286 12.5    0.094    0.098   34.156   34.564
 ot_scf_mini                        108  9.5    0.002    0.002   32.774   32.964
 velocity_verlet                     10  3.0    0.003    0.010   28.133   28.143
 multiply_cannon                   2286 13.5    0.186    0.195   26.271   28.021
 multiply_cannon_loop              2286 14.5    1.480    1.567   25.372   27.104
 ot_mini                            108 10.5    0.001    0.001   19.535   19.781
 qs_ot_get_derivative               108 11.5    0.001    0.001   16.516   16.692
 mp_waitall_1                    245248 16.5    8.780   14.952    8.780   14.952
 multiply_cannon_metrocomm3       54864 15.5    0.068    0.073    5.984   12.962
 multiply_cannon_multrec          54864 15.5    4.238    6.537    7.764   11.216
 qs_ot_get_p                        119 10.4    0.001    0.001    8.539    8.817
 rebuild_ks_matrix                  119  8.3    0.000    0.000    8.497    8.620
 qs_ks_build_kohn_sham_matrix       119  9.3    0.013    0.032    8.497    8.620
 qs_ks_update_qs_env                119  7.6    0.001    0.001    7.468    7.584
 mp_sum_l                          7287 12.8    5.345    7.157    5.345    7.157
 multiply_cannon_sync_h2d         54864 15.5    5.869    6.905    5.869    6.905
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    5.832    6.261
 qs_ot_p2m_diag                      50 11.0    0.004    0.006    5.625    5.662
 init_scf_run                        11  5.9    0.000    0.001    5.409    5.410
 scf_env_initial_rho_setup           11  6.9    0.000    0.000    5.409    5.410
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    5.316    5.402
 sum_up_and_integrate               119 10.3    0.012    0.014    4.976    4.996
 integrate_v_rspace                 119 11.3    0.002    0.003    4.964    4.985
 qs_rho_update_rho_low              119  7.7    0.000    0.001    4.866    4.981
 calculate_rho_elec                 119  8.7    0.012    0.017    4.866    4.980
 dbcsr_mm_accdrv_process          76910 16.1    1.129    1.788    3.447    4.806
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    4.790    4.790
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    4.501    4.503
 cp_fm_redistribute_end              50 14.0    2.292    4.464    2.304    4.470
 cp_fm_diag_elpa_base                50 14.0    2.155    4.334    2.162    4.345
 rs_pw_transfer                     974 11.9    0.012    0.013    3.276    3.404
 calculate_dm_sparse                119  9.5    0.000    0.001    3.114    3.276
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.935    3.140
 apply_single                       119 13.6    0.000    0.000    2.934    3.140
 calculate_first_density_matrix       1  7.0    0.000    0.001    2.967    2.974
 jit_kernel_multiply                 13 15.8    2.259    2.931    2.259    2.931
 density_rs2pw                      119  9.7    0.004    0.004    2.754    2.861
 multiply_cannon_metrocomm1       54864 15.5    0.053    0.058    1.777    2.790
 ot_diis_step                       108 11.5    0.006    0.006    2.742    2.743
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.638    2.640
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    2.469    2.536
 init_scf_loop                       11  6.9    0.000    0.000    2.369    2.370
 wfi_extrapolate                     11  7.9    0.001    0.001    2.360    2.361
 pw_transfer                       1439 11.6    0.052    0.057    2.241    2.347
 potential_pw2rs                    119 12.3    0.004    0.004    2.276    2.325
 make_m2s                          4572 13.5    0.053    0.055    2.234    2.321
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.262    2.313
 fft_wrap_pw1pw2                   1201 12.6    0.007    0.007    2.166    2.273
 make_images                       4572 14.5    0.133    0.138    2.152    2.238
 acc_transpose_blocks             54864 15.5    0.228    0.250    1.749    2.207
 grid_integrate_task_list           119 12.3    2.016    2.138    2.016    2.138
 mp_sum_d                          4135 12.0    1.393    1.976    1.393    1.976
 fft3d_ps                          1201 14.6    0.373    0.484    1.808    1.919
 fft_wrap_pw1pw2_140                487 13.2    0.185    0.203    1.646    1.753
 mp_alltoall_d11v                  2130 13.8    1.556    1.689    1.556    1.689
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.567    1.591
 grid_collocate_task_list           119  9.7    1.351    1.440    1.351    1.440
 mp_waitany                       12084 13.8    1.298    1.426    1.298    1.426
 prepare_preconditioner              11  7.9    0.000    0.000    1.199    1.241
 make_preconditioner                 11  8.9    0.000    0.000    1.199    1.241
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.139    1.189
 dbcsr_dot_sd                      1205 11.9    0.049    0.060    0.786    1.188
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="100", plot="h2o_64_md", label="(8n/12r/1t)", y=57.674000, yerr=0.000000
PlotPoint: name="101", plot="h2o_64_md_mem", label="(8n/12r/1t)", y=430.545455, yerr=0.655555
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/04/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     57.173320E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3066240       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      47.9
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             487.514112E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 2194560
 MPI messages size (bytes):
  total size                       310.646604E+09
  min size                           0.000000E+00
  max size                           1.145520E+06
  average size                     141.553031E+03
 MPI breakdown and total messages size (bytes):
             size <=      128              724648                        0
       128 < size <=     8192              253512               2076770304
      8192 < size <=    32768              281952               4619501568
     32768 < size <=   131072              494448              39143342080
    131072 < size <=  4194304              440000             264807943488
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3672                  62658.
 MP_Allreduce        10306                    303.
 MP_Sync                54
 MP_Alltoall          2060                 913371.
 MP_SendRecv         16779                  37093.
 MP_ISendRecv        16779                  37093.
 MP_Wait             23539
 MP_comm_split          50
 MP_ISend             5720                 128509.
 MP_IRecv             5720                 128509.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.019    0.060   44.341   44.343
 qs_mol_dyn_low                       1  2.0    0.003    0.004   43.600   43.642
 qs_forces                           11  3.9    0.002    0.003   43.299   43.299
 qs_energies                         11  4.9    0.009    0.059   41.507   41.517
 scf_env_do_scf                      11  5.9    0.001    0.002   35.409   35.412
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.007   32.547   32.547
 dbcsr_multiply_generic            2286 12.5    0.102    0.105   23.529   24.125
 qs_scf_new_mos                     108  7.5    0.001    0.001   22.219   22.477
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   22.218   22.477
 ot_scf_mini                        108  9.5    0.003    0.004   21.265   21.440
 velocity_verlet                     10  3.0    0.001    0.001   20.365   20.371
 multiply_cannon                   2286 13.5    0.212    0.227   17.434   19.359
 multiply_cannon_loop              2286 14.5    0.901    0.975   16.112   18.245
 ot_mini                            108 10.5    0.001    0.001   13.079   13.319
 mp_waitall_1                    200699 16.5    6.793   12.381    6.793   12.381
 qs_ot_get_derivative               108 11.5    0.001    0.001   10.501   10.675
 multiply_cannon_metrocomm3       27432 15.5    0.067    0.069    4.616   10.490
 multiply_cannon_multrec          27432 15.5    1.980    4.440    6.287    9.367
 rebuild_ks_matrix                  119  8.3    0.000    0.000    7.883    8.064
 qs_ks_build_kohn_sham_matrix       119  9.3    0.012    0.013    7.883    8.063
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.973    7.137
 dbcsr_mm_accdrv_process          47894 16.0    3.165    5.504    4.237    6.319
 qs_ot_get_p                        119 10.4    0.001    0.001    5.126    5.373
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    3.934    5.118
 sum_up_and_integrate               119 10.3    0.024    0.028    4.816    4.823
 mp_sum_l                          7287 12.8    2.732    4.815    2.732    4.815
 integrate_v_rspace                 119 11.3    0.002    0.003    4.791    4.800
 init_scf_run                        11  5.9    0.000    0.001    4.781    4.781
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    4.781    4.781
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    3.448    4.654
 apply_single                       119 13.6    0.000    0.000    3.448    4.654
 qs_rho_update_rho_low              119  7.7    0.001    0.001    4.428    4.481
 calculate_rho_elec                 119  8.7    0.021    0.024    4.428    4.481
 rs_pw_transfer                     974 11.9    0.010    0.011    3.558    4.065
 qs_ot_p2m_diag                      50 11.0    0.009    0.013    3.464    3.492
 make_m2s                          4572 13.5    0.052    0.054    2.949    3.277
 density_rs2pw                      119  9.7    0.004    0.004    2.689    3.225
 make_images                       4572 14.5    0.201    0.238    2.860    3.186
 calculate_first_density_matrix       1  7.0    0.004    0.032    3.103    3.105
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    2.981    2.982
 init_scf_loop                       11  6.9    0.001    0.004    2.836    2.839
 multiply_cannon_sync_h2d         27432 15.5    2.145    2.729    2.145    2.729
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.587    2.589
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    2.564    2.565
 jit_kernel_multiply                 11 16.0    1.021    2.555    1.021    2.555
 cp_fm_redistribute_end              50 14.0    1.297    2.525    1.302    2.528
 potential_pw2rs                    119 12.3    0.006    0.006    2.492    2.518
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    2.408    2.498
 ot_diis_step                       108 11.5    0.011    0.011    2.496    2.497
 cp_fm_diag_elpa_base                50 14.0    1.187    2.411    1.218    2.450
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.310    2.358
 calculate_dm_sparse                119  9.5    0.000    0.001    2.156    2.240
 pw_transfer                       1439 11.6    0.066    0.069    2.159    2.195
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    2.067    2.105
 acc_transpose_blocks             27432 15.5    0.110    0.114    1.242    1.929
 make_images_data                  4572 15.5    0.045    0.051    1.314    1.921
 grid_integrate_task_list           119 12.3    1.840    1.917    1.840    1.917
 prepare_preconditioner              11  7.9    0.000    0.000    1.830    1.858
 make_preconditioner                 11  8.9    0.000    0.001    1.830    1.858
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.714    1.770
 hybrid_alltoall_any               4725 16.4    0.051    0.111    1.137    1.687
 fft3d_ps                          1201 14.6    0.520    0.580    1.617    1.653
 mp_allgather_i34                  2286 14.5    0.756    1.621    0.756    1.621
 wfi_extrapolate                     11  7.9    0.001    0.001    1.614    1.614
 fft_wrap_pw1pw2_140                487 13.2    0.203    0.214    1.507    1.547
 mp_alltoall_d11v                  2130 13.8    1.387    1.515    1.387    1.515
 grid_collocate_task_list           119  9.7    1.270    1.396    1.270    1.396
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.295    1.340
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.318    1.332
 acc_transpose_blocks_kernels     27432 16.5    0.182    0.269    0.694    1.294
 mp_sum_d                          4135 12.0    0.709    1.143    0.709    1.143
 rs_pw_transfer_RS2PW_140           130 11.5    0.169    0.190    0.596    1.125
 rs_pw_transfer_PW2RS_50            119 14.3    0.583    0.602    0.970    1.118
 make_images_sizes                 4572 15.5    0.005    0.005    0.802    1.112
 mp_alltoall_i44                   4572 16.5    0.797    1.107    0.797    1.107
 mp_waitany                        5720 13.7    0.537    1.065    0.537    1.065
 jit_kernel_transpose                 5 15.5    0.512    1.025    0.512    1.025
 qs_energies_init_hamiltonians       11  5.9    0.000    0.002    0.994    0.994
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.969    0.985
 mp_alltoall_z22v                  1201 16.6    0.824    0.907    0.824    0.907
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="102", plot="h2o_64_md", label="(8n/6r/2t)", y=44.343000, yerr=0.000000
PlotPoint: name="103", plot="h2o_64_md_mem", label="(8n/6r/2t)", y=464.454545, yerr=1.157084
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/05/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                     59.051995E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3143552       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      46.8
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             521.035776E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                  950976
 MPI messages size (bytes):
  total size                       203.844256E+09
  min size                           0.000000E+00
  max size                           1.638400E+06
  average size                     214.352688E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192              253512               2076770304
      8192 < size <=    32768              179424               2939682816
     32768 < size <=   131072              181440              14863564800
    131072 < size <=  4194304              330176             183964913216
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63490.
 MP_Allreduce        10155                    305.
 MP_Sync                54
 MP_Alltoall          1821                1607886.
 MP_SendRecv         11067                  57667.
 MP_ISendRecv        11067                  57667.
 MP_Wait             21987
 MP_ISend             9880                  92618.
 MP_IRecv             9880                  92618.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.121    0.456   37.795   37.796
 qs_mol_dyn_low                       1  2.0    0.004    0.037   36.645   36.658
 qs_forces                           11  3.9    0.006    0.034   36.548   36.550
 qs_energies                         11  4.9    0.003    0.015   34.846   34.850
 scf_env_do_scf                      11  5.9    0.001    0.002   28.958   28.958
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.006   25.950   25.950
 dbcsr_multiply_generic            2286 12.5    0.097    0.106   17.921   18.014
 velocity_verlet                     10  3.0    0.006    0.010   17.083   17.086
 qs_scf_new_mos                     108  7.5    0.001    0.001   16.501   16.523
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   16.500   16.522
 ot_scf_mini                        108  9.5    0.003    0.004   15.736   15.747
 multiply_cannon                   2286 13.5    0.196    0.202   13.860   14.600
 multiply_cannon_loop              2286 14.5    0.636    0.665   12.963   13.747
 ot_mini                            108 10.5    0.001    0.001    9.377    9.397
 qs_ot_get_derivative               108 11.5    0.001    0.001    7.828    7.838
 multiply_cannon_multrec          18288 15.5    1.957    2.904    7.333    7.642
 rebuild_ks_matrix                  119  8.3    0.000    0.000    7.075    7.091
 qs_ks_build_kohn_sham_matrix       119  9.3    0.012    0.013    7.075    7.091
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.256    6.272
 dbcsr_mm_accdrv_process          38222 16.0    4.131    6.086    5.291    6.143
 mp_waitall_1                    158411 16.6    3.402    4.630    3.402    4.630
 sum_up_and_integrate               119 10.3    0.032    0.032    4.601    4.605
 integrate_v_rspace                 119 11.3    0.003    0.003    4.569    4.577
 init_scf_run                        11  5.9    0.000    0.001    4.475    4.475
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    4.475    4.475
 qs_rho_update_rho_low              119  7.7    0.001    0.001    4.224    4.233
 calculate_rho_elec                 119  8.7    0.031    0.031    4.224    4.232
 qs_ot_get_p                        119 10.4    0.001    0.001    4.078    4.096
 rs_pw_transfer                     974 11.9    0.009    0.010    3.381    3.619
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    2.936    3.591
 calculate_first_density_matrix       1  7.0    0.003    0.016    3.052    3.053
 init_scf_loop                       11  6.9    0.007    0.031    2.985    2.986
 density_rs2pw                      119  9.7    0.004    0.004    2.674    2.927
 qs_ot_p2m_diag                      50 11.0    0.012    0.013    2.807    2.812
 make_m2s                          4572 13.5    0.044    0.045    2.389    2.556
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    2.507    2.507
 make_images                       4572 14.5    0.192    0.203    2.305    2.470
 jit_kernel_multiply                 10 15.8    1.109    2.458    1.109    2.458
 multiply_cannon_metrocomm3       18288 15.5    0.044    0.045    1.434    2.452
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.109    2.446
 apply_single                       119 13.6    0.000    0.000    2.109    2.446
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.350    2.354
 potential_pw2rs                    119 12.3    0.007    0.008    2.314    2.331
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    2.178    2.188
 cp_fm_diag_elpa_base                50 14.0    2.148    2.164    2.175    2.185
 pw_transfer                       1439 11.6    0.066    0.070    2.153    2.167
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.109    2.121
 prepare_preconditioner              11  7.9    0.000    0.000    2.088    2.090
 make_preconditioner                 11  8.9    0.000    0.001    2.088    2.090
 mp_sum_l                          7287 12.8    1.605    2.084    1.605    2.084
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    2.060    2.074
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.923    2.016
 calculate_dm_sparse                119  9.5    0.000    0.001    1.981    2.003
 grid_integrate_task_list           119 12.3    1.801    1.895    1.801    1.895
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    1.764    1.770
 multiply_cannon_sync_h2d         18288 15.5    1.394    1.595    1.394    1.595
 fft3d_ps                          1201 14.6    0.528    0.544    1.548    1.571
 ot_diis_step                       108 11.5    0.011    0.011    1.522    1.522
 fft_wrap_pw1pw2_140                487 13.2    0.253    0.261    1.505    1.519
 grid_collocate_task_list           119  9.7    1.242    1.385    1.242    1.385
 wfi_extrapolate                     11  7.9    0.001    0.001    1.365    1.365
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.327    1.332
 acc_transpose_blocks             18288 15.5    0.076    0.077    1.276    1.292
 make_images_data                  4572 15.5    0.045    0.049    1.043    1.286
 hybrid_alltoall_any               4725 16.4    0.055    0.115    0.886    1.081
 qs_energies_init_hamiltonians       11  5.9    0.001    0.003    1.026    1.028
 mp_alltoall_d11v                  2130 13.8    0.865    1.006    0.865    1.006
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    0.946    0.971
 cp_fm_cholesky_invert               11 10.9    0.912    0.916    0.912    0.916
 mp_waitany                        9880 13.7    0.631    0.905    0.631    0.905
 mp_alltoall_z22v                  1201 16.6    0.844    0.905    0.844    0.905
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.893    0.896
 rs_pw_transfer_RS2PW_140           130 11.5    0.121    0.125    0.615    0.868
 make_images_sizes                 4572 15.5    0.005    0.005    0.608    0.859
 mp_alltoall_i44                   4572 16.5    0.603    0.854    0.603    0.854
 acc_transpose_blocks_kernels     18288 16.5    0.210    0.217    0.825    0.835
 mp_sendrecv_dv                   11067 12.7    0.791    0.806    0.791    0.806
 dbcsr_complete_redistribute        329 12.2    0.105    0.169    0.656    0.787
 rs_pw_transfer_PW2RS_50            119 14.3    0.399    0.410    0.720    0.768
 mp_allgather_i34                  2286 14.5    0.356    0.760    0.356    0.760
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="104", plot="h2o_64_md", label="(8n/4r/3t)", y=37.796000, yerr=0.000000
PlotPoint: name="105", plot="h2o_64_md_mem", label="(8n/4r/3t)", y=495.636364, yerr=1.871933
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/06/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    114.044384E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               3805952       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0      38.6
 marketing flops                     2.107592E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             562.028544E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1042416
 MPI messages size (bytes):
  total size                       150.443262E+09
  min size                           0.000000E+00
  max size                           1.188816E+06
  average size                     144.321719E+03
 MPI breakdown and total messages size (bytes):
             size <=      128              228256                        0
       128 < size <=     8192              126888               1039466496
      8192 < size <=    32768              191472               3137077248
     32768 < size <=   131072              295800              25899827200
    131072 < size <=  4194304              200000             120367247040
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63489.
 MP_Allreduce        10154                    346.
 MP_Sync                54
 MP_Alltoall          1582                2412273.
 MP_SendRecv          8211                  74133.
 MP_ISendRecv         8211                  74133.
 MP_Wait             16271
 MP_ISend             7280                 135929.
 MP_IRecv             7280                 135929.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.018    0.027   38.893   38.894
 qs_mol_dyn_low                       1  2.0    0.003    0.004   38.645   38.653
 qs_forces                           11  3.9    0.002    0.003   38.587   38.587
 qs_energies                         11  4.9    0.001    0.001   36.776   36.782
 scf_env_do_scf                      11  5.9    0.000    0.001   30.974   30.976
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.006   27.176   27.177
 dbcsr_multiply_generic            2286 12.5    0.100    0.104   19.438   19.584
 velocity_verlet                     10  3.0    0.002    0.002   19.374   19.376
 qs_scf_new_mos                     108  7.5    0.001    0.001   17.688   17.747
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   17.687   17.746
 ot_scf_mini                        108  9.5    0.002    0.003   16.723   16.774
 multiply_cannon                   2286 13.5    0.222    0.255   15.263   15.756
 multiply_cannon_loop              2286 14.5    0.942    0.975   14.234   14.599
 ot_mini                            108 10.5    0.001    0.001    9.921    9.985
 multiply_cannon_multrec          27432 15.5    2.475    3.203    9.075    9.538
 qs_ot_get_derivative               108 11.5    0.001    0.001    8.057    8.110
 dbcsr_mm_accdrv_process          47916 15.9    5.675    7.727    6.510    7.781
 rebuild_ks_matrix                  119  8.3    0.000    0.000    7.220    7.287
 qs_ks_build_kohn_sham_matrix       119  9.3    0.012    0.013    7.220    7.286
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.418    6.476
 init_scf_run                        11  5.9    0.000    0.001    4.421    4.421
 scf_env_initial_rho_setup           11  6.9    0.000    0.000    4.421    4.421
 sum_up_and_integrate               119 10.3    0.036    0.039    4.349    4.358
 integrate_v_rspace                 119 11.3    0.003    0.003    4.313    4.322
 qs_ot_get_p                        119 10.4    0.001    0.001    4.081    4.156
 qs_rho_update_rho_low              119  7.7    0.001    0.001    4.109    4.137
 calculate_rho_elec                 119  8.7    0.040    0.046    4.108    4.137
 init_scf_loop                       11  6.9    0.000    0.000    3.776    3.776
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    2.920    3.316
 mp_waitall_1                    137007 16.6    2.497    3.048    2.497    3.048
 rs_pw_transfer                     974 11.9    0.009    0.010    2.813    3.011
 prepare_preconditioner              11  7.9    0.000    0.000    2.863    2.872
 make_preconditioner                 11  8.9    0.000    0.000    2.863    2.872
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.868    2.870
 make_m2s                          4572 13.5    0.054    0.055    2.667    2.802
 make_full_inverse_cholesky          11  9.9    0.000    0.000    2.460    2.792
 make_images                       4572 14.5    0.270    0.333    2.561    2.696
 density_rs2pw                      119  9.7    0.004    0.004    2.466    2.659
 qs_ot_p2m_diag                      50 11.0    0.015    0.023    2.594    2.603
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.317    2.318
 pw_transfer                       1439 11.6    0.066    0.070    2.235    2.276
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    2.252    2.252
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.070    2.204
 apply_single                       119 13.6    0.000    0.000    2.070    2.203
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    2.143    2.187
 calculate_dm_sparse                119  9.5    0.000    0.000    2.065    2.140
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.118    2.136
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    2.097    2.124
 potential_pw2rs                    119 12.3    0.008    0.009    2.053    2.065
 jit_kernel_multiply                  9 16.1    0.775    1.971    0.775    1.971
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.923    1.932
 grid_integrate_task_list           119 12.3    1.833    1.930    1.833    1.930
 cp_fm_diag_elpa_base                50 14.0    1.879    1.896    1.920    1.929
 ot_diis_step                       108 11.5    0.012    0.012    1.821    1.822
 fft_wrap_pw1pw2_140                487 13.2    0.288    0.302    1.696    1.742
 acc_transpose_blocks             27432 15.5    0.111    0.113    1.640    1.735
 fft3d_ps                          1201 14.6    0.558    0.605    1.581    1.610
 wfi_extrapolate                     11  7.9    0.001    0.001    1.485    1.485
 mp_sum_l                          7287 12.8    1.078    1.397    1.078    1.397
 grid_collocate_task_list           119  9.7    1.248    1.382    1.248    1.382
 multiply_cannon_metrocomm3       27432 15.5    0.038    0.039    0.804    1.285
 make_images_data                  4572 15.5    0.044    0.048    1.073    1.273
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.235    1.246
 cp_fm_upper_to_full                 72 14.2    0.830    1.184    0.830    1.184
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.157    1.173
 hybrid_alltoall_any               4725 16.4    0.062    0.151    0.883    1.157
 dbcsr_complete_redistribute        329 12.2    0.120    0.145    0.877    1.154
 acc_transpose_blocks_kernels     27432 16.5    0.269    0.277    1.000    1.102
 qs_energies_init_hamiltonians       11  5.9    0.000    0.001    1.091    1.092
 multiply_cannon_sync_h2d         27432 15.5    0.991    1.042    0.991    1.042
 mp_alltoall_d11v                  2130 13.8    0.889    1.016    0.889    1.016
 cp_fm_cholesky_invert               11 10.9    0.967    0.971    0.967    0.971
 mp_alltoall_z22v                  1201 16.6    0.907    0.944    0.907    0.944
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.893    0.899
 copy_fm_to_dbcsr                   176 11.2    0.001    0.001    0.628    0.899
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.000    0.798    0.874
 jit_kernel_transpose                 5 15.6    0.732    0.839    0.732    0.839
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="106", plot="h2o_64_md", label="(8n/3r/4t)", y=38.894000, yerr=0.000000
PlotPoint: name="107", plot="h2o_64_md_mem", label="(8n/3r/4t)", y=533.363636, yerr=4.333227
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/07/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    117.977176E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               1384136       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     106.2
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             608.817152E+06
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  219456
 MPI messages size (bytes):
  total size                        97.042514E+09
  min size                           0.000000E+00
  max size                           3.276800E+06
  average size                     442.195750E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768              101892               3336634368
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304              116112              93705670464
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         8156                     20.
 MP_Alltoall          8655                  64935.
 MP_ISend            36532                 168375.
 MP_IRecv            36532                 168349.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63488.
 MP_Allreduce        10154                    346.
 MP_Sync                54
 MP_Alltoall          1582                3682667.
 MP_SendRecv          5355                  94533.
 MP_ISendRecv         5355                  94533.
 MP_Wait             11335
 MP_ISend             5200                 225425.
 MP_IRecv             5200                 225425.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.012    0.031   31.542   31.542
 qs_mol_dyn_low                       1  2.0    0.003    0.003   31.321   31.329
 qs_forces                           11  3.9    0.002    0.002   31.263   31.264
 qs_energies                         11  4.9    0.001    0.001   29.483   29.486
 scf_env_do_scf                      11  5.9    0.000    0.001   24.121   24.121
 scf_env_do_scf_inner_loop          108  6.5    0.002    0.006   21.282   21.283
 velocity_verlet                     10  3.0    0.002    0.002   15.883   15.886
 dbcsr_multiply_generic            2286 12.5    0.100    0.105   13.595   13.691
 qs_scf_new_mos                     108  7.5    0.001    0.001   12.440   12.473
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   12.439   12.472
 ot_scf_mini                        108  9.5    0.002    0.002   11.705   11.731
 multiply_cannon                   2286 13.5    0.231    0.240   10.419   11.146
 multiply_cannon_loop              2286 14.5    0.330    0.341    9.429    9.641
 rebuild_ks_matrix                  119  8.3    0.000    0.000    6.472    6.497
 qs_ks_build_kohn_sham_matrix       119  9.3    0.012    0.013    6.472    6.497
 ot_mini                            108 10.5    0.001    0.001    6.443    6.474
 multiply_cannon_multrec           9144 15.5    1.604    1.858    6.086    6.358
 qs_ks_update_qs_env                119  7.6    0.001    0.001    5.778    5.801
 qs_ot_get_derivative               108 11.5    0.001    0.001    5.072    5.098
 dbcsr_mm_accdrv_process          12550 15.8    3.161    4.093    4.381    4.457
 sum_up_and_integrate               119 10.3    0.037    0.042    4.048    4.053
 qs_rho_update_rho_low              119  7.7    0.001    0.001    4.019    4.026
 calculate_rho_elec                 119  8.7    0.060    0.061    4.018    4.025
 integrate_v_rspace                 119 11.3    0.003    0.003    4.011    4.016
 init_scf_run                        11  5.9    0.000    0.001    3.849    3.849
 scf_env_initial_rho_setup           11  6.9    0.000    0.000    3.849    3.849
 qs_ot_get_p                        119 10.4    0.001    0.001    3.353    3.394
 init_scf_loop                       11  6.9    0.000    0.000    2.815    2.817
 mp_waitall_1                    115863 16.7    2.242    2.803    2.242    2.803
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.543    2.544
 density_rs2pw                      119  9.7    0.004    0.004    2.290    2.439
 rs_pw_transfer                     974 11.9    0.008    0.009    2.272    2.420
 make_m2s                          4572 13.5    0.034    0.035    2.198    2.383
 make_images                       4572 14.5    0.268    0.302    2.109    2.294
 pw_transfer                       1439 11.6    0.066    0.069    2.270    2.288
 qs_ot_p2m_diag                      50 11.0    0.022    0.023    2.244    2.248
 fft_wrap_pw1pw2                   1201 12.6    0.008    0.008    2.176    2.194
 jit_kernel_multiply                 10 15.6    1.183    2.142    1.183    2.142
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    2.013    2.014
 prepare_preconditioner              11  7.9    0.000    0.000    1.995    2.001
 make_preconditioner                 11  8.9    0.000    0.000    1.995    2.001
 grid_integrate_task_list           119 12.3    1.862    1.940    1.862    1.940
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    1.911    1.913
 calculate_dm_sparse                119  9.5    0.000    0.000    1.885    1.906
 make_full_inverse_cholesky          11  9.9    0.000    0.000    1.865    1.900
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    1.780    1.795
 fft_wrap_pw1pw2_140                487 13.2    0.366    0.377    1.729    1.748
 potential_pw2rs                    119 12.3    0.010    0.011    1.728    1.737
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    1.721    1.731
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.657    1.664
 cp_fm_diag_elpa_base                50 14.0    1.625    1.645    1.655    1.662
 fft3d_ps                          1201 14.6    0.563    0.575    1.508    1.528
 grid_collocate_task_list           119  9.7    1.305    1.428    1.305    1.428
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    1.362    1.374
 ot_diis_step                       108 11.5    0.012    0.013    1.345    1.346
 hybrid_alltoall_any               4725 16.4    0.063    0.177    0.964    1.325
 make_images_data                  4572 15.5    0.039    0.042    1.015    1.319
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    1.253    1.261
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    1.250    1.250
 wfi_extrapolate                     11  7.9    0.001    0.001    1.243    1.243
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    1.210    1.239
 apply_single                       119 13.6    0.000    0.000    1.210    1.238
 mp_alltoall_d11v                  2130 13.8    0.926    1.059    0.926    1.059
 acc_transpose_blocks              9144 15.5    0.038    0.039    1.037    1.051
 cp_fm_cholesky_invert               11 10.9    1.025    1.028    1.025    1.028
 build_core_hamiltonian_matrix_      11  4.9    0.000    0.001    0.872    0.926
 mp_alltoall_z22v                  1201 16.6    0.819    0.858    0.819    0.858
 mp_allgather_i34                  2286 14.5    0.323    0.836    0.323    0.836
 multiply_cannon_metrocomm1        9144 15.5    0.022    0.023    0.608    0.825
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.819    0.821
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    0.796    0.805
 acc_transpose_blocks_kernels      9144 16.5    0.117    0.120    0.792    0.802
 multiply_cannon_sync_h2d          9144 15.5    0.710    0.780    0.710    0.780
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    0.694    0.749
 make_images_sizes                 4572 15.5    0.005    0.005    0.498    0.725
 mp_alltoall_i44                   4572 16.5    0.494    0.720    0.494    0.720
 mp_sum_l                          7287 12.8    0.539    0.705    0.539    0.705
 jit_kernel_transpose                 5 15.6    0.675    0.685    0.675    0.685
 multiply_cannon_metrocomm3        9144 15.5    0.019    0.019    0.372    0.676
 dbcsr_complete_redistribute        329 12.2    0.170    0.183    0.626    0.656
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="108", plot="h2o_64_md", label="(8n/2r/6t)", y=31.542000, yerr=0.000000
PlotPoint: name="109", plot="h2o_64_md_mem", label="(8n/2r/6t)", y=574.909091, yerr=7.513074
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/08/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32          26877100032       0.0%      0.0%    100.0%
 flops     9 x     9 x    32          44168260608       0.0%      0.0%    100.0%
 flops    22 x     9 x    32          53835724800       0.0%      0.0%    100.0%
 flops     9 x    22 x    32          53885500416       0.0%      0.0%    100.0%
 flops    32 x    32 x     9          63568871424       0.0%      0.0%    100.0%
 flops    22 x    22 x    32          67007283200       0.0%      0.0%    100.0%
 flops    32 x    32 x    22          77695287296       0.0%      0.0%    100.0%
 flops     9 x    32 x    32          78422999040       0.0%      0.0%    100.0%
 flops    22 x    32 x    32          95850332160       0.0%      0.0%    100.0%
 flops     9 x    32 x     9         266263676928       0.0%      0.0%    100.0%
 flops    22 x    32 x     9         326697440256       0.0%      0.0%    100.0%
 flops     9 x    32 x    22         326697440256       0.0%      0.0%    100.0%
 flops    22 x    32 x    22         399918497792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                         1.880888E+12       0.0%      0.0%    100.0%
 flops max/rank                    235.585836E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          146984760       0.0%      0.0%    100.0%
 number of processed stacks               1388964       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     105.8
 marketing flops                     2.107587E+12
 -------------------------------------------------------------------------------
 # multiplications                           2286
 max memory usage/rank             750.735360E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   91440
 MPI messages size (bytes):
  total size                        85.748679E+09
  min size                           0.000000E+00
  max size                           6.553600E+06
  average size                     937.758938E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768               21148                692256768
     32768 < size <=   131072               19224               1259864064
    131072 < size <=  4194304               41040              21941452800
   4194304 < size <= 16777216                9456              61855174464
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3622                  63723.
 MP_Allreduce        10154                    429.
 MP_Sync                54
 MP_Alltoall          1582                7383731.
 MP_SendRecv          2499                 189067.
 MP_ISendRecv         2499                 189067.
 MP_Wait              6399
 MP_ISend             3120                 546875.
 MP_IRecv             3120                 546875.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.022    0.052   47.507   47.508
 qs_mol_dyn_low                       1  2.0    0.003    0.005   46.451   46.460
 qs_forces                           11  3.9    0.020    0.022   46.391   46.392
 qs_energies                         11  4.9    0.001    0.001   44.308   44.313
 scf_env_do_scf                      11  5.9    0.001    0.001   36.046   36.047
 scf_env_do_scf_inner_loop          108  6.5    0.003    0.006   27.860   27.861
 velocity_verlet                     10  3.0    0.002    0.002   24.773   24.778
 dbcsr_multiply_generic            2286 12.5    0.100    0.103   20.818   21.284
 multiply_cannon                   2286 13.5    0.300    0.308   16.150   17.571
 qs_scf_new_mos                     108  7.5    0.001    0.001   17.410   17.500
 qs_scf_loop_do_ot                  108  8.5    0.001    0.001   17.409   17.499
 multiply_cannon_loop              2286 14.5    0.344    0.350   14.790   16.392
 ot_scf_mini                        108  9.5    0.002    0.002   16.264   16.367
 multiply_cannon_multrec           9144 15.5    3.509    5.197    9.439    9.979
 ot_mini                            108 10.5    0.001    0.001    9.747    9.887
 init_scf_loop                       11  6.9    0.000    0.000    8.158    8.159
 rebuild_ks_matrix                  119  8.3    0.000    0.000    7.729    7.908
 qs_ks_build_kohn_sham_matrix       119  9.3    0.013    0.013    7.728    7.907
 qs_ot_get_derivative               108 11.5    0.001    0.001    7.651    7.758
 dbcsr_mm_accdrv_process          12550 15.8    4.775    6.505    5.806    7.611
 prepare_preconditioner              11  7.9    0.000    0.000    7.145    7.160
 make_preconditioner                 11  8.9    0.000    0.000    7.145    7.160
 qs_ks_update_qs_env                119  7.6    0.001    0.001    6.955    7.119
 make_full_inverse_cholesky          11  9.9    0.000    0.000    5.725    7.028
 init_scf_run                        11  5.9    0.000    0.001    6.182    6.182
 scf_env_initial_rho_setup           11  6.9    0.000    0.000    6.182    6.182
 mp_waitall_1                     94719 16.7    3.963    6.027    3.963    6.027
 qs_rho_update_rho_low              119  7.7    0.001    0.001    4.635    4.770
 calculate_rho_elec                 119  8.7    0.118    0.121    4.634    4.769
 calculate_first_density_matrix       1  7.0    0.000    0.000    4.566    4.586
 cp_fm_upper_to_full                 72 14.2    3.172    4.527    3.172    4.527
 sum_up_and_integrate               119 10.3    0.064    0.066    4.371    4.377
 integrate_v_rspace                 119 11.3    0.003    0.003    4.306    4.313
 multiply_cannon_metrocomm3        9144 15.5    0.019    0.020    2.361    4.015
 qs_ot_get_p                        119 10.4    0.001    0.001    3.620    3.775
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    3.463    3.465
 qs_ot_get_derivative_taylor         59 13.0    0.001    0.001    2.763    3.342
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    3.004    3.156
 calculate_dm_sparse                119  9.5    0.000    0.000    3.056    3.109
 pw_transfer                       1439 11.6    0.069    0.069    2.972    2.982
 dbcsr_complete_redistribute        329 12.2    0.358    0.367    2.103    2.908
 make_m2s                          4572 13.5    0.037    0.038    2.705    2.891
 fft_wrap_pw1pw2                   1201 12.6    0.009    0.009    2.874    2.883
 make_images                       4572 14.5    0.353    0.385    2.585    2.772
 apply_preconditioner_dbcsr         119 12.6    0.000    0.000    2.376    2.683
 apply_single                       119 13.6    0.000    0.000    2.376    2.682
 copy_fm_to_dbcsr                   176 11.2    0.001    0.001    1.763    2.560
 density_rs2pw                      119  9.7    0.004    0.004    2.479    2.501
 mp_sum_l                          7287 12.8    1.434    2.392    1.434    2.392
 fft_wrap_pw1pw2_140                487 13.2    0.616    0.622    2.366    2.379
 qs_ot_p2m_diag                      50 11.0    0.043    0.044    2.223    2.225
 mp_alltoall_i22                    627 13.8    1.507    2.213    1.507    2.213
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    1.414    2.209
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.000    0.000    2.074    2.103
 grid_integrate_task_list           119 12.3    2.069    2.085    2.069    2.085
 ot_diis_step                       108 11.5    0.014    0.014    2.048    2.049
 cp_dbcsr_syevd                      50 12.0    0.003    0.003    1.926    1.927
 qs_ot_get_derivative_diag           49 12.0    0.001    0.001    1.864    1.914
 fft3d_ps                          1201 14.6    0.596    0.606    1.891    1.898
 rs_pw_transfer                     974 11.9    0.009    0.009    1.862    1.885
 jit_kernel_multiply                  6 15.7    1.004    1.799    1.004    1.799
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    1.788    1.796
 potential_pw2rs                    119 12.3    0.014    0.015    1.709    1.713
 cp_fm_cholesky_invert               11 10.9    1.694    1.697    1.694    1.697
 cp_fm_diag_elpa                     50 13.0    0.000    0.000    1.602    1.602
 cp_fm_diag_elpa_base                50 14.0    1.449    1.502    1.600    1.600
 copy_dbcsr_to_fm                   153 11.3    0.002    0.002    0.835    1.564
 wfi_extrapolate                     11  7.9    0.001    0.001    1.527    1.527
 grid_collocate_task_list           119  9.7    1.515    1.523    1.515    1.523
 hybrid_alltoall_any               4725 16.4    0.087    0.147    1.253    1.508
 make_images_data                  4572 15.5    0.043    0.045    1.217    1.490
 mp_alltoall_d11v                  2130 13.8    1.262    1.391    1.262    1.391
 qs_ot_get_orbitals                 108 10.5    0.000    0.000    1.274    1.311
 acc_transpose_blocks              9144 15.5    0.038    0.039    1.211    1.226
 mp_alltoall_z22v                  1201 16.6    1.159    1.185    1.159    1.185
 dbcsr_desymmetrize_deep            153 12.3    0.044    0.047    0.438    1.171
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    1.116    1.133
 multiply_cannon_sync_h2d          9144 15.5    1.044    1.053    1.044    1.053
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    0.986    1.035
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    0.986    1.001
 acc_transpose_blocks_kernels      9144 16.5    0.117    0.119    0.955    0.967
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="110", plot="h2o_64_md", label="(8n/1r/12t)", y=47.508000, yerr=0.000000
PlotPoint: name="111", plot="h2o_64_md_mem", label="(8n/1r/12t)", y=703.636364, yerr=14.398691
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/09/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    198.287135E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               8410880       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     117.0
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             501.653504E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 8483040
 MPI messages size (bytes):
  total size                         1.160510E+12
  min size                           0.000000E+00
  max size                           1.161504E+06
  average size                     136.803609E+03
 MPI breakdown and total messages size (bytes):
             size <=      128             1836752                        0
       128 < size <=     8192             1040592               8524529664
      8192 < size <=    32768             1486976              24362614784
     32768 < size <=   131072             2491776             216971345920
    131072 < size <=  4194304             1626944             910632720448
   4194304 < size <= 16777216                   0                        0
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66212.
 MP_Allreduce         9776                    488.
 MP_Sync                52
 MP_Alltoall          1938                1702427.
 MP_SendRecv         20900                   9096.
 MP_ISendRecv        20900                   9096.
 MP_Wait             37268
 MP_ISend            14300                  82312.
 MP_IRecv            14300                  82312.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.011    0.028   88.821   88.821
 qs_mol_dyn_low                       1  2.0    0.003    0.003   88.481   88.490
 qs_forces                           11  3.9    0.003    0.003   88.095   88.096
 qs_energies                         11  4.9    0.002    0.002   84.708   84.733
 scf_env_do_scf                      11  5.9    0.001    0.001   74.626   74.628
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.006   68.391   68.391
 dbcsr_multiply_generic            2055 12.4    0.107    0.110   52.438   52.797
 qs_scf_new_mos                      99  7.5    0.000    0.001   49.560   49.648
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   49.559   49.648
 ot_scf_mini                         99  9.5    0.002    0.002   47.111   47.192
 velocity_verlet                     10  3.0    0.030    0.219   46.217   46.246
 multiply_cannon                   2055 13.4    0.182    0.188   43.055   44.080
 multiply_cannon_loop              2055 14.4    1.524    1.559   41.878   42.744
 ot_mini                             99 10.5    0.001    0.001   27.402   27.490
 qs_ot_get_derivative                99 11.5    0.001    0.001   20.585   20.685
 multiply_cannon_multrec          49320 15.4   12.393   13.013   17.392   18.279
 rebuild_ks_matrix                  110  8.3    0.000    0.000   15.778   15.900
 qs_ks_build_kohn_sham_matrix       110  9.3    0.055    0.363   15.778   15.899
 qs_ks_update_qs_env                110  7.6    0.001    0.001   13.485   13.596
 mp_waitall_1                    220248 16.4   11.772   12.909   11.772   12.909
 qs_ot_get_p                        110 10.4    0.001    0.002   10.949   11.023
 multiply_cannon_sync_h2d         49320 15.4   10.242   10.769   10.242   10.769
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    8.042    8.552
 sum_up_and_integrate               110 10.3    0.036    0.043    7.894    8.159
 integrate_v_rspace                 110 11.3    0.003    0.003    7.857    8.129
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    7.230    7.879
 apply_single                       110 13.6    0.000    0.000    7.230    7.878
 multiply_cannon_metrocomm3       49320 15.4    0.079    0.082    6.600    7.690
 qs_rho_update_rho_low              110  7.6    0.001    0.001    7.558    7.680
 calculate_rho_elec                 110  8.6    0.021    0.026    7.557    7.679
 init_scf_run                        11  5.9    0.000    0.001    7.557    7.557
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    7.556    7.557
 qs_ot_p2m_diag                      48 11.0    0.012    0.018    7.508    7.544
 ot_diis_step                        99 11.5    0.005    0.006    6.595    6.596
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    6.568    6.568
 init_scf_loop                       11  6.9    0.000    0.000    6.192    6.193
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    5.883    5.911
 cp_fm_diag_elpa_base                48 14.0    5.864    5.893    5.881    5.910
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    5.621    5.670
 rs_pw_transfer                     902 11.9    0.012    0.013    4.862    5.352
 mp_sum_l                          6594 12.7    4.510    5.231    4.510    5.231
 dbcsr_mm_accdrv_process          87628 16.1    1.927    2.005    4.874    5.196
 density_rs2pw                      110  9.6    0.004    0.005    4.244    4.865
 make_m2s                          4110 13.4    0.060    0.064    4.412    4.535
 make_images                       4110 14.4    0.178    0.191    4.316    4.441
 wfi_extrapolate                     11  7.9    0.001    0.001    4.330    4.330
 pw_transfer                       1331 11.6    0.056    0.067    3.868    4.197
 calculate_dm_sparse                110  9.5    0.001    0.001    4.034    4.128
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.009    3.779    4.109
 multiply_cannon_metrocomm1       49320 15.4    0.062    0.065    2.740    4.106
 prepare_preconditioner              11  7.9    0.000    0.000    3.933    3.949
 make_preconditioner                 11  8.9    0.000    0.000    3.933    3.949
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    3.793    3.798
 make_full_inverse_cholesky          11  9.9    0.000    0.000    3.685    3.741
 potential_pw2rs                    110 12.3    0.006    0.007    3.356    3.673
 qs_ot_get_orbitals                  99 10.5    0.001    0.001    3.438    3.502
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    3.406    3.464
 fft_wrap_pw1pw2_140                451 13.1    0.452    0.501    3.080    3.420
 grid_integrate_task_list           110 12.3    3.232    3.401    3.232    3.401
 fft3d_ps                          1111 14.6    0.783    0.872    2.971    3.272
 calculate_first_density_matrix       1  7.0    0.000    0.001    3.096    3.099
 mp_alltoall_d11v                  2046 13.8    2.337    2.814    2.337    2.814
 jit_kernel_multiply                 13 15.9    2.673    2.772    2.673    2.772
 mp_waitany                       14300 13.8    2.076    2.750    2.076    2.750
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    2.426    2.450
 grid_collocate_task_list           110  9.6    2.155    2.448    2.155    2.448
 acc_transpose_blocks             49320 15.4    0.212    0.219    2.247    2.403
 cp_fm_cholesky_invert               11 10.9    2.223    2.227    2.223    2.227
 mp_alltoall_z22v                  1111 16.6    1.814    2.214    1.814    2.214
 make_images_data                  4110 15.4    0.043    0.046    1.918    2.124
 hybrid_alltoall_any               4261 16.3    0.083    0.481    1.651    1.979
 mp_sum_d                          3889 11.9    1.410    1.960    1.410    1.960
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.894    1.929
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="200", plot="h2o_128_md", label="(8n/12r/1t)", y=88.821000, yerr=0.000000
PlotPoint: name="201", plot="h2o_128_md_mem", label="(8n/12r/1t)", y=476.181818, yerr=2.979267
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/10/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    390.715586E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               5019072       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     196.1
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             587.923456E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1972800
 MPI messages size (bytes):
  total size                         1.077520E+12
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     546.188250E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               14916                        0
       128 < size <=     8192              222984               1826684928
      8192 < size <=    32768              520356              13399818240
     32768 < size <=   131072              372336              35386294272
    131072 < size <=  4194304              787758             788321309808
   4194304 < size <= 16777216               54450             238588003280
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66430.
 MP_Allreduce         9775                    566.
 MP_Sync                52
 MP_Alltoall          1717                3608167.
 MP_SendRecv         10340                  26400.
 MP_ISendRecv        10340                  26400.
 MP_Wait             22352
 MP_ISend            10164                 155761.
 MP_IRecv            10164                 155761.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.012    0.031   73.747   73.748
 qs_mol_dyn_low                       1  2.0    0.003    0.004   73.353   73.503
 qs_forces                           11  3.9    0.003    0.003   73.271   73.272
 qs_energies                         11  4.9    0.001    0.002   69.844   69.847
 scf_env_do_scf                      11  5.9    0.000    0.001   60.728   60.731
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.006   52.367   52.368
 dbcsr_multiply_generic            2055 12.4    0.117    0.120   39.257   39.414
 velocity_verlet                     10  3.0    0.001    0.002   38.106   38.108
 qs_scf_new_mos                      99  7.5    0.001    0.001   35.105   35.285
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   35.104   35.285
 ot_scf_mini                         99  9.5    0.003    0.003   33.410   33.601
 multiply_cannon                   2055 13.4    0.221    0.244   32.076   33.248
 multiply_cannon_loop              2055 14.4    0.920    0.941   30.653   31.536
 ot_mini                             99 10.5    0.001    0.001   19.373   19.552
 multiply_cannon_multrec          24660 15.4    7.676    9.360   14.478   16.329
 rebuild_ks_matrix                  110  8.3    0.000    0.000   14.334   14.488
 qs_ks_build_kohn_sham_matrix       110  9.3    0.012    0.014   14.334   14.487
 qs_ot_get_derivative                99 11.5    0.001    0.001   13.488   13.675
 qs_ks_update_qs_env                110  7.6    0.001    0.001   12.618   12.759
 mp_waitall_1                    176588 16.5    8.356   10.930    8.356   10.930
 init_scf_loop                       11  6.9    0.000    0.000    8.319    8.320
 multiply_cannon_sync_h2d         24660 15.4    7.042    8.296    7.042    8.296
 multiply_cannon_metrocomm3       24660 15.4    0.067    0.070    5.406    8.244
 qs_ot_get_p                        110 10.4    0.001    0.001    7.295    7.525
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    6.620    7.242
 apply_single                       110 13.6    0.000    0.001    6.620    7.242
 sum_up_and_integrate               110 10.3    0.052    0.058    7.087    7.098
 integrate_v_rspace                 110 11.3    0.002    0.003    7.035    7.048
 dbcsr_mm_accdrv_process          52282 16.1    4.735    5.910    6.640    6.952
 qs_rho_update_rho_low              110  7.6    0.001    0.001    6.748    6.760
 calculate_rho_elec                 110  8.6    0.040    0.048    6.747    6.760
 init_scf_run                        11  5.9    0.000    0.001    6.669    6.669
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    6.668    6.669
 prepare_preconditioner              11  7.9    0.000    0.000    6.227    6.245
 make_preconditioner                 11  8.9    0.000    0.000    6.227    6.245
 make_full_inverse_cholesky          11  9.9    0.000    0.000    5.793    5.961
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    5.087    5.898
 ot_diis_step                        99 11.5    0.010    0.010    5.834    5.834
 qs_ot_p2m_diag                      48 11.0    0.029    0.044    5.243    5.263
 make_m2s                          4110 13.4    0.057    0.060    4.678    5.190
 make_images                       4110 14.4    0.401    0.448    4.569    5.076
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.784    4.784
 density_rs2pw                      110  9.6    0.004    0.005    3.798    4.334
 rs_pw_transfer                     902 11.9    0.013    0.014    3.606    4.221
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    4.105    4.119
 pw_transfer                       1331 11.6    0.067    0.075    3.948    4.116
 cp_fm_diag_elpa_base                48 14.0    4.049    4.071    4.102    4.115
 fft_wrap_pw1pw2                   1111 12.6    0.009    0.009    3.841    4.010
 wfi_extrapolate                     11  7.9    0.001    0.001    3.732    3.732
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.002    3.586    3.588
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    3.350    3.440
 grid_integrate_task_list           110 12.3    3.151    3.397    3.151    3.397
 fft_wrap_pw1pw2_140                451 13.1    0.522    0.538    3.152    3.321
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    3.201    3.249
 make_images_data                  4110 15.4    0.046    0.051    2.552    3.120
 hybrid_alltoall_any               4261 16.3    0.103    0.446    2.213    3.062
 calculate_dm_sparse                110  9.5    0.001    0.001    3.018    3.053
 cp_fm_cholesky_invert               11 10.9    3.018    3.025    3.018    3.025
 fft3d_ps                          1111 14.6    1.105    1.330    2.813    2.996
 potential_pw2rs                    110 12.3    0.008    0.009    2.820    2.851
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.822    2.825
 jit_kernel_multiply                 11 16.2    1.557    2.663    1.557    2.663
 grid_collocate_task_list           110  9.6    2.096    2.544    2.096    2.544
 mp_sum_l                          6594 12.7    1.874    2.484    1.874    2.484
 mp_alltoall_d11v                  2046 13.8    1.947    2.358    1.947    2.358
 qs_ot_get_orbitals                  99 10.5    0.001    0.001    2.015    2.038
 mp_waitany                       10164 13.8    1.291    1.929    1.291    1.929
 qs_energies_init_hamiltonians       11  5.9    0.000    0.002    1.903    1.905
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.874    1.887
 mp_allgather_i34                  2055 14.4    0.752    1.849    0.752    1.849
 multiply_cannon_metrocomm4       22605 15.4    0.078    0.081    0.784    1.706
 acc_transpose_blocks             24660 15.4    0.112    0.115    1.548    1.624
 mp_irecv_dv                      57340 16.2    0.657    1.582    0.657    1.582
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.567    1.581
 cp_fm_cholesky_decompose            22 10.9    1.560    1.565    1.560    1.565
 rs_pw_transfer_RS2PW_140           121 11.5    0.208    0.219    0.993    1.564
 mp_alltoall_z22v                  1111 16.6    1.423    1.540    1.423    1.540
 dbcsr_complete_redistribute        325 12.2    0.237    0.290    1.250    1.526
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.002    1.386    1.491
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="202", plot="h2o_128_md", label="(8n/6r/2t)", y=73.748000, yerr=0.000000
PlotPoint: name="203", plot="h2o_128_md_mem", label="(8n/6r/2t)", y=555.000000, yerr=7.298817
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/11/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    404.681598E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               3346752       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     294.1
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             662.528000E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                  854880
 MPI messages size (bytes):
  total size                       708.322787E+09
  min size                           0.000000E+00
  max size                           6.553600E+06
  average size                     828.564000E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768              222984               7302414336
     32768 < size <=   131072              153888              10085203968
    131072 < size <=  4194304              389376             200257044480
   4194304 < size <= 16777216               82208             490679162176
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66421.
 MP_Allreduce         9774                    562.
 MP_Sync                52
 MP_Alltoall          1496                4511006.
 MP_SendRecv          6820                  27424.
 MP_ISendRecv         6820                  27424.
 MP_Wait             25498
 MP_ISend            17072                 115022.
 MP_IRecv            17072                 115022.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.067    0.398   65.099   65.100
 qs_mol_dyn_low                       1  2.0    0.003    0.004   64.354   64.365
 qs_forces                           11  3.9    0.003    0.003   64.286   64.288
 qs_energies                         11  4.9    0.015    0.015   60.998   61.002
 scf_env_do_scf                      11  5.9    0.000    0.001   52.136   52.137
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.006   42.571   42.572
 velocity_verlet                     10  3.0    0.001    0.002   34.105   34.107
 dbcsr_multiply_generic            2055 12.4    0.117    0.124   30.103   30.379
 qs_scf_new_mos                      99  7.5    0.001    0.001   26.756   26.837
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   26.756   26.836
 ot_scf_mini                         99  9.5    0.002    0.003   25.507   25.612
 multiply_cannon                   2055 13.4    0.213    0.222   23.084   24.328
 multiply_cannon_loop              2055 14.4    0.616    0.630   21.835   22.828
 ot_mini                             99 10.5    0.001    0.001   14.510   14.612
 rebuild_ks_matrix                  110  8.3    0.000    0.000   12.835   12.971
 qs_ks_build_kohn_sham_matrix       110  9.3    0.012    0.014   12.835   12.971
 qs_ks_update_qs_env                110  7.6    0.001    0.001   11.285   11.410
 multiply_cannon_multrec          16440 15.4    4.019    4.995   10.399   11.318
 qs_ot_get_derivative                99 11.5    0.001    0.001   10.104   10.207
 mp_waitall_1                    139946 16.5    7.475   10.169    7.475   10.169
 init_scf_loop                       11  6.9    0.000    0.000    9.526    9.527
 prepare_preconditioner              11  7.9    0.000    0.000    7.394    7.410
 make_preconditioner                 11  8.9    0.000    0.001    7.394    7.410
 multiply_cannon_metrocomm3       16440 15.4    0.042    0.046    4.331    7.134
 make_full_inverse_cholesky          11  9.9    0.000    0.000    6.667    7.048
 sum_up_and_integrate               110 10.3    0.061    0.062    6.924    6.937
 integrate_v_rspace                 110 11.3    0.003    0.003    6.863    6.877
 qs_rho_update_rho_low              110  7.6    0.001    0.001    6.442    6.452
 calculate_rho_elec                 110  8.6    0.059    0.060    6.442    6.452
 dbcsr_mm_accdrv_process          34862 16.1    4.992    5.890    6.233    6.423
 init_scf_run                        11  5.9    0.000    0.001    6.388    6.388
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    6.388    6.388
 qs_ot_get_p                        110 10.4    0.001    0.001    6.108    6.257
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    4.966    5.362
 apply_single                       110 13.6    0.000    0.000    4.966    5.361
 make_m2s                          4110 13.4    0.050    0.051    4.592    4.997
 make_images                       4110 14.4    0.395    0.514    4.477    4.882
 density_rs2pw                      110  9.6    0.004    0.005    3.461    4.626
 qs_ot_p2m_diag                      48 11.0    0.042    0.044    4.385    4.390
 ot_diis_step                        99 11.5    0.011    0.011    4.377    4.378
 multiply_cannon_sync_h2d         16440 15.4    3.725    4.330    3.725    4.330
 rs_pw_transfer                     902 11.9    0.011    0.012    3.179    4.312
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    3.469    4.117
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.023    4.024
 pw_transfer                       1331 11.6    0.066    0.073    3.877    3.894
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.008    3.769    3.789
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    3.555    3.556
 grid_integrate_task_list           110 12.3    3.160    3.424    3.160    3.424
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.392    3.403
 cp_fm_diag_elpa_base                48 14.0    3.314    3.355    3.390    3.401
 make_images_data                  4110 15.4    0.043    0.047    2.602    3.227
 fft_wrap_pw1pw2_140                451 13.1    0.640    0.648    3.166    3.185
 calculate_first_density_matrix       1  7.0    0.000    0.000    3.174    3.176
 hybrid_alltoall_any               4261 16.3    0.106    0.378    2.318    3.124
 wfi_extrapolate                     11  7.9    0.001    0.001    3.110    3.110
 cp_fm_cholesky_invert               11 10.9    3.056    3.062    3.056    3.062
 mp_sum_l                          6594 12.7    2.125    2.869    2.125    2.869
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.750    2.789
 potential_pw2rs                    110 12.3    0.011    0.011    2.609    2.628
 fft3d_ps                          1111 14.6    1.092    1.104    2.587    2.601
 calculate_dm_sparse                110  9.5    0.001    0.001    2.551    2.585
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    2.462    2.525
 mp_alltoall_d11v                  2046 13.8    1.951    2.512    1.951    2.512
 grid_collocate_task_list           110  9.6    2.130    2.499    2.130    2.499
 dbcsr_complete_redistribute        325 12.2    0.788    0.830    2.020    2.494
 mp_waitany                       17072 13.8    1.250    2.475    1.250    2.475
 multiply_cannon_metrocomm4       14385 15.4    0.045    0.048    0.851    2.226
 mp_irecv_dv                      48980 15.7    0.782    2.106    0.782    2.106
 rs_pw_transfer_RS2PW_140           121 11.5    0.179    0.183    0.934    2.089
 jit_kernel_multiply                  8 16.5    0.856    2.069    0.856    2.069
 qs_energies_init_hamiltonians       11  5.9    0.000    0.002    1.998    2.000
 copy_fm_to_dbcsr                   174 11.2    0.001    0.001    1.522    1.980
 cp_fm_upper_to_full                 70 14.2    1.403    1.815    1.403    1.815
 cp_fm_cholesky_decompose            22 10.9    1.764    1.786    1.764    1.786
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.746    1.759
 mp_allgather_i34                  2055 14.4    0.569    1.658    0.569    1.658
 acc_transpose_blocks             16440 15.4    0.073    0.076    1.307    1.540
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.488    1.500
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.002    1.365    1.482
 rs_gather_matrices                 110 12.3    0.232    0.260    1.015    1.481
 mp_alltoall_z22v                  1111 16.6    1.284    1.303    1.284    1.303
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="204", plot="h2o_128_md", label="(8n/4r/3t)", y=65.100000, yerr=0.000000
PlotPoint: name="205", plot="h2o_128_md_mem", label="(8n/4r/3t)", y=627.272727, yerr=8.853257
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/12/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    601.317074E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               4916280       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     200.2
 marketing flops                    15.646302E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             736.563200E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  937080
 MPI messages size (bytes):
  total size                       523.723932E+09
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     558.889250E+03
 MPI breakdown and total messages size (bytes):
             size <=      128                6996                        0
       128 < size <=     8192                 264                  2162688
      8192 < size <=    32768              304932               8165326848
     32768 < size <=   131072              110640               6338641920
    131072 < size <=  4194304              489498             400769458320
   4194304 < size <= 16777216               24750             108449092400
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66419.
 MP_Allreduce         9774                    603.
 MP_Sync                52
 MP_Alltoall          1496                5863162.
 MP_SendRecv          5060                  43184.
 MP_ISendRecv         5060                  43184.
 MP_Wait             20042
 MP_ISend            13376                 163145.
 MP_IRecv            13376                 163145.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.017    0.031   69.666   69.668
 qs_mol_dyn_low                       1  2.0    0.003    0.004   69.330   69.339
 qs_forces                           11  3.9    0.003    0.004   69.134   69.135
 qs_energies                         11  4.9    0.016    0.064   65.280   65.283
 scf_env_do_scf                      11  5.9    0.011    0.055   56.235   56.250
 scf_env_do_scf_inner_loop           99  6.5    0.003    0.006   43.757   43.757
 velocity_verlet                     10  3.0    0.001    0.002   38.660   38.667
 dbcsr_multiply_generic            2055 12.4    0.113    0.118   30.381   30.576
 qs_scf_new_mos                      99  7.5    0.001    0.001   28.012   28.157
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   28.011   28.157
 ot_scf_mini                         99  9.5    0.003    0.004   26.365   26.478
 multiply_cannon                   2055 13.4    0.244    0.258   22.877   23.982
 multiply_cannon_loop              2055 14.4    0.888    0.913   21.477   22.019
 ot_mini                             99 10.5    0.001    0.001   14.683   14.821
 multiply_cannon_multrec          24660 15.4    4.247    6.910   12.834   14.116
 rebuild_ks_matrix                  110  8.3    0.000    0.000   13.025   13.129
 qs_ks_build_kohn_sham_matrix       110  9.3    0.060    0.375   13.025   13.129
 init_scf_loop                       11  6.9    0.002    0.006   12.425   12.448
 qs_ks_update_qs_env                110  7.6    0.001    0.001   11.179   11.270
 prepare_preconditioner              11  7.9    0.000    0.000   10.553   10.570
 make_preconditioner                 11  8.9    0.001    0.004   10.553   10.570
 qs_ot_get_derivative                99 11.5    0.001    0.001   10.251   10.372
 make_full_inverse_cholesky          11  9.9    0.000    0.000    8.753   10.238
 dbcsr_mm_accdrv_process          52304 16.0    7.258    8.633    8.431    9.397
 sum_up_and_integrate               110 10.3    0.068    0.071    6.879    6.895
 integrate_v_rspace                 110 11.3    0.003    0.004    6.811    6.825
 qs_ot_get_p                        110 10.4    0.001    0.001    6.588    6.728
 qs_rho_update_rho_low              110  7.6    0.001    0.001    6.547    6.566
 calculate_rho_elec                 110  8.6    0.078    0.082    6.546    6.565
 mp_waitall_1                    121746 16.5    4.721    6.498    4.721    6.498
 make_m2s                          4110 13.4    0.059    0.060    5.862    6.166
 init_scf_run                        11  5.9    0.000    0.001    6.164    6.165
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    6.164    6.165
 make_images                       4110 14.4    0.575    0.694    5.722    6.022
 cp_fm_upper_to_full                 70 14.2    3.357    4.890    3.357    4.890
 qs_ot_p2m_diag                      48 11.0    0.055    0.066    4.690    4.705
 ot_diis_step                        99 11.5    0.011    0.011    4.393    4.393
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.186    4.187
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    4.033    4.115
 apply_single                       110 13.6    0.000    0.000    4.033    4.115
 pw_transfer                       1331 11.6    0.066    0.076    3.975    4.017
 dbcsr_complete_redistribute        325 12.2    0.416    0.452    2.755    3.931
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.008    3.869    3.915
 density_rs2pw                      110  9.6    0.004    0.004    3.381    3.905
 rs_pw_transfer                     902 11.9    0.010    0.011    3.186    3.763
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.531    3.541
 cp_fm_diag_elpa_base                48 14.0    3.371    3.432    3.529    3.539
 grid_integrate_task_list           110 12.3    3.282    3.515    3.282    3.515
 calculate_dm_sparse                110  9.5    0.001    0.001    3.418    3.476
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    3.401    3.450
 make_images_data                  4110 15.4    0.045    0.049    2.971    3.385
 copy_fm_to_dbcsr                   174 11.2    0.001    0.001    2.173    3.328
 multiply_cannon_sync_h2d         24660 15.4    3.194    3.328    3.194    3.328
 fft_wrap_pw1pw2_140                451 13.1    0.668    0.692    3.238    3.289
 qs_ot_get_derivative_diag           47 12.0    0.001    0.002    3.174    3.242
 hybrid_alltoall_any               4261 16.3    0.121    0.460    2.468    3.227
 cp_fm_cholesky_invert               11 10.9    3.138    3.147    3.138    3.147
 wfi_extrapolate                     11  7.9    0.001    0.001    3.134    3.134
 multiply_cannon_metrocomm3       24660 15.4    0.036    0.038    1.362    3.132
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    1.791    2.934
 calculate_first_density_matrix       1  7.0    0.001    0.002    2.895    2.918
 mp_alltoall_i22                    605 13.7    1.715    2.880    1.715    2.880
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.812    2.814
 fft3d_ps                          1111 14.6    1.086    1.117    2.630    2.652
 potential_pw2rs                    110 12.3    0.013    0.013    2.471    2.494
 grid_collocate_task_list           110  9.6    2.224    2.490    2.224    2.490
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.439    2.473
 qs_energies_init_hamiltonians       11  5.9    0.001    0.002    2.309    2.310
 mp_alltoall_d11v                  2046 13.8    1.903    2.130    1.903    2.130
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    2.092    2.104
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    2.013    2.054
 mp_waitany                       13376 13.8    1.471    2.023    1.471    2.023
 cp_fm_cholesky_decompose            22 10.9    1.837    1.877    1.837    1.877
 acc_transpose_blocks             24660 15.4    0.104    0.107    1.781    1.802
 jit_kernel_multiply                  9 16.0    0.842    1.801    0.842    1.801
 qs_ot_get_orbitals                  99 10.5    0.001    0.001    1.746    1.776
 build_core_hamiltonian_matrix_      11  4.9    0.002    0.011    1.615    1.708
 mp_sum_l                          6594 12.7    0.965    1.599    0.965    1.599
 multiply_cannon_metrocomm4       20550 15.4    0.060    0.062    0.840    1.574
 mp_allgather_i34                  2055 14.4    0.500    1.553    0.500    1.553
 mp_irecv_dv                      62702 16.1    0.739    1.492    0.739    1.492
 rs_pw_transfer_RS2PW_140           121 11.5    0.172    0.178    0.869    1.413
 mp_alltoall_z22v                  1111 16.6    1.328    1.400    1.328    1.400
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="206", plot="h2o_128_md", label="(8n/3r/4t)", y=69.668000, yerr=0.000000
PlotPoint: name="207", plot="h2o_128_md_mem", label="(8n/3r/4t)", y=698.090909, yerr=9.404588
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/13/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                    807.299199E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               1438408       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     684.2
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank             830.341120E+06
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  197280
 MPI messages size (bytes):
  total size                       339.125567E+09
  min size                           0.000000E+00
  max size                          13.107200E+06
  average size                       1.719006E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 132                  4325376
     32768 < size <=   131072               88656              11620319232
    131072 < size <=  4194304               89424             117209825280
   4194304 < size <= 16777216               17616             210291069504
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         7346                     33.
 MP_Alltoall          8043                 263767.
 MP_ISend            32836                 654203.
 MP_IRecv            32836                 654587.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3473                  66417.
 MP_Allreduce         9774                    644.
 MP_Sync                52
 MP_Alltoall          1496                8504061.
 MP_SendRecv          3300                  54848.
 MP_ISendRecv         3300                  54848.
 MP_Wait             13926
 MP_ISend             9240                 278857.
 MP_IRecv             9240                 278857.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.012    0.028   59.972   59.973
 qs_mol_dyn_low                       1  2.0    0.003    0.003   59.680   59.696
 qs_forces                           11  3.9    0.003    0.003   59.593   59.593
 qs_energies                         11  4.9    0.001    0.001   55.813   55.821
 scf_env_do_scf                      11  5.9    0.000    0.001   46.663   46.663
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.006   37.988   37.989
 velocity_verlet                     10  3.0    0.006    0.007   33.130   33.135
 dbcsr_multiply_generic            2055 12.4    0.104    0.108   23.830   23.971
 qs_scf_new_mos                      99  7.5    0.001    0.001   22.197   22.237
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   22.196   22.236
 ot_scf_mini                         99  9.5    0.002    0.002   20.914   20.930
 multiply_cannon                   2055 13.4    0.243    0.255   17.842   19.382
 multiply_cannon_loop              2055 14.4    0.322    0.336   16.417   16.823
 rebuild_ks_matrix                  110  8.3    0.000    0.000   12.360   12.384
 qs_ks_build_kohn_sham_matrix       110  9.3    0.013    0.013   12.360   12.383
 qs_ks_update_qs_env                110  7.6    0.001    0.001   10.947   10.964
 ot_mini                             99 10.5    0.001    0.001   10.952   10.959
 init_scf_loop                       11  6.9    0.000    0.000    8.621    8.623
 multiply_cannon_multrec           8220 15.4    3.251    4.546    7.614    8.589
 mp_waitall_1                    103326 16.6    6.578    8.364    6.578    8.364
 qs_ot_get_derivative                99 11.5    0.001    0.001    7.131    7.146
 prepare_preconditioner              11  7.9    0.000    0.000    6.924    6.926
 make_preconditioner                 11  8.9    0.000    0.000    6.924    6.926
 sum_up_and_integrate               110 10.3    0.079    0.081    6.856    6.872
 integrate_v_rspace                 110 11.3    0.003    0.003    6.776    6.792
 qs_rho_update_rho_low              110  7.6    0.001    0.001    6.738    6.750
 calculate_rho_elec                 110  8.6    0.114    0.115    6.737    6.750
 make_full_inverse_cholesky          11  9.9    0.000    0.000    6.478    6.568
 qs_ot_get_p                        110 10.4    0.001    0.001    5.923    5.936
 init_scf_run                        11  5.9    0.000    0.001    5.926    5.926
 scf_env_initial_rho_setup           11  6.9    0.000    0.001    5.926    5.926
 dbcsr_mm_accdrv_process          17442 15.9    2.944    3.957    4.233    5.122
 make_m2s                          4110 13.4    0.039    0.040    4.594    4.844
 make_images                       4110 14.4    0.637    0.699    4.464    4.711
 qs_ot_p2m_diag                      48 11.0    0.081    0.084    4.402    4.409
 pw_transfer                       1331 11.6    0.066    0.072    4.235    4.257
 fft_wrap_pw1pw2                   1111 12.6    0.008    0.009    4.127    4.155
 multiply_cannon_metrocomm3        8220 15.4    0.018    0.018    2.903    4.120
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.063    4.063
 ot_diis_step                        99 11.5    0.012    0.012    3.794    3.794
 density_rs2pw                      110  9.6    0.004    0.004    3.389    3.762
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    3.699    3.723
 apply_single                       110 13.6    0.000    0.000    3.699    3.723
 grid_integrate_task_list           110 12.3    3.352    3.535    3.352    3.535
 fft_wrap_pw1pw2_140                451 13.1    0.833    0.844    3.485    3.518
 cp_fm_cholesky_invert               11 10.9    3.470    3.474    3.470    3.474
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.410    3.417
 cp_fm_diag_elpa_base                48 14.0    3.346    3.374    3.408    3.415
 multiply_cannon_sync_h2d          8220 15.4    2.908    3.067    2.908    3.067
 hybrid_alltoall_any               4261 16.3    0.201    0.870    2.492    3.043
 make_images_data                  4110 15.4    0.038    0.043    2.573    3.030
 rs_pw_transfer                     902 11.9    0.010    0.010    2.591    2.972
 wfi_extrapolate                     11  7.9    0.001    0.001    2.922    2.922
 calculate_first_density_matrix       1  7.0    0.000    0.000    2.895    2.896
 calculate_dm_sparse                110  9.5    0.001    0.001    2.690    2.725
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    2.702    2.702
 fft3d_ps                          1111 14.6    1.140    1.166    2.673    2.695
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    2.619    2.620
 grid_collocate_task_list           110  9.6    2.319    2.574    2.319    2.574
 potential_pw2rs                    110 12.3    0.015    0.016    2.342    2.353
 mp_alltoall_d11v                  2046 13.8    1.862    2.270    1.862    2.270
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    2.239    2.246
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    2.058    2.071
 cp_fm_cholesky_decompose            22 10.9    1.986    2.008    1.986    2.008
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    1.778    1.986
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    1.969    1.978
 mp_allgather_i34                  2055 14.4    0.610    1.749    0.610    1.749
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    1.743    1.749
 dbcsr_complete_redistribute        325 12.2    0.554    0.578    1.551    1.653
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    1.523    1.646
 cp_dbcsr_plus_fm_fm_t_native        22  8.9    0.001    0.001    1.628    1.639
 multiply_cannon_metrocomm1        8220 15.4    0.021    0.022    1.100    1.612
 jit_kernel_multiply                  8 15.8    0.979    1.572    0.979    1.572
 mp_waitany                        9240 13.8    1.124    1.548    1.124    1.548
 qs_create_task_list                 11  7.9    0.000    0.001    1.225    1.332
 generate_qs_task_list               11  8.9    0.375    0.441    1.225    1.331
 rs_gather_matrices                 110 12.3    0.324    0.370    0.985    1.318
 mp_alltoall_z22v                  1111 16.6    1.298    1.310    1.298    1.310
 rs_pw_transfer_RS2PW_140           121 11.5    0.171    0.179    0.832    1.240
 copy_dbcsr_to_fm                   151 11.3    0.003    0.003    1.216    1.238
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="208", plot="h2o_128_md", label="(8n/2r/6t)", y=59.973000, yerr=0.000000
PlotPoint: name="209", plot="h2o_128_md_mem", label="(8n/2r/6t)", y=783.909091, yerr=8.596559
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/14/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32         184415158272       0.0%      0.0%    100.0%
 flops     9 x     9 x    32         269180485632       0.0%      0.0%    100.0%
 flops     9 x    22 x    32         349395425280       0.0%      0.0%    100.0%
 flops    22 x     9 x    32         350042406912       0.0%      0.0%    100.0%
 flops    22 x    22 x    32         453581815808       0.0%      0.0%    100.0%
 flops    32 x    32 x     9         465064427520       0.0%      0.0%    100.0%
 flops    32 x    32 x    22         568412078080       0.0%      0.0%    100.0%
 flops     9 x    32 x    32         572195340288       0.0%      0.0%    100.0%
 flops    22 x    32 x    32         699349860352       0.0%      0.0%    100.0%
 flops     9 x    32 x     9        1735942275072       0.0%      0.0%    100.0%
 flops    22 x    32 x     9        2216407818240       0.0%      0.0%    100.0%
 flops     9 x    32 x    22        2216407818240       0.0%      0.0%    100.0%
 flops    22 x    32 x    22        2803661053952       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        12.884056E+12       0.0%      0.0%    100.0%
 flops max/rank                      1.612391E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                          984178160       0.0%      0.0%    100.0%
 number of processed stacks               1464624       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     672.0
 marketing flops                    15.646297E+12
 -------------------------------------------------------------------------------
 # multiplications                           2055
 max memory usage/rank               1.390391E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   82200
 MPI messages size (bytes):
  total size                       297.640985E+09
  min size                           0.000000E+00
  max size                          26.214400E+06
  average size                       3.620936E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                  44                  1441792
     32768 < size <=   131072               18560               2432696320
    131072 < size <=  4194304               54216              84915781632
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            8808             210291069504
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3462                  67098.
 MP_Allreduce         9752                    812.
 MP_Sync                52
 MP_Alltoall          1474               16505187.
 MP_SendRecv          2310                 360267.
 MP_ISendRecv         2310                 360267.
 MP_Wait              5214
 MP_ISend             2420                1187840.
 MP_IRecv             2420                1187840.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.016    0.033   94.654   94.654
 qs_mol_dyn_low                       1  2.0    0.003    0.003   94.280   94.290
 qs_forces                           11  3.9    0.004    0.006   94.210   94.211
 qs_energies                         11  4.9    0.002    0.002   89.886   89.890
 scf_env_do_scf                      11  5.9    0.000    0.001   78.454   78.454
 velocity_verlet                     10  3.0    0.002    0.002   59.177   59.184
 scf_env_do_scf_inner_loop           99  6.5    0.002    0.006   48.462   48.464
 dbcsr_multiply_generic            2055 12.4    0.119    0.123   31.110   31.239
 init_scf_loop                       11  6.9    0.000    0.000   29.908   29.910
 qs_scf_new_mos                      99  7.5    0.001    0.001   29.148   29.165
 qs_scf_loop_do_ot                   99  8.5    0.001    0.001   29.147   29.164
 prepare_preconditioner              11  7.9    0.000    0.000   27.811   27.815
 make_preconditioner                 11  8.9    0.000    0.000   27.811   27.815
 ot_scf_mini                         99  9.5    0.002    0.002   27.321   27.337
 make_full_inverse_cholesky          11  9.9    0.000    0.000   21.878   27.266
 multiply_cannon                   2055 13.4    0.340    0.364   23.383   24.572
 multiply_cannon_loop              2055 14.4    0.341    0.343   21.421   22.198
 cp_fm_upper_to_full                 70 14.2   12.861   18.501   12.861   18.501
 ot_mini                             99 10.5    0.001    0.001   14.933   14.961
 rebuild_ks_matrix                  110  8.3    0.000    0.000   14.726   14.758
 qs_ks_build_kohn_sham_matrix       110  9.3    0.013    0.013   14.726   14.758
 qs_ks_update_qs_env                110  7.6    0.001    0.001   13.299   13.328
 dbcsr_complete_redistribute        325 12.2    1.103    1.145    7.754   11.120
 multiply_cannon_multrec           8220 15.4    4.789    5.023   10.308   10.538
 qs_ot_get_derivative                99 11.5    0.001    0.001   10.275   10.287
 copy_fm_to_dbcsr                   174 11.2    0.001    0.001    6.652   10.026
 mp_waitall_1                     84994 16.7    8.585    9.574    8.585    9.574
 transfer_fm_to_dbcsr                11  9.9    0.000    0.000    5.918    9.265
 mp_alltoall_i22                    605 13.7    5.551    8.902    5.551    8.902
 qs_rho_update_rho_low              110  7.6    0.001    0.001    8.355    8.392
 calculate_rho_elec                 110  8.6    0.224    0.224    8.354    8.391
 sum_up_and_integrate               110 10.3    0.150    0.152    7.780    7.792
 integrate_v_rspace                 110 11.3    0.004    0.004    7.629    7.640
 init_scf_run                        11  5.9    0.000    0.001    7.063    7.064
 scf_env_initial_rho_setup           11  6.9    0.000    0.000    7.063    7.063
 qs_ot_get_p                        110 10.4    0.001    0.001    6.933    6.957
 make_m2s                          4110 13.4    0.042    0.043    5.722    6.191
 cp_fm_cholesky_invert               11 10.9    6.130    6.135    6.130    6.135
 multiply_cannon_metrocomm3        8220 15.4    0.018    0.019    5.409    6.102
 make_images                       4110 14.4    0.879    0.931    5.533    6.000
 dbcsr_mm_accdrv_process          11614 15.7    3.290    3.614    5.374    5.781
 pw_transfer                       1331 11.6    0.075    0.076    5.503    5.514
 fft_wrap_pw1pw2                   1111 12.6    0.009    0.009    5.385    5.396
 apply_preconditioner_dbcsr         110 12.6    0.000    0.000    4.764    5.211
 apply_single                       110 13.6    0.000    0.000    4.764    5.210
 qs_ot_p2m_diag                      48 11.0    0.151    0.156    5.108    5.112
 fft_wrap_pw1pw2_140                451 13.1    1.336    1.341    4.607    4.618
 cp_dbcsr_syevd                      48 12.0    0.003    0.003    4.618    4.618
 ot_diis_step                        99 11.5    0.015    0.016    4.611    4.611
 density_rs2pw                      110  9.6    0.004    0.004    4.258    4.293
 multiply_cannon_sync_h2d          8220 15.4    3.951    3.952    3.951    3.952
 cp_fm_diag_elpa                     48 13.0    0.000    0.000    3.903    3.903
 cp_fm_diag_elpa_base                48 14.0    3.340    3.546    3.900    3.900
 cp_dbcsr_sm_fm_multiply             37  9.5    0.001    0.001    3.847    3.849
 make_images_data                  4110 15.4    0.041    0.044    3.081    3.767
 hybrid_alltoall_any               4261 16.3    0.256    0.552    3.095    3.762
 grid_integrate_task_list           110 12.3    3.694    3.758    3.694    3.758
 qs_energies_init_hamiltonians       11  5.9    0.005    0.012    3.720    3.721
 wfi_extrapolate                     11  7.9    0.001    0.001    3.642    3.642
 qs_ot_get_derivative_taylor         52 13.0    0.001    0.001    3.115    3.566
 fft3d_ps                          1111 14.6    1.300    1.307    3.317    3.335
 calculate_first_density_matrix       1  7.0    0.000    0.000    3.285    3.286
 calculate_dm_sparse                110  9.5    0.001    0.001    3.221    3.246
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    3.186    3.194
 rs_pw_transfer                     902 11.9    0.010    0.011    2.794    2.821
 potential_pw2rs                    110 12.3    0.021    0.022    2.769    2.779
 qs_ot_get_derivative_diag           47 12.0    0.001    0.001    2.711    2.716
 grid_collocate_task_list           110  9.6    2.682    2.693    2.682    2.693
 cp_fm_cholesky_decompose            22 10.9    2.410    2.436    2.410    2.436
 mp_alltoall_d11v                  2046 13.8    2.301    2.425    2.301    2.425
 qs_env_update_s_mstruct             11  6.9    0.000    0.000    2.218    2.277
 jit_kernel_multiply                 10 15.3    1.880    2.180    1.880    2.180
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    2.120    2.178
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    2.033    2.035
 qs_create_task_list                 11  7.9    0.001    0.002    1.903    1.952
 generate_qs_task_list               11  8.9    0.731    0.784    1.903    1.950
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="210", plot="h2o_128_md", label="(8n/1r/12t)", y=94.654000, yerr=0.000000
PlotPoint: name="211", plot="h2o_128_md_mem", label="(8n/1r/12t)", y=1238.363636, yerr=66.870666
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/15/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1430460020736       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1958505086976       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1986244964352       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1992000282624       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2753956716544       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4454954827776       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5444944789504       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5492290093056       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6712799002624       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11613089636352       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15239146475520       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15239146475520       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19911124992000       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        94.228663E+12       0.0%      0.0%    100.0%
 flops max/rank                      1.103326E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6806316384       0.0%      0.0%    100.0%
 number of processed stacks              12044928       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     565.1
 marketing flops                   145.647559E+12
 -------------------------------------------------------------------------------
 # multiplications                           2527
 max memory usage/rank             632.860672E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                10431456
 MPI messages size (bytes):
  total size                         4.526012E+12
  min size                           0.000000E+00
  max size                           4.537280E+06
  average size                     433.881156E+03
 MPI breakdown and total messages size (bytes):
             size <=      128               65736                        0
       128 < size <=     8192                1232                 10092544
      8192 < size <=    32768             3605352              96403587072
     32768 < size <=   131072             1305088              74666999808
    131072 < size <=  4194304             5190114            3200148350408
   4194304 < size <= 16777216              263934            1154745839544
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4024                  57450.
 MP_Allreduce        11139                    794.
 MP_Sync                87
 MP_Alltoall          2242                1884503.
 MP_SendRecv         24510                  18752.
 MP_ISendRecv        24510                  18752.
 MP_Wait             42802
 MP_ISend            16140                 108019.
 MP_IRecv            16140                 108019.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.015    0.032  220.136  220.137
 qs_mol_dyn_low                       1  2.0    0.003    0.004  219.609  219.622
 qs_forces                           11  3.9    0.005    0.005  219.434  219.435
 qs_energies                         11  4.9    0.001    0.002  213.690  213.709
 scf_env_do_scf                      11  5.9    0.001    0.001  194.568  194.572
 scf_env_do_scf_inner_loop          118  6.6    0.003    0.008  172.235  172.237
 velocity_verlet                     10  3.0    0.001    0.002  130.101  130.103
 qs_scf_new_mos                     118  7.6    0.001    0.001  129.810  129.995
 qs_scf_loop_do_ot                  118  8.6    0.001    0.001  129.810  129.994
 dbcsr_multiply_generic            2527 12.6    0.180    0.186  127.374  128.101
 ot_scf_mini                        118  9.6    0.003    0.003  123.131  123.338
 multiply_cannon                   2527 13.6    0.241    0.248  103.054  104.532
 multiply_cannon_loop              2527 14.6    2.111    2.159  100.632  101.802
 ot_mini                            118 10.6    0.001    0.001   67.469   67.677
 multiply_cannon_multrec          60648 15.6   33.404   35.117   42.241   43.561
 qs_ot_get_derivative               118 11.6    0.001    0.001   42.605   42.787
 rebuild_ks_matrix                  129  8.3    0.001    0.001   35.011   35.334
 qs_ks_build_kohn_sham_matrix       129  9.3    0.015    0.016   35.010   35.333
 qs_ot_get_p                        129 10.4    0.001    0.001   33.468   33.701
 mp_waitall_1                    269254 16.5   29.743   33.497   29.743   33.497
 qs_ks_update_qs_env                129  7.6    0.001    0.001   31.469   31.715
 multiply_cannon_sync_h2d         60648 15.6   27.510   29.137   27.510   29.137
 qs_ot_p2m_diag                      83 11.4    0.079    0.092   26.438   26.502
 apply_preconditioner_dbcsr         129 12.6    0.000    0.001   24.515   25.281
 apply_single                       129 13.6    0.001    0.001   24.515   25.281
 ot_diis_step                       118 11.6    0.008    0.008   24.639   24.640
 cp_dbcsr_syevd                      83 12.4    0.005    0.005   23.887   23.889
 init_scf_loop                       11  6.9    0.000    0.001   22.248   22.250
 cp_fm_diag_elpa                     83 13.4    0.000    0.001   20.462   20.490
 cp_fm_diag_elpa_base                83 14.4   20.358   20.397   20.457   20.486
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002   19.949   20.125
 multiply_cannon_metrocomm3       60648 15.6    0.113    0.117   15.929   18.185
 prepare_preconditioner              11  7.9    0.000    0.000   17.532   17.579
 make_preconditioner                 11  8.9    0.000    0.000   17.532   17.579
 make_full_inverse_cholesky          11  9.9    0.000    0.000   16.760   16.970
 sum_up_and_integrate               129 10.3    0.090    0.110   15.350   15.366
 integrate_v_rspace                 129 11.3    0.004    0.005   15.260   15.278
 qs_rho_update_rho_low              129  7.7    0.001    0.001   15.113   15.262
 calculate_rho_elec                 129  8.7    0.046    0.065   15.113   15.262
 make_m2s                          5054 13.6    0.105    0.113   14.673   15.052
 make_images                       5054 14.6    0.409    0.429   14.495   14.885
 init_scf_run                        11  5.9    0.000    0.001   14.809   14.810
 scf_env_initial_rho_setup           11  6.9    0.000    0.001   14.809   14.809
 density_rs2pw                      129  9.7    0.006    0.007    8.235   11.568
 rs_pw_transfer                    1054 12.0    0.017    0.020    7.137   10.554
 cp_fm_cholesky_invert               11 10.9   10.148   10.156   10.148   10.156
 wfi_extrapolate                     11  7.9    0.001    0.001    9.347    9.347
 mp_sum_l                          8010 12.9    8.371    9.270    8.371    9.270
 calculate_dm_sparse                129  9.5    0.001    0.001    8.958    9.066
 dbcsr_mm_accdrv_process         125468 16.2    3.184    3.316    8.398    8.991
 pw_transfer                       1559 11.6    0.076    0.097    8.527    8.799
 qs_ot_get_derivative_taylor         41 13.0    0.001    0.001    8.611    8.714
 fft_wrap_pw1pw2                   1301 12.7    0.011    0.013    8.322    8.597
 make_images_data                  5054 15.6    0.067    0.072    7.161    8.216
 qs_ot_get_orbitals                 118 10.6    0.001    0.001    8.081    8.155
 multiply_cannon_metrocomm1       60648 15.6    0.090    0.093    6.289    8.059
 grid_integrate_task_list           129 12.3    7.123    7.720    7.123    7.720
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.003    7.674    7.688
 hybrid_alltoall_any               5240 16.5    0.294    2.292    6.216    7.567
 fft_wrap_pw1pw2_140                527 13.2    1.277    1.334    6.920    7.192
 fft3d_ps                          1301 14.7    2.157    2.793    6.073    6.470
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    6.275    6.364
 mp_alltoall_d11v                  2423 14.1    4.600    6.168    4.600    6.168
 mp_waitany                       16140 13.9    2.712    6.080    2.712    6.080
 grid_collocate_task_list           129  9.7    4.716    6.008    4.716    6.008
 potential_pw2rs                    129 12.3    0.009    0.011    5.533    5.650
 rs_pw_transfer_RS2PW_140           140 11.5    0.284    0.310    2.172    5.542
 calculate_first_density_matrix       1  7.0    0.000    0.000    5.269    5.279
 cp_fm_cholesky_decompose            22 10.9    5.176    5.189    5.176    5.189
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="400", plot="h2o_256_md", label="(8n/12r/1t)", y=220.137000, yerr=0.000000
PlotPoint: name="401", plot="h2o_256_md_mem", label="(8n/12r/1t)", y=598.181818, yerr=6.833317
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/16/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420239992832       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528891191296       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514751E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.183246E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755938624       0.0%      0.0%    100.0%
 number of processed stacks               5975232       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1130.7
 marketing flops                   144.580175E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank             833.294336E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 2406720
 MPI messages size (bytes):
  total size                         4.100942E+12
  min size                           0.000000E+00
  max size                          17.653760E+06
  average size                       1.703955E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               14916                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768               70860               2317615104
     32768 < size <=   131072              722992              55511613440
    131072 < size <=  4194304             1375664            1398181724160
   4194304 < size <= 16777216              154704            1463834332048
  16777216 < size                           67584            1181116006400
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             3992                  58357.
 MP_Allreduce        11058                    960.
 MP_Sync                87
 MP_Alltoall          1969                6106579.
 MP_SendRecv         12032                  47072.
 MP_ISendRecv        12032                  47072.
 MP_Wait             25916
 MP_ISend            11748                 212467.
 MP_IRecv            11748                 212467.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.030    0.075  199.404  199.405
 qs_mol_dyn_low                       1  2.0    0.003    0.004  198.790  198.823
 qs_forces                           11  3.9    0.053    0.207  198.258  198.260
 qs_energies                         11  4.9    0.006    0.028  191.074  191.086
 scf_env_do_scf                      11  5.9    0.001    0.003  174.274  174.285
 scf_env_do_scf_inner_loop          117  6.6    0.006    0.020  139.513  139.516
 velocity_verlet                     10  3.0    0.001    0.002  124.706  124.717
 dbcsr_multiply_generic            2507 12.6    0.188    0.193   98.167   99.409
 qs_scf_new_mos                     117  7.6    0.001    0.001   98.237   98.629
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   98.236   98.629
 ot_scf_mini                        117  9.6    0.004    0.005   93.409   93.861
 multiply_cannon                   2507 13.6    0.476    0.523   77.484   81.990
 multiply_cannon_loop              2507 14.6    1.240    1.284   74.160   76.570
 ot_mini                            117 10.6    0.001    0.001   50.485   50.893
 mp_waitall_1                    214728 16.6   24.986   38.908   24.986   38.908
 multiply_cannon_multrec          30084 15.6   22.116   26.979   31.733   37.114
 init_scf_loop                       11  6.9    0.002    0.005   34.659   34.661
 rebuild_ks_matrix                  128  8.3    0.001    0.001   34.025   34.487
 qs_ks_build_kohn_sham_matrix       128  9.3    0.017    0.020   34.025   34.486
 qs_ks_update_qs_env                128  7.6    0.001    0.001   30.537   30.963
 prepare_preconditioner              11  7.9    0.000    0.001   30.176   30.256
 make_preconditioner                 11  8.9    0.000    0.001   30.176   30.256
 make_full_inverse_cholesky          11  9.9    0.000    0.000   28.828   29.448
 qs_ot_get_derivative               117 11.6    0.001    0.002   28.510   28.954
 multiply_cannon_metrocomm3       30084 15.6    0.091    0.097   15.556   28.183
 qs_ot_get_p                        128 10.4    0.001    0.001   24.553   25.051
 apply_preconditioner_dbcsr         128 12.6    0.000    0.000   22.049   23.253
 apply_single                       128 13.6    0.001    0.001   22.048   23.253
 multiply_cannon_sync_h2d         30084 15.6   19.352   22.413   19.352   22.413
 ot_diis_step                       117 11.6    0.014    0.015   21.802   21.805
 qs_ot_p2m_diag                      83 11.4    0.190    0.224   19.561   19.603
 cp_dbcsr_syevd                      83 12.4    0.006    0.006   18.340   18.344
 cp_fm_cholesky_invert               11 10.9   17.780   17.793   17.780   17.793
 make_m2s                          5014 13.6    0.091    0.097   14.845   16.390
 make_images                       5014 14.6    1.164    1.351   14.633   16.180
 sum_up_and_integrate               128 10.3    0.116    0.131   15.278   15.308
 integrate_v_rspace                 128 11.3    0.004    0.004   15.162   15.194
 qs_rho_update_rho_low              128  7.7    0.001    0.001   14.838   14.869
 calculate_rho_elec                 128  8.7    0.089    0.106   14.837   14.869
 cp_fm_diag_elpa                     83 13.4    0.000    0.001   14.762   14.798
 cp_fm_diag_elpa_base                83 14.4   14.472   14.607   14.756   14.786
 init_scf_run                        11  5.9    0.000    0.001   11.857   11.859
 scf_env_initial_rho_setup           11  6.9    0.000    0.001   11.857   11.859
 qs_ot_get_derivative_diag           77 12.4    0.002    0.003   11.300   11.628
 make_images_data                  5014 15.6    0.064    0.073    8.906   10.696
 multiply_cannon_metrocomm4       27577 15.6    0.104    0.117    3.754   10.615
 density_rs2pw                      128  9.7    0.006    0.007    8.064   10.500
 mp_irecv_dv                      69486 16.3    3.554   10.227    3.554   10.227
 hybrid_alltoall_any               5200 16.5    0.345    1.535    7.610   10.191
 dbcsr_mm_accdrv_process          62242 16.2    4.628    5.308    9.073    9.671
 pw_transfer                       1547 11.6    0.087    0.099    9.058    9.116
 rs_pw_transfer                    1046 11.9    0.015    0.017    6.657    9.075
 fft_wrap_pw1pw2                   1291 12.7    0.011    0.011    8.833    8.897
 wfi_extrapolate                     11  7.9    0.001    0.001    8.665    8.665
 grid_integrate_task_list           128 12.3    7.187    7.792    7.187    7.792
 fft_wrap_pw1pw2_140                523 13.2    1.337    1.359    7.618    7.703
 cp_fm_cholesky_decompose            22 10.9    7.447    7.523    7.447    7.523
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    6.325    7.081
 calculate_dm_sparse                128  9.5    0.001    0.001    6.518    6.661
 fft3d_ps                          1291 14.7    2.810    2.977    6.137    6.203
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    6.194    6.202
 mp_sum_l                          7950 12.9    4.084    5.924    4.084    5.924
 grid_collocate_task_list           128  9.7    4.784    5.918    4.784    5.918
 potential_pw2rs                    128 12.3    0.015    0.017    5.517    5.552
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    5.414    5.477
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    5.313    5.437
 mp_alltoall_d11v                  2415 14.1    4.533    5.275    4.533    5.275
 mp_allgather_i34                  2507 14.6    1.888    5.078    1.888    5.078
 mp_waitany                       11748 13.9    2.554    5.006    2.554    5.006
 rs_pw_transfer_RS2PW_140           139 11.5    0.355    0.377    2.120    4.527
 dbcsr_complete_redistribute        395 12.7    0.773    0.855    3.311    4.196
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="402", plot="h2o_256_md", label="(8n/6r/2t)", y=199.405000, yerr=0.000000
PlotPoint: name="403", plot="h2o_256_md_mem", label="(8n/6r/2t)", y=795.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/17/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420243808256       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528908111872       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514772E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.928533E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755942624       0.0%      0.0%    100.0%
 number of processed stacks               3984192       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1695.7
 marketing flops                   144.579337E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank             950.857728E+06
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                 1042912
 MPI messages size (bytes):
  total size                         2.716210E+12
  min size                           0.000000E+00
  max size                          26.214400E+06
  average size                       2.604448E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                6424                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 264                  8650752
     32768 < size <=   131072              281856              36943429632
    131072 < size <=  4194304              660064             996105256960
   4194304 < size <= 16777216               65632             931531401248
  16777216 < size                           28672             751619276800
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4020                  57952.
 MP_Allreduce        11127                    998.
 MP_Sync                87
 MP_Alltoall          1712                9388896.
 MP_SendRecv          7936                  75008.
 MP_ISendRecv         7936                  75008.
 MP_Wait             21820
 MP_ISend            11748                 275205.
 MP_IRecv            11748                 275205.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.017    0.036  187.031  187.032
 qs_mol_dyn_low                       1  2.0    0.003    0.004  186.292  186.565
 qs_forces                           11  3.9    0.004    0.004  185.731  185.736
 qs_energies                         11  4.9    0.002    0.005  178.968  178.980
 scf_env_do_scf                      11  5.9    0.001    0.002  162.408  162.409
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.008  124.370  124.371
 velocity_verlet                     10  3.0    0.155    0.180  119.987  119.998
 qs_scf_new_mos                     117  7.6    0.001    0.001   84.773   85.106
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   84.772   85.105
 dbcsr_multiply_generic            2507 12.6    0.179    0.185   83.493   84.661
 ot_scf_mini                        117  9.6    0.004    0.008   80.555   80.923
 multiply_cannon                   2507 13.6    0.502    0.521   62.323   67.485
 multiply_cannon_loop              2507 14.6    0.852    0.883   58.995   61.955
 ot_mini                            117 10.6    0.001    0.001   43.855   44.210
 init_scf_loop                       11  6.9    0.001    0.004   37.935   37.935
 mp_waitall_1                    170520 16.6   26.529   36.237   26.529   36.237
 prepare_preconditioner              11  7.9    0.000    0.000   33.790   33.834
 make_preconditioner                 11  8.9    0.000    0.001   33.790   33.834
 make_full_inverse_cholesky          11  9.9    0.000    0.000   31.259   32.772
 rebuild_ks_matrix                  128  8.3    0.001    0.001   31.620   32.100
 qs_ks_build_kohn_sham_matrix       128  9.3    0.016    0.019   31.620   32.099
 qs_ks_update_qs_env                128  7.6    0.001    0.001   28.448   28.893
 multiply_cannon_metrocomm3       20056 15.6    0.057    0.061   15.590   25.263
 multiply_cannon_multrec          20056 15.6   13.453   16.459   22.168   25.101
 qs_ot_get_derivative               117 11.6    0.001    0.002   23.979   24.333
 qs_ot_get_p                        128 10.4    0.001    0.001   21.751   22.170
 apply_preconditioner_dbcsr         128 12.6    0.000    0.000   19.923   20.833
 apply_single                       128 13.6    0.001    0.001   19.922   20.833
 ot_diis_step                       117 11.6    0.018    0.020   19.739   19.740
 qs_ot_p2m_diag                      83 11.4    0.265    0.273   17.259   17.289
 make_m2s                          5014 13.6    0.081    0.085   15.662   16.681
 make_images                       5014 14.6    1.181    1.293   15.429   16.448
 cp_fm_cholesky_invert               11 10.9   16.231   16.244   16.231   16.244
 cp_dbcsr_syevd                      83 12.4    0.005    0.006   16.137   16.139
 multiply_cannon_sync_h2d         20056 15.6   14.137   15.512   14.137   15.512
 sum_up_and_integrate               128 10.3    0.134    0.145   15.241   15.269
 integrate_v_rspace                 128 11.3    0.004    0.004   15.107   15.133
 qs_rho_update_rho_low              128  7.7    0.001    0.001   15.079   15.117
 calculate_rho_elec                 128  8.7    0.132    0.148   15.078   15.117
 cp_fm_diag_elpa                     83 13.4    0.000    0.001   12.576   12.595
 cp_fm_diag_elpa_base                83 14.4   12.125   12.299   12.571   12.590
 init_scf_run                        11  5.9    0.000    0.001   11.471   11.471
 scf_env_initial_rho_setup           11  6.9    0.000    0.001   11.471   11.471
 make_images_data                  5014 15.6    0.059    0.067    9.780   11.131
 hybrid_alltoall_any               5200 16.5    0.437    1.997    8.382   10.427
 density_rs2pw                      128  9.7    0.006    0.007    7.827    9.868
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002    9.536    9.792
 multiply_cannon_metrocomm4       17549 15.6    0.062    0.074    3.462    9.302
 pw_transfer                       1547 11.6    0.087    0.105    9.158    9.269
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.011    8.933    9.048
 mp_irecv_dv                      50230 16.2    3.342    9.045    3.342    9.045
 dbcsr_mm_accdrv_process          41502 16.2    4.983    5.301    8.187    8.299
 cp_fm_cholesky_decompose            22 10.9    8.225    8.253    8.225    8.253
 rs_pw_transfer                    1046 11.9    0.014    0.015    6.085    8.070
 fft_wrap_pw1pw2_140                523 13.2    1.413    1.441    7.746    7.874
 wfi_extrapolate                     11  7.9    0.001    0.001    7.793    7.793
 grid_integrate_task_list           128 12.3    7.257    7.767    7.257    7.767
 cp_fm_upper_to_full                105 14.8    5.795    7.481    5.795    7.481
 dbcsr_complete_redistribute        395 12.7    1.177    1.212    4.928    6.733
 fft3d_ps                          1291 14.7    2.738    2.971    6.054    6.136
 calculate_dm_sparse                128  9.5    0.001    0.001    5.878    5.991
 grid_collocate_task_list           128  9.7    4.997    5.818    4.997    5.818
 mp_alltoall_d11v                  2415 14.1    4.644    5.757    4.644    5.757
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    5.581    5.588
 copy_fm_to_dbcsr                   209 11.7    0.002    0.002    3.698    5.500
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    4.733    5.410
 potential_pw2rs                    128 12.3    0.021    0.023    5.275    5.303
 mp_allgather_i34                  2507 14.6    1.842    5.183    1.842    5.183
 mp_sum_l                          7950 12.9    3.699    5.164    3.699    5.164
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    4.613    4.740
 mp_waitany                       11748 13.9    2.416    4.537    2.416    4.537
 transfer_fm_to_dbcsr                11  9.9    0.019    0.027    2.511    4.277
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    4.183    4.213
 mp_alltoall_i22                    716 14.1    2.137    4.122    2.137    4.122
 rs_pw_transfer_RS2PW_140           139 11.5    0.336    0.357    1.946    3.962
 qs_energies_init_hamiltonians       11  5.9    0.001    0.002    3.777    3.779
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="404", plot="h2o_256_md", label="(8n/4r/3t)", y=187.032000, yerr=0.000000
PlotPoint: name="405", plot="h2o_256_md_mem", label="(8n/4r/3t)", y=904.000000, yerr=9.486833
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/18/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420242647040       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528903135232       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514766E+12       0.0%      0.0%    100.0%
 flops max/rank                      4.353791E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755941440       0.0%      0.0%    100.0%
 number of processed stacks               5977344       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    1130.3
 marketing flops                   144.580175E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank               1.148412E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                 1143192
 MPI messages size (bytes):
  total size                         2.023815E+12
  min size                           0.000000E+00
  max size                          17.653760E+06
  average size                       1.770320E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                6996                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 396                  8650752
     32768 < size <=   131072              319024              36042702848
    131072 < size <=  4194304              715736             785529176064
   4194304 < size <= 16777216               70320             665379475120
  16777216 < size                           30720             536870912000
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4002                  58205.
 MP_Allreduce        11082                   1082.
 MP_Sync                87
 MP_Alltoall          1712               12503107.
 MP_SendRecv          5888                  75008.
 MP_ISendRecv         5888                  75008.
 MP_Wait             22442
 MP_ISend            14952                 244818.
 MP_IRecv            14952                 244818.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.015    0.031  196.473  196.474
 qs_mol_dyn_low                       1  2.0    0.003    0.004  196.013  196.039
 qs_forces                           11  3.9    0.004    0.004  195.854  195.858
 qs_energies                         11  4.9    0.070    0.078  188.572  188.583
 scf_env_do_scf                      11  5.9    0.001    0.001  169.361  169.368
 velocity_verlet                     10  3.0    0.001    0.002  128.594  128.599
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.008  121.228  121.229
 qs_scf_new_mos                     117  7.6    0.001    0.001   82.786   83.048
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   82.785   83.047
 dbcsr_multiply_generic            2507 12.6    0.185    0.190   80.993   81.732
 ot_scf_mini                        117  9.6    0.003    0.003   78.276   78.571
 multiply_cannon                   2507 13.6    0.556    0.596   56.059   59.272
 multiply_cannon_loop              2507 14.6    1.180    1.207   52.131   53.851
 init_scf_loop                       11  6.9    0.000    0.000   48.009   48.010
 prepare_preconditioner              11  7.9    0.000    0.000   43.869   43.907
 make_preconditioner                 11  8.9    0.000    0.000   43.869   43.907
 ot_mini                            117 10.6    0.001    0.001   43.025   43.313
 make_full_inverse_cholesky          11  9.9    0.011    0.023   37.563   42.595
 multiply_cannon_multrec          30084 15.6   14.259   19.494   26.696   31.577
 rebuild_ks_matrix                  128  8.3    0.001    0.001   30.415   30.701
 qs_ks_build_kohn_sham_matrix       128  9.3    0.017    0.020   30.415   30.700
 qs_ks_update_qs_env                128  7.6    0.001    0.001   27.378   27.636
 mp_waitall_1                    147882 16.7   17.646   27.322   17.646   27.322
 qs_ot_get_derivative               117 11.6    0.002    0.002   23.281   23.579
 make_m2s                          5014 13.6    0.096    0.100   20.561   21.868
 qs_ot_get_p                        128 10.4    0.001    0.001   21.422   21.735
 make_images                       5014 14.6    1.942    2.249   20.255   21.563
 apply_preconditioner_dbcsr         128 12.6    0.000    0.001   19.119   19.690
 apply_single                       128 13.6    0.001    0.001   19.119   19.690
 ot_diis_step                       117 11.6    0.018    0.018   19.613   19.614
 qs_ot_p2m_diag                      83 11.4    0.343    0.390   17.226   17.278
 cp_fm_cholesky_invert               11 10.9   17.231   17.240   17.231   17.240
 cp_fm_upper_to_full                105 14.8   11.548   17.005   11.548   17.005
 cp_dbcsr_syevd                      83 12.4    0.005    0.006   15.897   15.899
 sum_up_and_integrate               128 10.3    0.140    0.152   15.238   15.265
 qs_rho_update_rho_low              128  7.7    0.001    0.001   15.184   15.214
 calculate_rho_elec                 128  8.7    0.175    0.192   15.183   15.214
 integrate_v_rspace                 128 11.3    0.004    0.004   15.098   15.129
 multiply_cannon_metrocomm3       30084 15.6    0.048    0.050    6.369   14.893
 init_scf_run                        11  5.9    0.000    0.001   12.826   12.827
 scf_env_initial_rho_setup           11  6.9    0.000    0.001   12.825   12.827
 dbcsr_complete_redistribute        395 12.7    1.522    1.645    9.039   12.791
 make_images_data                  5014 15.6    0.061    0.066   11.041   12.621
 cp_fm_diag_elpa                     83 13.4    0.000    0.000   12.586   12.601
 cp_fm_diag_elpa_base                83 14.4   11.583   11.915   12.578   12.591
 multiply_cannon_sync_h2d         30084 15.6   11.647   12.550   11.647   12.550
 dbcsr_mm_accdrv_process          62264 16.2    7.686    8.741   12.006   12.520
 hybrid_alltoall_any               5200 16.5    0.527    2.215    9.922   11.987
 copy_fm_to_dbcsr                   209 11.7    0.001    0.002    7.609   11.391
 transfer_fm_to_dbcsr                11  9.9    0.001    0.006    6.283   10.002
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002    9.625    9.848
 pw_transfer                       1547 11.6    0.087    0.102    9.385    9.476
 mp_alltoall_i22                    716 14.1    5.529    9.269    5.529    9.269
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.011    9.158    9.256
 density_rs2pw                      128  9.7    0.006    0.006    7.469    8.806
 cp_fm_cholesky_decompose            22 10.9    8.131    8.219    8.131    8.219
 fft_wrap_pw1pw2_140                523 13.2    1.558    1.591    8.022    8.126
 grid_integrate_task_list           128 12.3    7.460    8.002    7.460    8.002
 wfi_extrapolate                     11  7.9    0.001    0.001    7.875    7.875
 multiply_cannon_metrocomm4       25070 15.6    0.080    0.090    2.766    6.992
 calculate_dm_sparse                128  9.5    0.001    0.001    6.822    6.886
 rs_pw_transfer                    1046 11.9    0.013    0.014    5.351    6.744
 mp_irecv_dv                      76098 16.2    2.618    6.725    2.618    6.725
 mp_alltoall_d11v                  2415 14.1    5.126    6.241    5.126    6.241
 fft3d_ps                          1291 14.7    2.842    2.935    6.085    6.160
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    5.956    6.012
 grid_collocate_task_list           128  9.7    5.153    5.868    5.153    5.868
 potential_pw2rs                    128 12.3    0.024    0.024    4.960    4.981
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    4.826    4.895
 calculate_first_density_matrix       1  7.0    0.000    0.000    4.735    4.738
 qs_ot_get_derivative_taylor         40 13.0    0.001    0.001    4.581    4.667
 qs_energies_init_hamiltonians       11  5.9    0.028    0.031    4.495    4.496
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    4.285    4.342
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="406", plot="h2o_256_md", label="(8n/3r/4t)", y=196.474000, yerr=0.000000
PlotPoint: name="407", plot="h2o_256_md_mem", label="(8n/3r/4t)", y=1079.090909, yerr=19.888117
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/19/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1420242647040       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1943472701440       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1972057190400       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1977770336256       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2734287699968       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4416300122112       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5397700149248       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5443971710976       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6653743202304       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11528903135232       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15129160814592       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15129160814592       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19767995056128       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        93.514766E+12       0.0%      0.0%    100.0%
 flops max/rank                      5.865089E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6755941440       0.0%      0.0%    100.0%
 number of processed stacks               1960712       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    3445.7
 marketing flops                   144.579337E+12
 -------------------------------------------------------------------------------
 # multiplications                           2507
 max memory usage/rank               1.531679E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                  240672
 MPI messages size (bytes):
  total size                         1.331455E+12
  min size                           0.000000E+00
  max size                          52.428800E+06
  average size                       5.532238E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                1452                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                 132                  8650752
    131072 < size <=  4194304              113904              59718500352
   4194304 < size <= 16777216              104976             550376570880
  16777216 < size                           20208             721350232272
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast               14                     12.
 MP_Allreduce         8931                     51.
 MP_Alltoall          9654                 799394.
 MP_ISend            40068                2102573.
 MP_IRecv            40068                2101676.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4002                  58203.
 MP_Allreduce        11082                   1166.
 MP_Sync                87
 MP_Alltoall          1712               18838222.
 MP_SendRecv          3840                 122880.
 MP_ISendRecv         3840                 122880.
 MP_Wait             16122
 MP_ISend            10680                 423556.
 MP_IRecv            10680                 423556.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.021    0.033  174.049  174.050
 qs_mol_dyn_low                       1  2.0    0.003    0.003  173.610  173.623
 qs_forces                           11  3.9    0.004    0.004  173.468  173.473
 qs_energies                         11  4.9    0.004    0.005  165.863  165.869
 scf_env_do_scf                      11  5.9    0.001    0.001  148.054  148.059
 velocity_verlet                     10  3.0    0.002    0.002  113.951  113.957
 scf_env_do_scf_inner_loop          117  6.6    0.003    0.008  112.113  112.114
 dbcsr_multiply_generic            2507 12.6    0.179    0.191   73.805   74.266
 qs_scf_new_mos                     117  7.6    0.001    0.001   73.852   73.972
 qs_scf_loop_do_ot                  117  8.6    0.001    0.001   73.851   73.972
 ot_scf_mini                        117  9.6    0.003    0.004   69.450   69.505
 multiply_cannon                   2507 13.6    0.587    0.612   54.382   58.533
 multiply_cannon_loop              2507 14.6    0.447    0.457   49.689   50.322
 ot_mini                            117 10.6    0.001    0.001   39.474   39.525
 init_scf_loop                       11  6.9    0.000    0.000   35.793   35.795
 mp_waitall_1                    125778 16.7   25.221   32.229   25.221   32.229
 prepare_preconditioner              11  7.9    0.000    0.000   31.892   31.912
 make_preconditioner                 11  8.9    0.000    0.000   31.892   31.912
 make_full_inverse_cholesky          11  9.9    0.018    0.028   29.801   30.072
 rebuild_ks_matrix                  128  8.3    0.001    0.001   29.632   29.673
 qs_ks_build_kohn_sham_matrix       128  9.3    0.017    0.018   29.632   29.672
 qs_ks_update_qs_env                128  7.6    0.001    0.001   26.832   26.869
 multiply_cannon_multrec          10028 15.6   10.469   15.589   17.972   21.536
 apply_preconditioner_dbcsr         128 12.6    0.000    0.000   19.536   19.900
 apply_single                       128 13.6    0.001    0.001   19.536   19.900
 ot_diis_step                       117 11.6    0.020    0.021   19.899   19.899
 qs_ot_get_derivative               117 11.6    0.002    0.002   19.506   19.562
 multiply_cannon_metrocomm3       10028 15.6    0.023    0.025   12.146   19.004
 make_m2s                          5014 13.6    0.066    0.070   15.865   18.325
 cp_fm_cholesky_invert               11 10.9   18.244   18.250   18.244   18.250
 make_images                       5014 14.6    2.321    2.792   15.558   18.016
 qs_ot_get_p                        128 10.4    0.001    0.001   17.163   17.231
 qs_rho_update_rho_low              128  7.7    0.001    0.001   15.294   15.340
 calculate_rho_elec                 128  8.7    0.258    0.269   15.294   15.339
 sum_up_and_integrate               128 10.3    0.179    0.189   14.931   14.971
 integrate_v_rspace                 128 11.3    0.004    0.004   14.752   14.801
 qs_ot_p2m_diag                      83 11.4    0.496    0.501   13.549   13.562
 cp_dbcsr_syevd                      83 12.4    0.005    0.006   12.408   12.410
 multiply_cannon_sync_h2d         10028 15.6   11.594   12.355   11.594   12.355
 make_images_data                  5014 15.6    0.051    0.060    9.508   11.970
 hybrid_alltoall_any               5200 16.5    0.795    3.630    9.381   11.844
 init_scf_run                        11  5.9    0.000    0.001   10.666   10.666
 scf_env_initial_rho_setup           11  6.9    0.000    0.001   10.666   10.666
 cp_fm_diag_elpa                     83 13.4    0.000    0.000    9.381    9.393
 cp_fm_diag_elpa_base                83 14.4    9.143    9.223    9.378    9.390
 pw_transfer                       1547 11.6    0.086    0.095    9.348    9.370
 fft_wrap_pw1pw2                   1291 12.7    0.010    0.011    9.124    9.152
 grid_integrate_task_list           128 12.3    7.766    8.174    7.766    8.174
 fft_wrap_pw1pw2_140                523 13.2    1.917    1.958    8.086    8.115
 cp_fm_cholesky_decompose            22 10.9    7.954    8.088    7.954    8.088
 qs_ot_get_derivative_diag           77 12.4    0.002    0.002    7.832    7.873
 dbcsr_mm_accdrv_process          20762 16.1    2.744    3.492    7.129    7.790
 density_rs2pw                      128  9.7    0.006    0.006    6.778    7.685
 wfi_extrapolate                     11  7.9    0.001    0.001    7.503    7.503
 multiply_cannon_metrocomm1       10028 15.6    0.029    0.029    4.315    7.276
 mp_allgather_i34                  2507 14.6    2.879    7.084    2.879    7.084
 calculate_dm_sparse                128  9.5    0.001    0.001    6.117    6.212
 grid_collocate_task_list           128  9.7    5.468    6.122    5.468    6.122
 fft3d_ps                          1291 14.7    2.768    2.846    5.596    5.627
 qs_energies_init_hamiltonians       11  5.9    0.208    0.272    5.553    5.555
 mp_alltoall_d11v                  2415 14.1    4.986    5.551    4.986    5.551
 dbcsr_complete_redistribute        395 12.7    2.116    2.191    5.118    5.541
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    5.315    5.330
 multiply_cannon_metrocomm4        7521 15.6    0.023    0.027    1.873    5.086
 rs_pw_transfer                    1046 11.9    0.013    0.013    4.101    5.022
 mp_irecv_dv                      28860 15.9    1.837    4.995    1.837    4.995
 potential_pw2rs                    128 12.3    0.027    0.027    4.371    4.383
 cp_dbcsr_sm_fm_multiply_core        37 10.5    0.000    0.000    4.258    4.290
 build_core_hamiltonian_matrix_      11  4.9    0.001    0.001    3.573    3.857
 qs_ot_get_orbitals                 117 10.6    0.001    0.001    3.721    3.765
 qs_ks_update_qs_env_forces          11  4.9    0.000    0.000    3.698    3.701
 copy_fm_to_dbcsr                   209 11.7    0.001    0.002    3.332    3.640
 copy_dbcsr_to_fm                   186 11.8    0.004    0.004    3.541    3.611
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="408", plot="h2o_256_md", label="(8n/2r/6t)", y=174.050000, yerr=0.000000
PlotPoint: name="409", plot="h2o_256_md_mem", label="(8n/2r/6t)", y=1434.818182, yerr=54.139245
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/20/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops     9 x     9 x    32        1430454546432       0.0%      0.0%    100.0%
 flops    32 x    32 x    32        1975684956160       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        1986255912960       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        1992006770688       0.0%      0.0%    100.0%
 flops    22 x    22 x    32        2753958699008       0.0%      0.0%    100.0%
 flops    32 x    32 x     9        4454954827776       0.0%      0.0%    100.0%
 flops    32 x    32 x    22        5444944789504       0.0%      0.0%    100.0%
 flops     9 x    32 x    32        5492290093056       0.0%      0.0%    100.0%
 flops    22 x    32 x    32        6712799002624       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       11613065416704       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       15239182565376       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       15239182565376       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       19911132921856       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                        94.245913E+12       0.0%      0.0%    100.0%
 flops max/rank                     11.787674E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         6806580192       0.0%      0.0%    100.0%
 number of processed stacks               1982496       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0    3433.3
 marketing flops                   145.663816E+12
 -------------------------------------------------------------------------------
 # multiplications                           2535
 max memory usage/rank               3.164959E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                  101400
 MPI messages size (bytes):
  total size                         1.145171E+12
  min size                           0.000000E+00
  max size                         104.857600E+06
  average size                      11.293599E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 572                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                  44                  2883584
    131072 < size <=  4194304               45888              35634806784
   4194304 < size <= 16777216               44720             382939955200
  16777216 < size                           10176             726592540656
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4057                  58363.
 MP_Allreduce        11219                   1499.
 MP_Sync                88
 MP_Alltoall          1724               36993632.
 MP_SendRecv          1806                 218624.
 MP_ISendRecv         1806                 218624.
 MP_Wait              9876
 MP_ISend             6456                1080169.
 MP_IRecv             6456                1080169.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.023    0.042  308.514  308.515
 qs_mol_dyn_low                       1  2.0    0.003    0.004  307.928  307.954
 qs_forces                           11  3.9    0.005    0.009  307.365  307.368
 qs_energies                         11  4.9    0.002    0.002  297.954  297.964
 scf_env_do_scf                      11  5.9    0.001    0.001  274.734  274.750
 velocity_verlet                     10  3.0    0.002    0.003  221.215  221.282
 scf_env_do_scf_inner_loop          118  6.6    0.003    0.008  147.380  147.383
 init_scf_loop                       11  6.9    0.000    0.000  127.098  127.102
 prepare_preconditioner              11  7.9    0.000    0.000  121.752  121.774
 make_preconditioner                 11  8.9    0.000    0.000  121.752  121.774
 make_full_inverse_cholesky          11  9.9    0.037    0.039   97.609  118.852
 qs_scf_new_mos                     118  7.6    0.001    0.001   94.982   95.101
 qs_scf_loop_do_ot                  118  8.6    0.001    0.001   94.981   95.100
 ot_scf_mini                        118  9.6    0.004    0.004   90.115   90.133
 dbcsr_multiply_generic            2535 12.6    0.212    0.222   85.643   86.168
 cp_fm_upper_to_full                106 14.8   53.510   76.683   53.510   76.683
 multiply_cannon                   2535 13.6    0.716    0.798   59.704   61.015
 multiply_cannon_loop              2535 14.6    0.476    0.484   55.558   56.816
 ot_mini                            118 10.6    0.001    0.001   46.130   46.148
 dbcsr_complete_redistribute        397 12.7    3.975    4.029   29.896   42.784
 rebuild_ks_matrix                  129  8.3    0.001    0.001   39.526   39.557
 qs_ks_build_kohn_sham_matrix       129  9.3    0.017    0.018   39.525   39.557
 copy_fm_to_dbcsr                   210 11.7    0.001    0.002   26.496   39.412
 transfer_fm_to_dbcsr                11  9.9    0.029    0.034   24.098   36.886
 qs_ks_update_qs_env                129  7.6    0.001    0.001   36.466   36.489
 mp_alltoall_i22                    720 14.1   21.871   34.904   21.871   34.904
 cp_fm_cholesky_invert               11 10.9   34.444   34.451   34.444   34.451
 mp_waitall_1                    104820 16.8   29.529   33.960   29.529   33.960
 qs_ot_get_p                        129 10.4    0.001    0.001   28.327   28.399
 qs_ot_get_derivative               118 11.6    0.002    0.002   25.114   25.133
 qs_ot_p2m_diag                      84 11.4    0.889    0.894   24.097   24.125
 make_m2s                          5070 13.6    0.076    0.078   21.337   22.661
 cp_dbcsr_syevd                      84 12.4    0.006    0.006   22.286   22.288
 make_images                       5070 14.6    3.782    3.915   20.854   22.182
 qs_rho_update_rho_low              129  7.7    0.001    0.001   21.314   21.331
 calculate_rho_elec                 129  8.7    0.483    0.484   21.313   21.330
 ot_diis_step                       118 11.6    0.022    0.024   20.981   20.982
 apply_preconditioner_dbcsr         129 12.6    0.000    0.000   20.207   20.432
 apply_single                       129 13.6    0.001    0.001   20.206   20.432
 sum_up_and_integrate               129 10.3    0.324    0.328   20.278   20.362
 multiply_cannon_metrocomm3       10140 15.6    0.023    0.026   18.844   20.078
 integrate_v_rspace                 129 11.3    0.004    0.004   19.953   20.038
 cp_fm_diag_elpa                     84 13.4    0.000    0.000   18.848   18.849
 cp_fm_diag_elpa_base                84 14.4   14.245   15.944   18.844   18.844
 multiply_cannon_multrec          10140 15.6   10.604   12.406   18.250   18.353
 multiply_cannon_sync_h2d         10140 15.6   15.789   15.813   15.789   15.813
 hybrid_alltoall_any               5257 16.5    1.314    3.081   11.744   14.116
 make_images_data                  5070 15.6    0.058    0.064   11.723   14.102
 init_scf_run                        11  5.9    0.000    0.001   12.668   12.668
 scf_env_initial_rho_setup           11  6.9    0.000    0.000   12.668   12.668
 pw_transfer                       1559 11.6    0.094    0.095   12.134   12.141
 fft_wrap_pw1pw2                   1301 12.7    0.011    0.011   11.896   11.903
 fft_wrap_pw1pw2_140                527 13.2    3.101    3.150   10.489   10.498
 qs_ot_get_derivative_diag           78 12.4    0.002    0.002    9.943    9.963
 mp_alltoall_d11v                  2429 14.1    8.877    9.847    8.877    9.847
 cp_fm_cholesky_decompose            22 10.9    9.603    9.629    9.603    9.629
 wfi_extrapolate                     11  7.9    0.001    0.001    9.297    9.298
 dbcsr_mm_accdrv_process          20958 16.1    3.883    5.820    7.405    9.295
 grid_integrate_task_list           129 12.3    8.659    8.863    8.659    8.863
 density_rs2pw                      129  9.7    0.005    0.005    8.548    8.699
 qs_energies_init_hamiltonians       11  5.9    0.001    0.001    8.454    8.455
 calculate_dm_sparse                129  9.5    0.001    0.001    6.907    7.024
 fft3d_ps                          1301 14.7    2.875    2.896    6.930    6.962
 grid_collocate_task_list           129  9.7    6.493    6.573    6.493    6.573
 rs_scatter_matrices                140  9.7    3.702    4.624    6.241    6.463
 cp_dbcsr_sm_fm_multiply             37  9.5    0.002    0.002    6.369    6.439
 copy_dbcsr_to_fm                   187 11.8    0.004    0.004    6.104    6.187
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="410", plot="h2o_256_md", label="(8n/1r/12t)", y=308.515000, yerr=0.000000
PlotPoint: name="411", plot="h2o_256_md_mem", label="(8n/1r/12t)", y=2881.909091, yerr=160.940675
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/21/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      2.766000E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                419739       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   22952.9
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               1.259848E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  458208
 MPI messages size (bytes):
  total size                         3.456111E+12
  min size                           0.000000E+00
  max size                          18.735064E+06
  average size                       7.542668E+06
 MPI breakdown and total messages size (bytes):
             size <=      128              112896                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                 224                  5687808
     32768 < size <=   131072               10528                813356544
    131072 < size <=  4194304               36422              76284728544
   4194304 < size <= 16777216              294266            3312457683808
  16777216 < size                            3872              66548597808
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 255646.
 MP_Allreduce         3139                   6114.
 MP_Sync                 4
 MP_Alltoall            54                9239733.
 MP_SendRecv           285                  19200.
 MP_ISendRecv          285                  19200.
 MP_Wait              1017
 MP_ISend              642                 197829.
 MP_IRecv              642                 197607.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.017    0.043   86.004   86.005
 qs_energies                          1  2.0    0.000    0.000   84.939   84.946
 ls_scf                               1  3.0    0.000    0.000   83.883   83.890
 dbcsr_multiply_generic             111  6.7    0.015    0.016   72.743   72.929
 multiply_cannon                    111  7.7    0.017    0.021   55.955   57.259
 multiply_cannon_loop               111  8.7    0.208    0.220   52.505   53.812
 ls_scf_main                          1  4.0    0.000    0.000   52.534   52.534
 density_matrix_trs4                  2  5.0    0.002    0.003   46.931   47.008
 ls_scf_init_scf                      1  4.0    0.000    0.000   28.307   28.308
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   27.169   27.222
 mp_waitall_1                     11031 10.9   22.581   26.158   22.581   26.158
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.004   25.054   25.072
 multiply_cannon_multrec           2664  9.7    8.216    8.850   15.604   17.212
 multiply_cannon_sync_h2d          2664  9.7   13.574   15.195   13.574   15.195
 make_m2s                           222  7.7    0.009    0.012   13.174   13.695
 make_images                        222  8.7    0.101    0.112   13.153   13.676
 multiply_cannon_metrocomm1        2664  9.7    0.009    0.010    9.764   13.179
 multiply_cannon_metrocomm3        2664  9.7    0.009    0.011    5.387    9.293
 make_images_data                   222  9.7    0.005    0.006    7.770    8.399
 hybrid_alltoall_any                227 10.6    0.219    1.847    6.556    8.076
 dbcsr_mm_accdrv_process           4760 10.4    0.510    0.641    7.007    7.980
 dbcsr_mm_accdrv_process_sort      4760 11.4    6.298    7.175    6.298    7.175
 calculate_norms                   4752  9.8    5.517    6.167    5.517    6.167
 apply_matrix_preconditioner          6  5.3    0.000    0.000    5.030    5.141
 mp_sum_l                           887  5.1    3.084    4.626    3.084    4.626
 multiply_cannon_metrocomm4        2442  9.7    0.011    0.014    2.041    3.895
 mp_irecv_dv                       6231 10.9    2.025    3.865    2.025    3.865
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    2.319    3.572
 make_images_sizes                  222  9.7    0.000    0.000    0.796    3.459
 mp_alltoall_i44                    222 10.7    0.796    3.458    0.796    3.458
 arnoldi_extremal                     4  6.8    0.000    0.000    3.235    3.264
 arnoldi_normal_ev                    4  7.8    0.002    0.012    3.235    3.264
 build_subspace                      16  8.4    0.009    0.012    3.131    3.133
 ls_scf_post                          1  4.0    0.000    0.000    3.042    3.049
 ls_scf_store_result                  1  5.0    0.000    0.000    2.861    2.905
 dbcsr_special_finalize             555  9.7    0.005    0.006    2.244    2.774
 dbcsr_merge_single_wm              555 10.7    0.452    0.575    2.236    2.765
 make_images_pack                   222  9.7    2.215    2.630    2.217    2.632
 dbcsr_matrix_vector_mult           304  9.0    0.006    0.013    2.318    2.577
 dbcsr_sort_data                    658 11.4    2.042    2.531    2.042    2.531
 dbcsr_matrix_vector_mult_local     304 10.0    2.067    2.476    2.069    2.478
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    2.371    2.453
 buffer_matrices_ensure_size        222  8.7    1.746    2.089    1.746    2.089
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    1.803    1.805
 rebuild_ks_matrix                    3  7.3    0.000    0.000    1.794    1.795
 qs_ks_build_kohn_sham_matrix         3  8.3    0.000    0.001    1.794    1.795
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="500", plot="h2o_32_nrep3_ls", label="(8n/12r/1t)", y=86.005000, yerr=0.000000
PlotPoint: name="501", plot="h2o_32_nrep3_ls_mem", label="(8n/12r/1t)", y=1139.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/22/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      5.588524E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                368848       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   26119.8
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               2.090025E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                  106560
 MPI messages size (bytes):
  total size                         2.699093E+12
  min size                           0.000000E+00
  max size                          72.286792E+06
  average size                      25.329324E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               23040                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                3264                325830144
    131072 < size <=  4194304                5280               3328561104
   4194304 < size <= 16777216               12709             156766962056
  16777216 < size                           62267            2538670978840
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 266673.
 MP_Allreduce         3138                  10075.
 MP_Sync                 4
 MP_Alltoall            47               15335933.
 MP_SendRecv           141                  57600.
 MP_ISendRecv          141                  57600.
 MP_Wait               687
 MP_ISend              462                 414589.
 MP_IRecv              462                 413870.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.033    0.054   91.957   91.958
 qs_energies                          1  2.0    0.000    0.000   91.335   91.339
 ls_scf                               1  3.0    0.000    0.000   89.935   89.938
 dbcsr_multiply_generic             111  6.7    0.015    0.016   75.722   76.026
 multiply_cannon                    111  7.7    0.030    0.044   53.632   57.130
 ls_scf_main                          1  4.0    0.000    0.000   55.206   55.212
 multiply_cannon_loop               111  8.7    0.116    0.123   50.320   53.137
 density_matrix_trs4                  2  5.0    0.002    0.003   49.424   49.631
 ls_scf_init_scf                      1  4.0    0.000    0.000   31.178   31.179
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   29.935   30.043
 mp_waitall_1                      9105 10.9   21.204   29.099   21.204   29.099
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   27.446   27.467
 multiply_cannon_multrec           1332  9.7   13.304   16.794   22.629   27.216
 multiply_cannon_metrocomm3        1332  9.7    0.006    0.007   11.930   20.122
 make_m2s                           222  7.7    0.008    0.009   15.284   15.881
 make_images                        222  8.7    1.585    1.932   15.253   15.850
 dbcsr_mm_accdrv_process           4041 10.4    0.286    0.447    8.917   10.516
 dbcsr_mm_accdrv_process_sort      4041 11.4    8.512   10.077    8.512   10.077
 make_images_data                   222  9.7    0.004    0.005    8.717    9.686
 hybrid_alltoall_any                227 10.6    0.522    2.428    8.152    9.054
 mp_sum_l                           887  5.1    5.514    8.098    5.514    8.098
 multiply_cannon_metrocomm4        1221  9.7    0.006    0.009    3.253    7.713
 mp_irecv_dv                       3311 11.0    3.233    7.658    3.233    7.658
 calculate_norms                   2376  9.8    5.989    6.765    5.989    6.765
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    4.204    6.508
 multiply_cannon_sync_h2d          1332  9.7    4.744    5.838    4.744    5.838
 apply_matrix_preconditioner          6  5.3    0.000    0.000    5.075    5.250
 arnoldi_extremal                     4  6.8    0.000    0.000    4.701    4.723
 arnoldi_normal_ev                    4  7.8    0.001    0.005    4.701    4.723
 build_subspace                      16  8.4    0.014    0.021    4.443    4.447
 ls_scf_post                          1  4.0    0.000    0.000    3.551    3.556
 dbcsr_matrix_vector_mult           304  9.0    0.010    0.022    3.199    3.428
 ls_scf_store_result                  1  5.0    0.000    0.000    3.251    3.375
 dbcsr_matrix_vector_mult_local     304 10.0    2.777    3.269    2.779    3.271
 multiply_cannon_metrocomm1        1332  9.7    0.003    0.004    1.189    2.843
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    2.636    2.713
 make_images_pack                   222  9.7    2.026    2.464    2.029    2.466
 mp_allgather_i34                   111  8.7    1.004    2.305    1.004    2.305
 dbcsr_sort_data                    436 11.2    1.803    2.073    1.803    2.073
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    1.889    1.891
 rebuild_ks_matrix                    3  7.3    0.000    0.000    1.876    1.878
 qs_ks_build_kohn_sham_matrix         3  8.3    0.001    0.006    1.876    1.878
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="502", plot="h2o_32_nrep3_ls", label="(8n/6r/2t)", y=91.958000, yerr=0.000000
PlotPoint: name="503", plot="h2o_32_nrep3_ls_mem", label="(8n/6r/2t)", y=1850.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/23/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                      8.404608E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                353133       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   27282.1
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               2.776252E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                   46176
 MPI messages size (bytes):
  total size                         1.924064E+12
  min size                           0.000000E+00
  max size                         108.059888E+06
  average size                      41.668048E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                9984                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                3328               1170063360
   4194304 < size <= 16777216                1870              19378539600
  16777216 < size                           30994            1903514987232
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 265448.
 MP_Allreduce         3138                  10896.
 MP_Sync                 4
 MP_Alltoall            47               23526250.
 MP_SendRecv            93                  57600.
 MP_ISendRecv           93                  57600.
 MP_Wait               639
 MP_ISend              462                 560046.
 MP_IRecv              462                 560662.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.034    0.050   94.254   94.255
 qs_energies                          1  2.0    0.000    0.000   93.583   93.655
 ls_scf                               1  3.0    0.000    0.000   91.977   92.048
 dbcsr_multiply_generic             111  6.7    0.016    0.016   76.227   76.481
 ls_scf_main                          1  4.0    0.000    0.000   57.384   57.389
 multiply_cannon                    111  7.7    0.037    0.074   52.690   56.864
 multiply_cannon_loop               111  8.7    0.100    0.105   49.055   52.721
 density_matrix_trs4                  2  5.0    0.002    0.003   51.381   51.591
 mp_waitall_1                      7281 11.0   23.133   33.028   23.133   33.028
 ls_scf_init_scf                      1  4.0    0.000    0.000   30.835   30.838
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   29.628   29.710
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.002   27.076   27.092
 multiply_cannon_multrec            888  9.7   12.697   15.107   21.331   24.532
 multiply_cannon_metrocomm3         888  9.7    0.004    0.004   10.709   22.168
 make_m2s                           222  7.7    0.007    0.008   16.884   18.214
 make_images                        222  8.7    1.986    2.303   16.845   18.174
 hybrid_alltoall_any                227 10.6    0.625    2.867    9.138   10.548
 make_images_data                   222  9.7    0.004    0.005    9.518   10.536
 dbcsr_mm_accdrv_process           3754 10.4    0.232    0.431    8.155    9.396
 dbcsr_mm_accdrv_process_sort      3754 11.4    7.767    8.965    7.767    8.965
 mp_sum_l                           887  5.1    5.095    8.505    5.095    8.505
 multiply_cannon_sync_h2d           888  9.7    6.002    7.722    6.002    7.722
 multiply_cannon_metrocomm4         777  9.7    0.004    0.005    2.481    7.174
 mp_irecv_dv                       2335 11.1    2.465    7.135    2.465    7.135
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    3.926    6.673
 multiply_cannon_metrocomm1         888  9.7    0.003    0.003    3.533    6.141
 arnoldi_extremal                     4  6.8    0.000    0.000    5.153    5.170
 arnoldi_normal_ev                    4  7.8    0.001    0.005    5.153    5.170
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.969    5.144
 build_subspace                      16  8.4    0.014    0.020    4.851    4.857
 calculate_norms                   1584  9.8    4.345    4.758    4.345    4.758
 ls_scf_post                          1  4.0    0.000    0.000    3.758    3.832
 dbcsr_matrix_vector_mult           304  9.0    0.010    0.021    3.482    3.825
 mp_allgather_i34                   111  8.7    1.456    3.740    1.456    3.740
 dbcsr_matrix_vector_mult_local     304 10.0    3.053    3.638    3.055    3.640
 ls_scf_store_result                  1  5.0    0.000    0.000    3.301    3.386
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    2.889    3.010
 dbcsr_sort_data                    325 11.1    1.909    2.181    1.909    2.181
 make_images_pack                   222  9.7    1.820    2.101    1.823    2.104
 dbcsr_data_release                9322 10.9    1.331    1.977    1.331    1.977
 make_images_sizes                  222  9.7    0.000    0.000    0.982    1.958
 mp_alltoall_i44                    222 10.7    0.981    1.958    0.981    1.958
 dbcsr_finalize                     304  7.8    0.027    0.032    1.618    1.904
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    1.900    1.902
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="504", plot="h2o_32_nrep3_ls", label="(8n/4r/3t)", y=94.255000, yerr=0.000000
PlotPoint: name="505", plot="h2o_32_nrep3_ls_mem", label="(8n/4r/3t)", y=2198.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/24/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     10.747127E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                369794       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   26053.0
 marketing flops                     1.742116E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               3.343626E+09
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged                   50616
 MPI messages size (bytes):
  total size                         1.536549E+12
  min size                           0.000000E+00
  max size                          72.286792E+06
  average size                      30.356986E+06
 MPI breakdown and total messages size (bytes):
             size <=      128               10368                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                1056                104411904
    131072 < size <=  4194304                3168                831638784
   4194304 < size <= 16777216                3103              33613273640
  16777216 < size                           32921            1501999894888
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 266673.
 MP_Allreduce         3138                  13030.
 MP_Sync                 4
 MP_Alltoall            47               30278988.
 MP_SendRecv            69                  86400.
 MP_ISendRecv           69                  86400.
 MP_Wait               531
 MP_ISend              378                 823502.
 MP_IRecv              378                 823753.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.051    0.088   99.188   99.189
 qs_energies                          1  2.0    0.000    0.000   98.411   98.416
 ls_scf                               1  3.0    0.000    0.000   96.705   96.712
 dbcsr_multiply_generic             111  6.7    0.018    0.020   79.083   79.330
 ls_scf_main                          1  4.0    0.001    0.013   59.260   59.263
 multiply_cannon                    111  7.7    0.068    0.180   52.084   56.867
 density_matrix_trs4                  2  5.0    0.015    0.161   53.049   53.216
 multiply_cannon_loop               111  8.7    0.123    0.150   46.965   50.437
 ls_scf_init_scf                      1  4.0    0.000    0.001   34.141   34.143
 ls_scf_init_matrix_S                 1  5.0    0.000    0.001   32.343   32.424
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.002   29.800   29.825
 mp_waitall_1                      6369 11.0   22.659   28.921   22.659   28.921
 multiply_cannon_multrec           1332  9.7   14.256   17.626   22.215   25.303
 make_m2s                           222  7.7    0.021    0.053   20.973   22.439
 make_images                        222  8.7    3.152    3.629   20.907   22.390
 multiply_cannon_metrocomm3        1332  9.7    0.003    0.004    9.294   17.523
 make_images_data                   222  9.7    0.004    0.005   11.587   13.229
 hybrid_alltoall_any                227 10.6    0.803    3.767   10.879   12.656
 dbcsr_mm_accdrv_process           3641 10.4    0.229    0.409    7.586    9.104
 dbcsr_mm_accdrv_process_sort      3641 11.4    7.212    8.692    7.212    8.692
 mp_sum_l                           887  5.1    4.435    8.207    4.435    8.207
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    3.363    6.440
 multiply_cannon_metrocomm4        1110  9.7    0.004    0.006    2.092    6.140
 mp_irecv_dv                       3229 10.9    2.068    6.052    2.068    6.052
 multiply_cannon_sync_h2d          1332  9.7    5.499    6.021    5.499    6.021
 multiply_cannon_metrocomm1        1332  9.7    0.003    0.003    2.611    5.350
 arnoldi_extremal                     4  6.8    0.000    0.000    5.305    5.333
 arnoldi_normal_ev                    4  7.8    0.001    0.005    5.304    5.333
 build_subspace                      16  8.4    0.014    0.021    4.947    4.955
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.660    4.881
 mp_allgather_i34                   111  8.7    2.199    4.564    2.199    4.564
 calculate_norms                   2376  9.8    4.203    4.533    4.203    4.533
 dbcsr_matrix_vector_mult           304  9.0    0.010    0.021    3.644    3.932
 dbcsr_matrix_vector_mult_local     304 10.0    3.227    3.724    3.229    3.726
 dbcsr_sort_data                    658 11.4    3.060    3.469    3.060    3.469
 ls_scf_post                          1  4.0    0.000    0.001    3.304    3.309
 dbcsr_special_finalize             555  9.7    0.006    0.007    2.800    3.277
 dbcsr_merge_single_wm              555 10.7    0.533    0.656    2.792    3.269
 ls_scf_store_result                  1  5.0    0.000    0.000    3.014    3.106
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    3.048    3.095
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    2.442    2.443
 dbcsr_data_release               10477 10.7    1.604    2.406    1.604    2.406
 rebuild_ks_matrix                    3  7.3    0.000    0.000    2.035    2.036
 qs_ks_build_kohn_sham_matrix         3  8.3    0.003    0.012    2.035    2.036
 dbcsr_finalize                     304  7.8    0.050    0.062    1.821    1.985
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="506", plot="h2o_32_nrep3_ls", label="(8n/3r/4t)", y=99.189000, yerr=0.000000
PlotPoint: name="507", plot="h2o_32_nrep3_ls_mem", label="(8n/3r/4t)", y=2736.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/25/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     15.383312E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                336818       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   28603.7
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               4.685431E+09
 # max total images/rank                        1
 # max 3D layers                                1
 # MPI messages exchanged                   10656
 MPI messages size (bytes):
  total size                         1.149035E+12
  min size                           0.000000E+00
  max size                         203.538048E+06
  average size                     107.829832E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                2304                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                 768                702038016
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            7584            1148332810224
 -------------------------------------------------------------------------------
 -                                                                             -
 -                      DBCSR MESSAGE PASSING PERFORMANCE                      -
 -                                                                             -
 -------------------------------------------------------------------------------
 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Bcast                2                     12.
 MP_Allreduce          705                    128.
 MP_Alltoall           310               12920694.
 MP_ISend             1776               40180424.
 MP_IRecv             1776               40465030.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 265536.
 MP_Allreduce         3129                  15263.
 MP_Sync                 4
 MP_Alltoall            47               46208988.
 MP_SendRecv            45                 115200.
 MP_ISendRecv           45                 115200.
 MP_Wait               528
 MP_ISend              420                 924980.
 MP_IRecv              420                 924528.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.040    0.058  100.093  100.094
 qs_energies                          1  2.0    0.000    0.000   99.047   99.051
 ls_scf                               1  3.0    0.000    0.000   97.108   97.111
 dbcsr_multiply_generic             111  6.7    0.018    0.018   78.166   78.361
 ls_scf_main                          1  4.0    0.000    0.000   62.430   62.431
 multiply_cannon                    111  7.7    0.101    0.184   55.713   60.551
 density_matrix_trs4                  2  5.0    0.002    0.003   55.242   55.350
 multiply_cannon_loop               111  8.7    0.069    0.078   51.123   52.760
 mp_waitall_1                      5436 11.0   26.545   32.281   26.545   32.281
 ls_scf_init_scf                      1  4.0    0.000    0.000   30.926   30.926
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   29.660   29.694
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.001   27.372   27.384
 multiply_cannon_multrec            444  9.7   14.085   16.085   21.214   24.318
 make_m2s                           222  7.7    0.006    0.006   17.557   20.119
 make_images                        222  8.7    3.738    4.450   17.494   20.058
 multiply_cannon_metrocomm1         444  9.7    0.002    0.002   11.754   16.798
 multiply_cannon_metrocomm3         444  9.7    0.001    0.002    5.777   14.867
 make_images_data                   222  9.7    0.003    0.004    9.813   12.241
 hybrid_alltoall_any                227 10.6    0.790    3.771    9.612   12.035
 multiply_cannon_sync_h2d           444  9.7    6.562    8.662    6.562    8.662
 dbcsr_mm_accdrv_process           3003 10.4    0.160    0.401    6.834    7.933
 dbcsr_mm_accdrv_process_sort      3003 11.4    6.491    7.577    6.491    7.577
 mp_allgather_i34                   111  8.7    2.809    7.061    2.809    7.061
 arnoldi_extremal                     4  6.8    0.000    0.000    6.008    6.020
 arnoldi_normal_ev                    4  7.8    0.001    0.005    6.007    6.020
 build_subspace                      16  8.4    0.015    0.019    5.616    5.629
 apply_matrix_preconditioner          6  5.3    0.000    0.000    4.663    4.824
 mp_sum_l                           887  5.1    2.936    4.655    2.936    4.655
 dbcsr_matrix_vector_mult           304  9.0    0.011    0.021    4.262    4.474
 dbcsr_matrix_vector_mult_local     304 10.0    3.733    4.217    3.735    4.219
 multiply_cannon_metrocomm4         333  9.7    0.001    0.002    1.573    3.961
 mp_irecv_dv                       1241 11.2    1.555    3.914    1.555    3.914
 ls_scf_post                          1  4.0    0.000    0.000    3.752    3.756
 calculate_norms                    792  9.8    3.561    3.753    3.561    3.753
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    3.606    3.684
 dbcsr_multiply_generic_mpsum_f      86  7.8    0.000    0.000    1.965    3.637
 ls_scf_store_result                  1  5.0    0.000    0.000    3.398    3.448
 make_images_sizes                  222  9.7    0.000    0.000    1.031    3.309
 mp_alltoall_i44                    222 10.7    1.030    3.309    1.030    3.309
 dbcsr_finalize                     304  7.8    0.062    0.078    2.206    2.283
 dbcsr_merge_all                    275  8.9    0.475    0.527    2.053    2.119
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    2.084    2.085
 rebuild_ks_matrix                    3  7.3    0.000    0.000    2.050    2.051
 qs_ks_build_kohn_sham_matrix         3  8.3    0.000    0.001    2.050    2.051
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="508", plot="h2o_32_nrep3_ls", label="(8n/2r/6t)", y=100.094000, yerr=0.000000
PlotPoint: name="509", plot="h2o_32_nrep3_ls_mem", label="(8n/2r/6t)", y=3723.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/26/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    23 x    23 x    23      234439235724792       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       234.439236E+12       0.0%      0.0%    100.0%
 flops max/rank                     30.358840E+12       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                         9634225188       0.0%      0.0%    100.0%
 number of processed stacks                339931       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0   28341.7
 marketing flops                     1.742118E+15
 -------------------------------------------------------------------------------
 # multiplications                            111
 max memory usage/rank               8.715518E+09
 # max total images/rank                        2
 # max 3D layers                                1
 # MPI messages exchanged                    4440
 MPI messages size (bytes):
  total size                       770.525954E+09
  min size                           0.000000E+00
  max size                         399.069120E+06
  average size                     173.541888E+06
 MPI breakdown and total messages size (bytes):
             size <=      128                 640                        0
       128 < size <=     8192                   0                        0
      8192 < size <=    32768                   0                        0
     32768 < size <=   131072                   0                        0
    131072 < size <=  4194304                 640                468025344
   4194304 < size <= 16777216                   0                        0
  16777216 < size                            3160             770057961712
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             1026                 284089.
 MP_Allreduce         3123                  21388.
 MP_Sync                 4
 MP_Alltoall            47               88727262.
 MP_SendRecv            42                 732600.
 MP_ISendRecv           42                 732600.
 MP_Wait               267
 MP_ISend              180                3337386.
 MP_IRecv              180                3339494.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.087    0.102  110.820  110.820
 qs_energies                          1  2.0    0.000    0.000  109.327  109.337
 ls_scf                               1  3.0    0.000    0.000  106.373  106.383
 dbcsr_multiply_generic             111  6.7    0.023    0.027   78.969   79.080
 ls_scf_main                          1  4.0    0.000    0.000   66.505   66.507
 density_matrix_trs4                  2  5.0    0.002    0.003   57.169   57.238
 multiply_cannon                    111  7.7    0.159    0.244   50.640   52.953
 multiply_cannon_loop               111  8.7    0.068    0.070   46.970   48.150
 ls_scf_init_scf                      1  4.0    0.000    0.000   35.806   35.807
 ls_scf_init_matrix_S                 1  5.0    0.000    0.000   34.075   34.088
 matrix_sqrt_Newton_Schulz            2  6.5    0.001    0.002   30.656   30.663
 mp_waitall_1                      4527 11.1   22.652   27.007   22.652   27.007
 make_m2s                           222  7.7    0.007    0.007   24.511   25.688
 make_images                        222  8.7    4.595    5.003   24.401   25.576
 multiply_cannon_multrec            444  9.7   17.856   18.535   22.787   23.752
 make_images_data                   222  9.7    0.003    0.004   13.376   16.358
 hybrid_alltoall_any                227 10.6    1.669    3.662   13.108   16.268
 multiply_cannon_metrocomm3         444  9.7    0.001    0.001   10.566   11.695
 multiply_cannon_sync_h2d           444  9.7    8.845    8.891    8.845    8.891
 arnoldi_extremal                     4  6.8    0.000    0.000    7.655    7.663
 arnoldi_normal_ev                    4  7.8    0.003    0.009    7.655    7.663
 build_subspace                      16  8.4    0.025    0.035    7.084    7.094
 apply_matrix_preconditioner          6  5.3    0.000    0.000    5.653    5.923
 dbcsr_matrix_vector_mult           304  9.0    0.017    0.034    5.629    5.826
 dbcsr_matrix_vector_mult_local     304 10.0    5.082    5.410    5.084    5.412
 ls_scf_dm_to_ks                      2  5.0    0.000    0.000    5.242    5.333
 dbcsr_mm_accdrv_process           1814 10.4    0.444    0.871    4.764    5.148
 dbcsr_mm_accdrv_process_sort      1814 11.4    4.180    4.306    4.180    4.306
 mp_allgather_i34                   111  8.7    1.293    4.161    1.293    4.161
 ls_scf_post                          1  4.0    0.000    0.000    4.061    4.071
 make_images_sizes                  222  9.7    0.000    0.000    1.484    3.669
 mp_alltoall_i44                    222 10.7    1.484    3.669    1.484    3.669
 ls_scf_store_result                  1  5.0    0.000    0.000    3.434    3.444
 calculate_norms                    792  9.8    3.242    3.281    3.242    3.281
 dbcsr_finalize                     304  7.8    0.082    0.089    3.091    3.188
 dbcsr_merge_all                    275  8.9    0.893    0.922    2.876    2.967
 qs_energies_init_hamiltonians        1  3.0    0.001    0.001    2.924    2.924
 dbcsr_complete_redistribute          5  7.6    1.443    1.495    2.775    2.898
 dbcsr_data_release               12724 10.6    2.332    2.870    2.332    2.870
 matrix_ls_to_qs                      2  6.0    0.000    0.000    2.428    2.568
 dbcsr_sort_data                    325 11.1    2.444    2.515    2.444    2.515
 qs_ks_update_qs_env                  3  6.3    0.000    0.000    2.490    2.491
 mp_sum_l                           887  5.1    1.518    2.473    1.518    2.473
 rebuild_ks_matrix                    3  7.3    0.000    0.000    2.423    2.425
 qs_ks_build_kohn_sham_matrix         3  8.3    0.001    0.001    2.423    2.425
 dbcsr_new_transposed                 4  7.5    0.277    0.320    2.387    2.410
 dbcsr_frobenius_norm                74  6.6    2.056    2.135    2.196    2.244
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="510", plot="h2o_32_nrep3_ls", label="(8n/1r/12t)", y=110.820000, yerr=0.000000
PlotPoint: name="511", plot="h2o_32_nrep3_ls_mem", label="(8n/1r/12t)", y=6831.000000, yerr=0.000000
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


 ~~~~~~~~~ RESULT ~~~~~~~~~
RESULT file: /scratch/snx3000/mkrack/rt/../rt/CRAY-XC50-gnu/1501a5cde42d9e664f251d02093fe2fc81c3abfc_performance_tests/27/result.log


 @@@@@@@@@@ Run number: 1 @@@@@@@@@@

 -------------------------------------------------------------------------------
 -                                                                             -
 -                                DBCSR STATISTICS                             -
 -                                                                             -
 -------------------------------------------------------------------------------
 COUNTER                                    TOTAL       BLAS       SMM       ACC
 flops    32 x    32 x    32        7009386627072       0.0%      0.0%    100.0%
 flops     9 x     9 x    32        7335108845568       0.0%      0.0%    100.0%
 flops     9 x    22 x    32        9866241589248       0.0%      0.0%    100.0%
 flops    22 x     9 x    32        9884108906496       0.0%      0.0%    100.0%
 flops    22 x    22 x    32       13354440523776       0.0%      0.0%    100.0%
 flops    32 x    32 x     9       20607185977344       0.0%      0.0%    100.0%
 flops    32 x    32 x    22       25186560638976       0.0%      0.0%    100.0%
 flops     9 x    32 x    32       28458319085568       0.0%      0.0%    100.0%
 flops    22 x    32 x    32       34782389993472       0.0%      0.0%    100.0%
 flops     9 x    32 x     9       42881542373376       0.0%      0.0%    100.0%
 flops    22 x    32 x     9       55680402235392       0.0%      0.0%    100.0%
 flops     9 x    32 x    22       55680402235392       0.0%      0.0%    100.0%
 flops    22 x    32 x    22       72328573419520       0.0%      0.0%    100.0%
 flops inhomo. stacks                           0       0.0%      0.0%      0.0%
 flops total                       383.054662E+12       0.0%      0.0%    100.0%
 flops max/rank                    733.641090E+09       0.0%      0.0%    100.0%
 matmuls inhomo. stacks                         0       0.0%      0.0%      0.0%
 matmuls total                        26899403712       0.0%      0.0%    100.0%
 number of processed stacks             118860288       0.0%      0.0%    100.0%
 average stack size                                     0.0       0.0     226.3
 marketing flops                   780.439111E+12
 -------------------------------------------------------------------------------
 # multiplications                           1445
 max memory usage/rank             590.200832E+06
 # max total images/rank                        3
 # max 3D layers                                1
 # MPI messages exchanged               102097920
 MPI messages size (bytes):
  total size                        37.227590E+12
  min size                           0.000000E+00
  max size                           4.551360E+06
  average size                     364.626312E+03
 MPI breakdown and total messages size (bytes):
             size <=      128              731472                        0
       128 < size <=     8192            11922720              97670922240
      8192 < size <=    32768            24718992             614677610496
     32768 < size <=   131072            20000256            1970081366016
    131072 < size <=  4194304            42515668           24886801223040
   4194304 < size <= 16777216             2208812            9656099886720
  16777216 < size                               0                        0
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                         MESSAGE PASSING PERFORMANCE                         -
 -                                                                             -
 -------------------------------------------------------------------------------

 ROUTINE             CALLS      AVE VOLUME [Bytes]
 MP_Group                4
 MP_Bcast             4640                  78072.
 MP_Allreduce        13232                   2081.
 MP_Sync              1064
 MP_Alltoall          2588                4037197.
 MP_SendRecv        168740                  11136.
 MP_ISendRecv        92040                  11136.
 MP_Wait            102830
 MP_comm_split          40
 MP_ISend            26090                  85106.
 MP_IRecv            37890                  59644.
 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
 -                                                                             -
 -                                T I M I N G                                  -
 -                                                                             -
 -------------------------------------------------------------------------------
 SUBROUTINE                       CALLS  ASD         SELF TIME        TOTAL TIME
                                MAXIMUM       AVERAGE  MAXIMUM  AVERAGE  MAXIMUM
 CP2K                                 1  1.0    0.113    0.281  268.384  268.386
 qs_mol_dyn_low                       1  2.0    0.004    0.034  266.729  266.750
 qs_forces                            5  3.8    0.006    0.046  266.549  266.552
 qs_energies                          5  4.8    0.004    0.037  262.815  262.843
 scf_env_do_scf                       5  5.8    0.001    0.019  244.791  244.794
 scf_env_do_scf_inner_loop          105  6.6    0.006    0.092  213.000  213.020
 qs_scf_new_mos                     105  7.6    0.000    0.001  166.514  166.662
 qs_scf_loop_do_ot                  105  8.6    0.001    0.001  166.513  166.661
 ot_scf_mini                        105  9.6    0.003    0.010  156.717  156.852
 dbcsr_multiply_generic            1445 12.2    0.125    0.135  136.296  136.924
 multiply_cannon                   1445 13.2    0.275    0.293  116.055  118.675
 multiply_cannon_loop              1445 14.2    2.395    2.516  113.671  115.551
 velocity_verlet                      4  3.0    0.002    0.011  111.746  111.747
 qs_ot_get_p                        112 10.4    0.001    0.034   65.907   66.225
 ot_mini                            105 10.6    0.001    0.011   60.749   60.865
 qs_ot_p2m_diag                      40 11.0    0.020    0.031   54.233   54.303
 cp_dbcsr_syevd                      40 12.0    0.002    0.010   50.931   50.933
 cp_fm_syevd                         40 13.0    0.001    0.017   44.988   45.151
 multiply_cannon_multrec          69360 15.2   31.607   36.888   39.420   45.016
 mp_waitall_1                    488190 16.1   35.022   42.090   35.022   42.090
 cp_fm_redistribute_end              40 14.0   19.814   39.520   19.821   39.525
 cp_fm_syevd_base                    40 14.0   19.691   39.412   19.691   39.412
 qs_ot_get_derivative                55 11.6    0.001    0.009   38.904   39.028
 multiply_cannon_sync_h2d         69360 15.2   30.509   34.855   30.509   34.855
 multiply_cannon_metrocomm3       69360 15.2    0.197    0.206   24.870   32.826
 rebuild_ks_matrix                  110  8.4    0.000    0.000   31.705   31.889
 qs_ks_build_kohn_sham_matrix       110  9.4    0.012    0.023   31.705   31.888
 init_scf_loop                        7  6.6    0.001    0.013   31.750   31.753
 qs_ks_update_qs_env                112  7.6    0.001    0.001   28.664   28.832
 prepare_preconditioner               7  7.6    0.000    0.000   26.391   26.420
 make_preconditioner                  7  8.6    0.000    0.002   26.391   26.420
 apply_preconditioner_dbcsr          62 12.6    0.000    0.000   23.137   23.380
 apply_single                        62 13.6    0.000    0.000   23.137   23.380
 qs_rho_update_rho_low              110  7.6    0.002    0.121   21.872   22.178
 calculate_rho_elec                 110  8.6    0.030    0.033   21.870   22.177
 ot_new_cg_direction                 55 11.6    0.001    0.019   21.132   21.133
 make_full_inverse_cholesky           7  9.6    0.000    0.000   17.883   17.964
 rs_pw_transfer                     690 11.5    0.011    0.012   16.674   17.907
 density_rs2pw                      110  9.6    0.006    0.008   15.434   16.731
 init_scf_run                         5  5.8    0.000    0.001   15.159   15.160
 scf_env_initial_rho_setup            5  6.8    0.000    0.002   15.159   15.160
 qs_ot_get_orbitals                 105 10.6    0.001    0.001   14.794   15.007
 qs_ot_get_derivative_taylor         37 12.8    0.001    0.001   13.853   13.929
 pw_transfer                       1645 12.4    0.086    0.114   13.219   13.543
 fft_wrap_pw1pw2                   1425 13.5    0.014    0.018   13.073   13.402
 mp_sum_l                          4764 12.2   12.158   12.994   12.158   12.994
 cp_fm_cholesky_invert                7 10.6   12.032   12.046   12.032   12.046
 calculate_dm_sparse                110  9.5    0.000    0.001   11.425   11.629
 fft_wrap_pw1pw2_240                915 15.0    1.252    1.353   11.020   11.340
 sum_up_and_integrate                60 10.3    0.028    0.031   10.905   10.936
 integrate_v_rspace                  60 11.3    0.002    0.004   10.877   10.910
 check_diag                          80 13.5    8.587    8.859   10.643   10.802
 qs_vxc_create                      110 10.4    0.002    0.006   10.701   10.755
 qs_ot_get_derivative_diag           18 12.0    0.000    0.002   10.551   10.609
 cp_dbcsr_sm_fm_multiply             15  9.3    0.001    0.001   10.237   10.253
 calculate_first_density_matrix       1  7.0    0.000    0.004    9.912   10.026
 acc_transpose_blocks             69360 15.2    0.364    0.385    9.391    9.811
 cp_dbcsr_sm_fm_multiply_core        15 10.3    0.000    0.000    9.642    9.715
 fft3d_pb                           915 16.0    2.389    2.761    9.013    9.455
 dbcsr_mm_accdrv_process         154766 15.8    4.017    4.170    7.676    8.498
 make_full_single_inverse             7  9.6    0.015    0.318    8.179    8.493
 make_m2s                          2890 13.2    0.082    0.091    7.697    8.405
 make_images                       2890 14.2    0.243    0.262    7.586    8.293
 acc_transpose_blocks_kernels     69360 16.2    0.852    0.906    7.452    7.753
 xc_rho_set_and_dset_create         110 12.4    0.077    0.097    7.409    7.664
 xc_vxc_pw_create                    60 11.3    0.039    0.050    7.205    7.258
 multiply_cannon_metrocomm1       69360 15.2    0.093    0.099    4.232    7.113
 potential_pw2rs                     60 12.3    0.003    0.003    6.891    6.934
 jit_kernel_transpose                 5 15.0    6.600    6.858    6.600    6.858
 xc_pw_derive                       510 13.4    0.006    0.007    6.402    6.474
 mp_alltoall_z22v                  2340 17.7    6.057    6.438    6.057    6.438
 rs_pw_transfer_RS2PW_30            110 11.6    1.605    1.708    5.643    5.792
 mp_waitany                        7680 13.5    4.392    5.731    4.392    5.731
 mp_sendrecv_dv                  168740 12.6    5.244    5.378    5.244    5.378
 -------------------------------------------------------------------------------
 ~ ~ ~ ~  DATA POINTS  ~ ~ ~ ~
PlotPoint: name="601", plot="h2o_512_md", label="(64n/12r/1t)", y=268.386000, yerr=0.000000
PlotPoint: name="602", plot="h2o_512_md_mem", label="(64n/12r/1t)", y=559.800000, yerr=2.993326
 ~ ~ ~ ~ END DATA POINTS ~ ~ ~
 ~~~~~~ END RESULT ~~~~~~~~


========= END RESULTS ===========

CommitSHA: 1501a5cde42d9e664f251d02093fe2fc81c3abfc
Summary: empty
Status: OK