====vasp====

For use by authorized users only.  

The local copy of version 5.4.4 (for Intel) is modified from the distribution as follows:
<code>
installs$ diff -r VASP-REFERENCE/vasp.5.4.4/src VASP/5.4.4/vasp.5.4.4/build/std | grep -v "\.o$" | grep -v "\.f90$" | grep -v "\.mod$"
Only in VASP-REFERENCE/vasp.5.4.4/src: CUDA
Only in VASP-REFERENCE/vasp.5.4.4/src: fftlib
Only in VASP/5.4.4/vasp.5.4.4/build/std/lib: libdmy.a
Only in VASP/5.4.4/vasp.5.4.4/build/std/lib: makefile.include
diff -r VASP-REFERENCE/vasp.5.4.4/src/makefile VASP/5.4.4/vasp.5.4.4/build/std/makefile
18c18
< LLIB=-Llib -ldmy
---
> LLIB=lib/getshmem.o lib/linpack_double.o -Llib -ldmy
131c131
< #OBJCTS_f90=$(filter-out getshmem.o, $(OBJCTS))
---
> OBJCTS_f90=$(filter-out getshmem.o, $(OBJCTS))
Only in VASP/5.4.4/vasp.5.4.4/build/std: makefile.include
Only in VASP/5.4.4/vasp.5.4.4/build/std/parser: libparser.a
Only in VASP/5.4.4/vasp.5.4.4/build/std/parser: makefile.include
Only in VASP-REFERENCE/vasp.5.4.4/src: README
Only in VASP-REFERENCE/vasp.5.4.4/src: vasp.cfg
Only in VASP/5.4.4/vasp.5.4.4/build/std: vasp.O2.smkl
Only in VASP/5.4.4/vasp.5.4.4/build/std: vasp.O3.pmkl
Only in VASP/5.4.4/vasp.5.4.4/build/std: vasp.O3.smkl
installs$
</code>
''getshmem.o'' and ''linpack_double.o'' are built in ''build/std/lib''. The modules loaded for the build and the ''makefile.include'' used are:
<code>
installs$ module load intel/19.0.5 mkl/19.0.5 impi/19.0.5
installs$ cat VASP/5.4.4/vasp.5.4.4/build/std/makefile.include
# Precompiler options
CPP_OPTIONS= -DHOST=\"LinuxIFC\"\
             -DMPI -DMPI_BLOCK=8000 \
             -Duse_collective \
             -DscaLAPACK \
             -DCACHE_SIZE=4000 \
             -Davoidalloc \
             -Duse_bse_te \
             -Dtbdyn \
             -Duse_shmem

CPP        = fpp -f_com=no -free -w0  $*$(FUFFIX) $*$(SUFFIX) $(CPP_OPTIONS)

FC         = mpiifort
FCL        = mpiifort -mkl=sequential -lstdc++

FREE       = -free -names lowercase

FFLAGS     = -assume byterecl -w
OFLAG      = -O2 -xHOST -qopenmp
OFLAG      = -O3 -xsse3 -axsse4.2,AVX,COREAVX512 -qopenmp
OFLAG      = -O2 -xsse3 -axsse4.2,AVX,COREAVX512 -qopenmp
OFLAG_IN   = $(OFLAG)
DEBUG      = -O0

MKL_PATH   = $(MKLROOT)/lib/intel64
BLAS       =
LAPACK     =
BLACS      = -lmkl_blacs_intelmpi_lp64
SCALAPACK  = $(MKL_PATH)/libmkl_scalapack_lp64.a $(BLACS)

OBJECTS    = fftmpiw.o fftmpi_map.o fft3dlib.o fftw3d.o

INCS       =-I$(MKLROOT)/include/fftw

LLIBS      = $(SCALAPACK) $(LAPACK) $(BLAS)


OBJECTS_O1 += fftw3d.o fftmpi.o fftmpiw.o
OBJECTS_O2 += fft3dlib.o

# For what used to be vasp.5.lib
CPP_LIB    = $(CPP)
FC_LIB     = $(FC)
CC_LIB     = icc
CFLAGS_LIB = -O
FFLAGS_LIB = -O1
FREE_LIB   = $(FREE)

OBJECTS_LIB= linpack_double.o getshmem.o

# For the parser library
CXX_PARS   = icpc

LIBS       += parser
LLIBS      += -Lparser -lparser -lstdc++

# Normally no need to change this
SRCDIR     = ../../src
BINDIR     = ../../bin

#================================================
# GPU Stuff

#CPP_GPU    = -DCUDA_GPU -DRPROMU_CPROJ_OVERLAP -DUSE_PINNED_MEMORY -DCUFFT_MIN=28 -UscaLAPACK

#OBJECTS_GPU = fftmpiw.o fftmpi_map.o fft3dlib.o fftw3d_gpu.o fftmpiw_gpu.o

#CC         = icc
#CXX        = icpc
#CFLAGS     = -fPIC -DADD_ -Wall -openmp -DMAGMA_WITH_MKL -DMAGMA_SETAFFINITY -DGPUSHMEM=300 -DHAVE_CUBLAS

#CUDA_ROOT  ?= /usr/local/cuda/
#NVCC       := $(CUDA_ROOT)/bin/nvcc -ccbin=icc
#CUDA_LIB   := -L$(CUDA_ROOT)/lib64 -lnvToolsExt -lcudart -lcuda -lcufft -lcublas

#GENCODE_ARCH    := -gencode=arch=compute_30,code=\"sm_30,compute_30\" \
                   -gencode=arch=compute_35,code=\"sm_35,compute_35\" \
                   -gencode=arch=compute_60,code=\"sm_60,compute_60\"

MPI_INC    = $(I_MPI_ROOT)/intel64/include/
installs$ 
</code>
Example runs using the OpenMP build linked against sequential MKL:
<code>
module load intel/19.0.5 mkl/19.0.5 impi/19.0.5
mpirun -np 16 -genv OMP_NUM_THREADS=2 /scrfs/apps/vasp/vasp.5.4.4/build/std/vasp.O2.smkl
took 75 seconds on Trestles, 35 seconds on Razor 16-core (with OMP_NUM_THREADS=1)**, 27.3 seconds on Pinnacle.
vasp.o2.smkl took 26.1 seconds on Pinnacle.
</code>
**For best performance, the number of MPI processes multiplied by the number of OpenMP threads per process should not exceed the number of physical cores.