1. 前言
本文讲述如何通过设置编译参数和依赖库,在 Docker 环境中编译极致性能的 Quantum ESPRESSO 6.3。
使用的相关软件:
所用软件除 ELPA 外均为最新版(QE 6.3 暂不支持更高版本的 ELPA)。
注:本文内容理论上也适用于 Docker for Linux/Mac。
2. 安装步骤
选择基础镜像,安装 PS XE。
FROM ubuntu:17.04
MAINTAINER itianda <me#itianda.com>
ARG PS=parallel_studio_xe_2018_update3_cluster_edition
RUN \
tar -xzf psxe/$PS.tgz && \
cd $PS && \
mkdir /opt/intel && \
cp ../psxe/psxe.lic /opt/intel/licenses && \
./install.sh --silent=../psxe/silent.cfg
ARG TOPROOT=/opt/intel
ARG INTELROOT=$TOPROOT/compilers_and_libraries/linux
ENV MKLROOT=$INTELROOT/mkl
ENV TBBROOT=$INTELROOT/tbb
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/lib/x86_64-linux-gnu/:/lib
ENV LD_LIBRARY_PATH=$INTELROOT/lib/intel64:$MKLROOT/lib/intel64:$TBBROOT/lib/intel64:$LD_LIBRARY_PATH
ENV PATH=$TOPROOT/bin:$PATH
注:Ubuntu 17.04 是 PS XE 2018 支持的最高版本。
silent.cfg 的内容:
ACCEPT_EULA=accept
CONTINUE_WITH_OPTIONAL_ERROR=yes
PSET_INSTALL_DIR=/opt/intel
CONTINUE_WITH_INSTALLDIR_OVERWRITE=yes
PSET_MODE=install
ACTIVATION_TYPE=exist_lic
AMPLIFIER_SAMPLING_DRIVER_INSTALL_TYPE=kit
AMPLIFIER_DRIVER_ACCESS_GROUP=vtune
AMPLIFIER_DRIVER_PERMISSIONS=666
AMPLIFIER_LOAD_DRIVER=no
AMPLIFIER_C_COMPILER=none
AMPLIFIER_KERNEL_SRC_DIR=none
AMPLIFIER_MAKE_COMMAND=none
AMPLIFIER_INSTALL_BOOT_SCRIPT=no
AMPLIFIER_DRIVER_PER_USER_MODE=no
INTEL_SW_IMPROVEMENT_PROGRAM_CONSENT=no
ARCH_SELECTED=ALL
COMPONENTS=;intel-comp__x86_64;intel-comp-32bit__x86_64;intel-comp-doc__noarch;intel-comp-l-all-common__noarch;intel-comp-l-all-vars__noarch;intel-comp-nomcu-vars__noarch;intel-comp-ps-32bit__x86_64;intel-comp-ps__x86_64;intel-comp-ps-ss__x86_64;intel-comp-ps-ss-bec__x86_64;intel-comp-ps-ss-bec-32bit__x86_64;intel-openmp__x86_64;intel-openmp-32bit__x86_64;intel-openmp-common__noarch;intel-openmp-common-icc__noarch;intel-openmp-common-ifort__noarch;intel-openmp-ifort__x86_64;intel-openmp-ifort-32bit__x86_64;intel-tbb-libs-32bit__x86_64;intel-tbb-libs__x86_64;intel-idesupport-icc-common-ps__noarch;intel-icc__x86_64;intel-icc-32bit__x86_64;intel-c-comp-common__noarch;intel-icc-common__noarch;intel-icc-common-ps__noarch;intel-icc-common-ps-ss-bec__noarch;intel-icc-doc__noarch;intel-icc-doc-ps__noarch;intel-icc-ps__x86_64;intel-icc-ps-ss__x86_64;intel-icc-ps-ss-bec__x86_64;intel-icc-ps-ss-bec-32bit__x86_64;intel-ifort__x86_64;intel-ifort-32bit__x86_64;intel-ifort-common__noarch;intel-ifort-doc__noarch;intel-mkl-common__noarch;intel-mkl-core-32bit__x86_64;intel-mkl-core__x86_64;intel-mkl-core-rt-32bit__x86_64;intel-mkl-core-rt__x86_64;intel-mkl-doc__noarch;intel-mkl-doc-ps__noarch;intel-mkl-gnu-32bit__x86_64;intel-mkl-gnu__x86_64;intel-mkl-gnu-rt-32bit__x86_64;intel-mkl-gnu-rt__x86_64;intel-mkl-cluster__x86_64;intel-mkl-cluster-common__noarch;intel-mkl-cluster-rt__x86_64;intel-mkl-common-ps__noarch;intel-mkl-core-ps-32bit__x86_64;intel-mkl-core-ps__x86_64;intel-mkl-pgi__x86_64;intel-mkl-pgi-rt__x86_64;intel-mkl-common-c__noarch;intel-mkl-core-c-32bit__x86_64;intel-mkl-core-c__x86_64;intel-mkl-common-c-ps__noarch;intel-mkl-cluster-c__noarch;intel-mkl-tbb-32bit__x86_64;intel-mkl-tbb__x86_64;intel-mkl-tbb-rt-32bit__x86_64;intel-mkl-tbb-rt__x86_64;intel-mkl-pgi-c__x86_64;intel-mkl-gnu-c-32bit__x86_64;intel-mkl-gnu-c__x86_64;intel-mkl-common-f__noarch;intel-mkl-core-f-32bit__x86_64;intel-mkl-core-f__x86_64;intel-mkl-cluster-f__noarch;intel-mkl-pgi-f__x86_64;intel-mkl-gnu-f-rt-32bit__x86_64;intel-mkl-gnu-f-rt__x86_64;intel-mkl-gnu-f__x86_64;intel-mkl-gnu-f-32bit__x86_64;intel-mkl-f95-common__noarch;intel-mkl-f95-32bit__x86_64;intel-mkl-f__x86_64;intel-tbb-devel-32bit__x86_64;intel-tbb-devel__x86_64;intel-tbb-common__noarch;intel-tbb-doc__noarch;intel-ism__noarch;intel-icsxe__noarch;intel-psxe-common__noarch;intel-psxe-doc__noarch;intel-psxe-common-doc__noarch;intel-icsxe-doc__noarch;intel-psxe-licensing__noarch;intel-psxe-licensing-doc__noarch;intel-icsxe-pset
安装相关依赖包。
RUN \
apt-get update -y && \
apt-get upgrade -y && \
apt-get install -y cpio wget make gcc g++ python ssh autotools-dev autoconf automake texinfo libtool patch flex
设置环境变量:
ENV COMPILERVARS_ARCHITECTURE="intel64"
ENV COMPILERVARS_PLATFORM="linux"
指定编译器选项:
ARG TARGET="SKYLAKE"
ARG CCFLAGS="-O3 -no-prec-div -fp-model fast=2 -x${TARGET}"
ARG FCFLAGS="-O3 -no-prec-div -fp-model fast=2 -x${TARGET} -align array64byte -threads -heap-arrays 4096"
编译 Open MPI:
RUN \
cd $OMPI_DIR && \
. compilervars.sh && \
./autogen.pl && \
./configure \
--with-cma="no" \
CC="icc" \
CXX="icpc" \
FC="ifort" \
CFLAGS="${CCFLAGS}" \
CXXFLAGS="${CCFLAGS}" \
FCFLAGS="${FCFLAGS}" \
&& \
make -j && \
make install
编译 ELPA:
RUN \
cd $ELPA_DIR && \
. compilervars.sh && \
autoconf && \
./configure \
--enable-option-checking=fatal \
--prefix=$ELPAROOT \
AR="xiar" \
FC="mpifort" \
CC="mpicc" \
CXX="mpicpc" \
CFLAGS="${CCFLAGS}" \
CXXFLAGS="${CCFLAGS}" \
FCFLAGS="${FCFLAGS}" \
ACLOCAL="aclocal" \
AUTOCONF='autoconf' \
AUTOHEADER='autoheader' \
AUTOMAKE='automake' \
MAKEINFO="makeinfo" \
SCALAPACK_LDFLAGS="-L${MKLROOT}/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_openmpi_lp64 -Wl,-rpath,${MKLROOT}/lib/intel64" \
SCALAPACK_FCFLAGS="-L${MKLROOT}/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_openmpi_lp64 -I${MKLROOT}/include/intel64/lp64" \
&& \
make -j && \
make install
编译 QE:
RUN \
ln -s q-e-$QE_DIR $QE_DIR && \
cd $QE_DIR && \
. compilervars.sh && \
./configure \
AR="xiar" \
MPIF90="mpifort" \
CC="mpicc" \
CFLAGS="${CCFLAGS}" \
FFLAGS="${FCFLAGS} -I${MKLROOT}/include -I${MKLROOT}/include/fftw" \
LDFLAGS="-Wl,--start-group \
${MKLROOT}/lib/intel64/libmkl_intel_lp64.a \
${MKLROOT}/lib/intel64/libmkl_core.a \
${MKLROOT}/lib/intel64/libmkl_sequential.a \
${MKLROOT}/lib/intel64/libmkl_blacs_openmpi_lp64.a \
${MKLROOT}/lib/intel64/libmkl_scalapack_lp64.a \
-Wl,--end-group" \
--with-elpa-include="${ELPAROOT}/include/${ELPA_DIR}/modules" \
--with-elpa-lib="${ELPAROOT}/lib/libelpa.a" \
--with-elpa-version=2016 && \
make all
3. 性能对比
为了对比性能,默认编译版本采用 GCC 编译,同样使用 Open MPI 实现并行计算,但不使用 ELPA 和 Intel MKL。
分别使用默认编译版本和优化编译版本进行简单自洽计算,CPU 计算时间分别为 2m 5.57s 和 55.82s,优化编译版本足足快了一倍多!
默认编译版本输出结果:
highest occupied level (ev): 21.1832
! total energy = -1259.25185017 Ry
Harris-Foulkes estimate = -1259.25185020 Ry
estimated scf accuracy < 0.00000003 Ry
The total energy is the sum of the following terms:
one-electron contribution = -125.55179190 Ry
hartree contribution = 164.79277944 Ry
xc contribution = -169.12506962 Ry
ewald contribution = -1129.36776808 Ry
convergence has been achieved in 12 iterations
Writing output data file scf.save/
init_run : 3.27s CPU 3.49s WALL ( 1 calls)
electrons : 121.94s CPU 129.38s WALL ( 1 calls)
Called by init_run:
wfcinit : 2.06s CPU 2.24s WALL ( 1 calls)
potinit : 0.35s CPU 0.37s WALL ( 1 calls)
hinit0 : 0.74s CPU 0.78s WALL ( 1 calls)
Called by electrons:
c_bands : 111.74s CPU 118.15s WALL ( 12 calls)
sum_band : 8.83s CPU 9.62s WALL ( 12 calls)
v_of_rho : 0.83s CPU 0.87s WALL ( 13 calls)
newd : 0.48s CPU 0.53s WALL ( 13 calls)
mix_rho : 0.09s CPU 0.10s WALL ( 12 calls)
Called by c_bands:
init_us_2 : 0.67s CPU 0.59s WALL ( 375 calls)
cegterg : 106.89s CPU 112.99s WALL ( 180 calls)
Called by sum_band:
sum_band:bec : 0.01s CPU 0.01s WALL ( 180 calls)
addusdens : 0.65s CPU 0.67s WALL ( 12 calls)
Called by *egterg:
h_psi : 46.89s CPU 51.33s WALL ( 1227 calls)
s_psi : 8.79s CPU 9.16s WALL ( 1227 calls)
g_psi : 0.14s CPU 0.14s WALL ( 1032 calls)
cdiaghg : 24.53s CPU 25.18s WALL ( 1212 calls)
Called by h_psi:
h_psi:pot : 46.75s CPU 51.07s WALL ( 1227 calls)
h_psi:calbec : 9.90s CPU 10.38s WALL ( 1227 calls)
vloc_psi : 27.52s CPU 31.41s WALL ( 1227 calls)
add_vuspsi : 9.31s CPU 9.27s WALL ( 1227 calls)
General routines
calbec : 12.87s CPU 13.52s WALL ( 1407 calls)
fft : 0.39s CPU 0.47s WALL ( 168 calls)
ffts : 0.03s CPU 0.01s WALL ( 25 calls)
fftw : 28.64s CPU 32.93s WALL ( 97364 calls)
interpolate : 0.06s CPU 0.08s WALL ( 13 calls)
Parallel routines
fft_scatt_xy : 3.42s CPU 3.78s WALL ( 97557 calls)
fft_scatt_yz : 8.70s CPU 10.91s WALL ( 97557 calls)
PWSCF : 2m 5.57s CPU 2m14.60s WALL
优化编译版本输出结果:
highest occupied level (ev): 21.1832
! total energy = -1259.25185017 Ry
Harris-Foulkes estimate = -1259.25185020 Ry
estimated scf accuracy < 0.00000003 Ry
The total energy is the sum of the following terms:
one-electron contribution = -125.55179190 Ry
hartree contribution = 164.79277944 Ry
xc contribution = -169.12506962 Ry
ewald contribution = -1129.36776808 Ry
convergence has been achieved in 12 iterations
Writing output data file scf.save/
init_run : 1.14s CPU 1.31s WALL ( 1 calls)
electrons : 54.36s CPU 57.23s WALL ( 1 calls)
Called by init_run:
wfcinit : 0.86s CPU 0.96s WALL ( 1 calls)
potinit : 0.09s CPU 0.11s WALL ( 1 calls)
hinit0 : 0.15s CPU 0.18s WALL ( 1 calls)
Called by electrons:
c_bands : 48.91s CPU 51.33s WALL ( 12 calls)
sum_band : 4.63s CPU 5.00s WALL ( 12 calls)
v_of_rho : 0.50s CPU 0.50s WALL ( 13 calls)
newd : 0.23s CPU 0.24s WALL ( 13 calls)
mix_rho : 0.11s CPU 0.10s WALL ( 12 calls)
Called by c_bands:
init_us_2 : 0.59s CPU 0.52s WALL ( 375 calls)
cegterg : 46.47s CPU 48.67s WALL ( 180 calls)
Called by sum_band:
sum_band:bec : 0.00s CPU 0.01s WALL ( 180 calls)
addusdens : 0.24s CPU 0.24s WALL ( 12 calls)
Called by *egterg:
h_psi : 23.69s CPU 25.21s WALL ( 1227 calls)
s_psi : 2.16s CPU 2.48s WALL ( 1227 calls)
g_psi : 0.03s CPU 0.10s WALL ( 1032 calls)
cdiaghg : 13.07s CPU 13.43s WALL ( 1212 calls)
Called by h_psi:
h_psi:pot : 23.42s CPU 24.96s WALL ( 1227 calls)
h_psi:calbec : 2.54s CPU 2.73s WALL ( 1227 calls)
vloc_psi : 18.39s CPU 19.73s WALL ( 1227 calls)
add_vuspsi : 2.48s CPU 2.49s WALL ( 1227 calls)
General routines
calbec : 3.16s CPU 3.47s WALL ( 1407 calls)
fft : 0.40s CPU 0.37s WALL ( 168 calls)
ffts : 0.01s CPU 0.01s WALL ( 25 calls)
fftw : 19.18s CPU 20.54s WALL ( 97364 calls)
interpolate : 0.03s CPU 0.04s WALL ( 13 calls)
Parallel routines
fft_scatt_xy : 2.45s CPU 2.63s WALL ( 97557 calls)
fft_scatt_yz : 5.49s CPU 5.84s WALL ( 97557 calls)
PWSCF : 55.82s CPU 1m 0.23s WALL
4. 注意事项
镜像默认的 LD_LIBRARY_PATH 未设置,需手动指定。
有时会因系统资源不足而出现编译器内部错误。此时可以增加内存,或取消并行编译(去掉 make 的 -j 选项)。
QE 不能完全并行编译,但是单独的模块可以:make -j pw。
编译 Open MPI 时需要设置 ./configure --with-cma="no",不然运行时会一直输出:
Read -1, expected ###, errno =1
Read -1, expected ###, errno =1
Read -1, expected ###, errno =1
...
Open MPI 禁止 root 用户直接调用,这会导致 QE 测试运行失败。建议创建新用户。
2018.08.28 更新:将配置 QE 时的参数 FCFLAGS=... 修正为 FFLAGS=...。