|
| 1 | +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# SPDX-License-Identifier: MIT-0 |
# Base image: the `devel` variant of the CUDA 12.4.1 image on Ubuntu 22.04.
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04

# Pinned component versions; each can be overridden with --build-arg.
# Git tags checked out from the NVIDIA GitHub repositories:
ARG GDRCOPY_VERSION=v2.4.4
ARG NCCL_VERSION=v2.26.2-1
ARG NCCL_TESTS_VERSION=v2.14.1
# Release tag of the aws-ofi-nccl tarball downloaded from GitHub:
ARG AWS_OFI_NCCL_VERSION=v1.14.0
# Version of the aws-efa-installer tarball:
ARG EFA_INSTALLER_VERSION=1.38.1
| 10 | + |
# Refresh the package index and upgrade the packages shipped in the base image.
RUN apt-get update -y \
    && apt-get upgrade -y

# Remove the packaged InfiniBand verbs libraries and the packaged NCCL.
# NCCL is rebuilt from source further down and the EFA installer runs later;
# presumably these packages would otherwise conflict with them (verify).
RUN apt-get remove -y --allow-change-held-packages \
        ibverbs-utils \
        libibverbs-dev \
        libibverbs1 \
        libmlx5-1 \
        libnccl-dev \
        libnccl2

# Delete any HPC-X / MPI installation found in the image, drop its linker
# configuration and rebuild the dynamic linker cache.
RUN rm -rf /opt/hpcx /usr/local/mpi \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig

# Clear OPAL_PREFIX for the build steps; it is set to /opt/amazon/openmpi
# near the end of this file.
ENV OPAL_PREFIX=
| 26 | + |
# Build toolchain, debugging utilities and the OpenSSH client/server.
#
# `apt-get update` runs in the same layer as `install`: if the package list
# below changes, Docker would otherwise reuse a cached (possibly stale) index
# from an earlier layer and the install could fail with 404s.
# `--allow-unauthenticated` was removed so that package signatures are verified.
# The package lists are intentionally NOT deleted here because later steps in
# this file still install additional packages.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        apt-utils \
        autoconf \
        automake \
        build-essential \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        git \
        gcc \
        gdb \
        kmod \
        libsubunit-dev \
        libtool \
        openssh-client \
        openssh-server \
        pkg-config \
        python3-distutils \
        vim

# Purge any installed cuda-compat-* packages. The pattern is quoted so the
# shell passes it to apt verbatim instead of trying to glob it against files
# in the working directory.
RUN apt-get purge -y 'cuda-compat-*'
| 49 | + |
# SSH setup:
#  - create the runtime directory for sshd,
#  - set StrictHostKeyChecking to "no" in the client config,
#  - send known-hosts entries to /dev/null,
#  - set StrictModes to "no" in the server config.
RUN mkdir -p /var/run/sshd \
    && sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
| 54 | + |
# Search paths for the components installed by the following steps
# (CUPTI, EFA Open MPI, NCCL build tree, EFA libfabric, aws-ofi-nccl plugin).
# Uses the `key=value` form; the space-separated `ENV key value` form is
# deprecated and flagged by Docker build checks.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
| 57 | + |
# Bootstrap pip for the system python3 and install the AWS CLI and pynvml.
#  - `curl -f` makes an HTTP error fail the build instead of saving an error
#    page as get-pip.py.
#  - `--no-cache-dir` keeps pip's download cache out of the layer.
#  - The bootstrap script is removed in the same layer that downloaded it.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3 /tmp/get-pip.py --no-cache-dir \
    && pip3 install --no-cache-dir awscli pynvml \
    && rm -f /tmp/get-pip.py
| 61 | + |
#################################################
## Install NVIDIA GDRCopy
##
## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with an
## incompatible-version error, check the cuda-compat-xx-x package version.
## NOTE(review): this Dockerfile purges cuda-compat-* earlier on; confirm this
## advice still applies.
##
## Only the requested tag is cloned (--depth 1) and the source tree is removed
## in the same layer, so neither git history nor build objects stay in the image.
RUN git clone --depth 1 -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
    && cd /tmp/gdrcopy \
    && make prefix=/opt/gdrcopy install \
    && cd / \
    && rm -rf /tmp/gdrcopy

# Expose the GDRCopy install to the loader, linker, compiler and shell.
# `${VAR:+:${VAR}}` appends the previous value only when it is set and
# non-empty; a plain `:$VAR` would leave a trailing ':' for an unset variable,
# which GCC (CPATH / LIBRARY_PATH) and the dynamic loader (LD_LIBRARY_PATH)
# interpret as "the current directory".
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} \
    LIBRARY_PATH=/opt/gdrcopy/lib${LIBRARY_PATH:+:${LIBRARY_PATH}} \
    CPATH=/opt/gdrcopy/include${CPATH:+:${CPATH}} \
    PATH=/opt/gdrcopy/bin:$PATH
| 75 | + |
#################################################
## Install EFA installer
##
## Downloads the pinned installer tarball into $HOME, runs it, and removes both
## the extracted directory and the tarball in the same layer (the tarball was
## previously left behind in the image).
## `curl -f` fails the build on an HTTP error instead of saving an error page.
## NOTE(review): `--no-verify` is passed to the installer as before; confirm
## that skipping its verification step is still intended.
RUN cd "$HOME" \
    && curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && cd "$HOME" \
    && rm -rf aws-efa-installer aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz
| 84 | + |
###################################################
## Build NCCL from source
##
## The checkout stays in /opt/nccl: LD_LIBRARY_PATH and LD_PRELOAD in this file
## point at /opt/nccl/build/lib, and nccl-tests is built against /opt/nccl/build.
## Device code is generated for sm_80, sm_86, sm_89 and sm_90.
RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
    && make -C /opt/nccl -j $(nproc) src.build \
        CUDA_HOME=/usr/local/cuda \
        NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"
| 91 | + |
###################################################
## Install AWS-OFI-NCCL plugin
##
## hwloc headers are required by the plugin's configure step. The package index
## is refreshed in the same layer as the install (so a stale cached index is
## never used) and removed again afterwards; the later libaio-dev step
## refreshes the index itself.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev \
    && rm -rf /var/lib/apt/lists/*

## The release tarball and its top-level directory are named after the tag
## without the leading "v". `${VAR#v}` is POSIX and strips only that leading
## character; the previous bash-only `${VAR//v}` removed every "v" in the tag
## and required switching SHELL to bash and back. The default /bin/sh is now
## used throughout, so no SHELL instruction is needed.
RUN plugin="aws-ofi-nccl-${AWS_OFI_NCCL_VERSION#v}" \
    && curl -fsSLO "https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/${plugin}.tar.gz" \
    && tar -xf "${plugin}.tar.gz" \
    && cd "${plugin}" \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-mpi=/opt/amazon/openmpi \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
    && make -j "$(nproc)" \
    && make install \
    && cd .. \
    && rm -rf "${plugin}" "${plugin}.tar.gz"
| 112 | + |
###################################################
## Build NCCL-tests
##
## Built with MPI support against the EFA Open MPI install and the NCCL build
## tree in /opt/nccl/build, for sm_80, sm_86, sm_89 and sm_90.
RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
    && make -C /opt/nccl-tests -j $(nproc) \
        MPI=1 \
        MPI_HOME=/opt/amazon/openmpi/ \
        CUDA_HOME=/usr/local/cuda \
        NCCL_HOME=/opt/nccl/build \
        NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"
| 123 | + |
# Remove the apt package lists. This runs as its own layer, so it tidies the
# final filesystem but cannot reclaim space already stored in earlier layers.
RUN rm -rf /var/lib/apt/lists/*

## Open MPI / NCCL runtime settings:
##  - exclude the UCX PML and use only the tcp and self BTLs,
##  - keep MPI TCP traffic and NCCL socket traffic off the loopback, docker and
##    veth interfaces,
##  - point OPAL_PREFIX at the EFA-provided Open MPI.
## Each continuation backslash is separated from the value by a space so that
## joining the lines never depends on the next line's indentation.
ENV OMPI_MCA_pml=^ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
    OPAL_PREFIX=/opt/amazon/openmpi \
    NCCL_SOCKET_IFNAME=^docker,lo,veth

## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
ENV PMIX_MCA_gds=hash

## Preload the NCCL library built in /opt/nccl. As an ENV this applies to every
## later build step as well as to the running container.
ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so
| 138 | + |
# Install Miniconda so the Python stack does not depend on the base image's
# python.
#  - The installer name is held in one shell variable instead of being
#    repeated in every command.
#  - `curl -f` fails the build on an HTTP error rather than handing an error
#    page to bash.
#  - `conda clean -afy` drops the package cache in the same layer.
RUN installer=Miniconda3-py312_25.3.1-1-Linux-x86_64.sh \
    && mkdir -p /opt/miniconda3 \
    && curl -fsSL "https://repo.anaconda.com/miniconda/${installer}" -o "/tmp/${installer}" \
    && bash "/tmp/${installer}" -b -f -p /opt/miniconda3 \
    && rm "/tmp/${installer}" \
    && /opt/miniconda3/bin/conda init bash \
    && /opt/miniconda3/bin/conda clean -afy

# Put the Miniconda python/pip first on PATH for all following steps.
ENV PATH="/opt/miniconda3/bin:${PATH}"

# CUDA architecture list picked up by PyTorch extension builds.
# NOTE(review): only 9.0a is listed here, while NCCL and nccl-tests in this
# file are built for sm_80/86/89/90 - confirm this restriction is intended.
ENV TORCH_CUDA_ARCH_LIST="9.0a"
| 149 | + |
# ColossalAI requires torch <= 2.5.1.
# --no-cache-dir keeps pip's download cache (large for torch and its CUDA
# wheels) out of the image layers.
RUN pip install --no-cache-dir torch==2.5.1

# numpy is held below 2.0 because of
# https://discuss.huggingface.co/t/valueerror-unable-to-avoid-copy-while-creating-an-array-as-requested/93584/5
RUN pip install --no-cache-dir "numpy<2.0"

# Install tensornvme together with libaio-dev (installed in the same step).
# Uses `apt-get` (the `apt` front end does not have a stable CLI for scripts),
# refreshes and removes the package lists in the same layer, and runs
# non-interactively.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y libaio-dev \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --no-cache-dir tensornvme
| 158 | + |
# To use the fused RMSNorm kernel, ColossalAI needs apex built from source.
# The clone is shallow and the source tree is deleted in the same layer once
# the (non-editable) install has finished, so neither git history nor build
# objects remain in the image.
# NOTE(review): apex is built from the default branch HEAD; pin a commit or tag
# to make the build reproducible.
RUN git clone --depth 1 https://github.com/NVIDIA/apex /tmp/apex \
    && cd /tmp/apex \
    && NVCC_APPEND_FLAGS="--threads 4" \
       pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext --cuda_ext --parallel 8" ./ \
    && cd / \
    && rm -rf /tmp/apex

# flash-attn is built against the torch already installed in the environment
# (--no-build-isolation). MAX_JOBS is set to 48 for this build.
# --no-cache-dir keeps the pip cache out of the layer.
RUN MAX_JOBS=48 pip install --no-cache-dir --no-build-isolation flash-attn
| 166 | + |
# Install ColossalAI and the ColossalChat application from a pinned commit.
# Optional build switches that are currently NOT set: BUILD_EXT=1 FORCE_CUDA=1
# (this note used to sit inside the command chain; it now lives outside it).
# --no-cache-dir keeps pip's wheel/download cache out of the layer.
# The checkout is left in /tmp/colossalai as before.
# NOTE(review): remove it in this step if nothing reads from it at runtime.
RUN git clone https://github.com/hpcaitech/ColossalAI.git /tmp/colossalai \
    && cd /tmp/colossalai \
    && git checkout 46ed5d856b16b074325091a88e761544b3d4f9f0 \
    && pip install --no-cache-dir . \
    && cd applications/ColossalChat \
    && pip install --no-cache-dir .
0 commit comments