Commit d6c8c60

DeepSeek V3/R1/Prover-V2 671B SFT with LoRA
1 parent 530fd5d commit d6c8c60

File tree

7 files changed: +809 −0 lines changed
Lines changed: 35 additions & 0 deletions
# Colossal-AI

## Dependencies

As of the Apr 18th, 2025 [commit](https://github.com/hpcaitech/ColossalAI/tree/46ed5d856b16b074325091a88e761544b3d4f9f0), Colossal-AI requires PyTorch 2.5.1, whose official builds use CUDA 12.4. We use `nvidia/cuda:12.4.1-devel-ubuntu22.04` as the base image and install all dependencies on top of it in [colossalai.Dockerfile](colossalai.Dockerfile).

## Build Docker Image

Building Colossal-AI from scratch requires GPU support, so you need to set the NVIDIA Docker runtime as the default when running `docker build`. We launch the build job on a GPU node.
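
If the NVIDIA runtime is not yet the default on the build node, one way to configure it is via the NVIDIA Container Toolkit; a minimal sketch, assuming `nvidia-container-toolkit` is already installed:

```bash
# Make the NVIDIA runtime the default for docker build/run, then restart the daemon.
sudo nvidia-ctk runtime configure --runtime=docker --set-as-default
sudo systemctl restart docker
```
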
Log in to AWS ECR:
```bash
export AWS_ACCESS_KEY_ID=...
export AWS_SECRET_ACCESS_KEY=...

aws ecr get-login-password ...
```
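
The full login command is elided above; a typical invocation pipes the password into `docker login`. The region and registry here are assumptions inferred from the `DOCKER_REPO` used below:

```bash
# Assumed region/registry, taken from DOCKER_REPO below; adjust to your account.
aws ecr get-login-password --region ap-northeast-1 \
    | docker login --username AWS --password-stdin 159553542841.dkr.ecr.ap-northeast-1.amazonaws.com
```
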
Build the Docker image on the GPU node and push it to the Docker repo:
```bash
export DOCKER_REPO=159553542841.dkr.ecr.ap-northeast-1.amazonaws.com/belevich/colossalai
srun ./build_docker.sh
```
Pull the Docker image from the repo:
```bash
docker pull $DOCKER_REPO:latest
```
Import the Docker image into an enroot container (you may want to remove a previously created one first: `rm ./colossalai.sqsh`):
```bash
enroot import -o ./colossalai.sqsh dockerd://$DOCKER_REPO:latest
```
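
To sanity-check the imported image, you can run a one-off command in the container; a minimal sketch, assuming the cluster runs Slurm with the Pyxis/enroot integration (the same `srun --container-image` mechanism used in the DeepSeek example below):

```bash
# Should print the PyTorch version (expected: 2.5.1) if the image imported cleanly.
srun --container-image ./colossalai.sqsh \
    python -c "import colossalai, torch; print(torch.__version__)"
```
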
Lines changed: 12 additions & 0 deletions
#!/bin/bash

# Fail fast if the target repo is not set, before spending time on the build.
if [ -z "$DOCKER_REPO" ]; then
    echo "DOCKER_REPO is not set"
    exit 1
fi

docker build --progress=plain -f colossalai.Dockerfile -t colossalai:latest .

docker tag colossalai:latest $DOCKER_REPO:latest
docker push $DOCKER_REPO:latest
Lines changed: 173 additions & 0 deletions
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04

ARG GDRCOPY_VERSION=v2.4.4
ARG EFA_INSTALLER_VERSION=1.38.1
ARG AWS_OFI_NCCL_VERSION=v1.14.0
ARG NCCL_VERSION=v2.26.2-1
ARG NCCL_TESTS_VERSION=v2.14.1

RUN apt-get update -y && apt-get upgrade -y
RUN apt-get remove -y --allow-change-held-packages \
    ibverbs-utils \
    libibverbs-dev \
    libibverbs1 \
    libmlx5-1 \
    libnccl2 \
    libnccl-dev

RUN rm -rf /opt/hpcx \
    && rm -rf /usr/local/mpi \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig

ENV OPAL_PREFIX=

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    apt-utils \
    autoconf \
    automake \
    build-essential \
    check \
    cmake \
    curl \
    debhelper \
    devscripts \
    git \
    gcc \
    gdb \
    kmod \
    libsubunit-dev \
    libtool \
    openssh-client \
    openssh-server \
    pkg-config \
    python3-distutils \
    vim
RUN apt-get purge -y cuda-compat-*

RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3 /tmp/get-pip.py \
    && pip3 install awscli pynvml

#################################################
## Install NVIDIA GDRCopy
##
## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure
## that the cuda-compat-xx-x package is the latest.
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
    && cd /tmp/gdrcopy \
    && make prefix=/opt/gdrcopy install

ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:$LD_LIBRARY_PATH
ENV LIBRARY_PATH /opt/gdrcopy/lib:$LIBRARY_PATH
ENV CPATH /opt/gdrcopy/include:$CPATH
ENV PATH /opt/gdrcopy/bin:$PATH

#################################################
## Install EFA installer
RUN cd $HOME \
    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf $HOME/aws-efa-installer

###################################################
## Install NCCL
RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
    && cd /opt/nccl \
    && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"

###################################################
## Install AWS-OFI-NCCL plugin
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
# Switch from sh to bash to allow parameter expansion
SHELL ["/bin/bash", "-c"]
RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
    && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
    && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-mpi=/opt/amazon/openmpi \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
    && make -j $(nproc) \
    && make install \
    && cd .. \
    && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
    && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz

SHELL ["/bin/sh", "-c"]

###################################################
## Install NCCL-tests
RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
    && cd /opt/nccl-tests \
    && make -j $(nproc) \
    MPI=1 \
    MPI_HOME=/opt/amazon/openmpi/ \
    CUDA_HOME=/usr/local/cuda \
    NCCL_HOME=/opt/nccl/build \
    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"

RUN rm -rf /var/lib/apt/lists/*

## Set Open MPI variables to exclude network interface and conduit.
ENV OMPI_MCA_pml=^ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
    OPAL_PREFIX=/opt/amazon/openmpi \
    NCCL_SOCKET_IFNAME=^docker,lo,veth

## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
ENV PMIX_MCA_gds=hash

## Set LD_PRELOAD for NCCL library
ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so

# Install Miniconda to not depend on the base image python
RUN mkdir -p /opt/miniconda3 \
    && curl -L https://repo.anaconda.com/miniconda/Miniconda3-py312_25.3.1-1-Linux-x86_64.sh -o /tmp/Miniconda3-py312_25.3.1-1-Linux-x86_64.sh \
    && bash /tmp/Miniconda3-py312_25.3.1-1-Linux-x86_64.sh -b -f -p /opt/miniconda3 \
    && rm /tmp/Miniconda3-py312_25.3.1-1-Linux-x86_64.sh \
    && /opt/miniconda3/bin/conda init bash

ENV PATH="/opt/miniconda3/bin:${PATH}"

ENV TORCH_CUDA_ARCH_LIST="9.0a"

# for colossalai we need torch <= 2.5.1:
RUN pip install torch==2.5.1

# because of https://discuss.huggingface.co/t/valueerror-unable-to-avoid-copy-while-creating-an-array-as-requested/93584/5
RUN pip install "numpy<2.0"

# Install tensornvme:
RUN apt update -y && apt install -y libaio-dev && pip install tensornvme

# to use the fused RMSNorm kernel colossalai needs apex built from source:
RUN git clone https://github.com/NVIDIA/apex /tmp/apex && \
    cd /tmp/apex && \
    NVCC_APPEND_FLAGS="--threads 4" \
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext --cuda_ext --parallel 8" ./

RUN MAX_JOBS=48 pip install flash-attn --no-build-isolation

RUN git clone https://github.com/hpcaitech/ColossalAI.git /tmp/colossalai && \
    cd /tmp/colossalai && \
    git checkout 46ed5d856b16b074325091a88e761544b3d4f9f0 && \
    # BUILD_EXT=1 FORCE_CUDA=1
    pip install . && \
    cd applications/ColossalChat && \
    pip install .
Lines changed: 2 additions & 0 deletions
DeepSeek-V3
logs
Lines changed: 63 additions & 0 deletions
# DeepSeek V3/R1/Prover-V2 671B SFT with LoRA

This example uses the Colossal-AI container from the parent directory.
## Download model weights

```bash
pip install -U "huggingface_hub[cli]"
```
Choose the model you want to finetune:

- deepseek-ai/DeepSeek-V3
- deepseek-ai/DeepSeek-V3-0324
- deepseek-ai/DeepSeek-R1
- deepseek-ai/DeepSeek-Prover-V2-671B

and define the model name environment variable, for example:
```bash
export MODEL_NAME="deepseek-ai/DeepSeek-Prover-V2-671B"
```
Download the model weights from Hugging Face and find the model path:
```bash
huggingface-cli download $MODEL_NAME
export MODEL_PATH=`python -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('$MODEL_NAME', filename='config.json')).parent)"`
export HF_HOME=${HF_HOME:-$(python -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('$MODEL_NAME', filename='config.json')).parent.parent.parent.parent.parent)")}
```
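
Optionally, verify that the resolved paths look sane (a quick check, not part of the original workflow):

```bash
ls "$MODEL_PATH"   # expect config.json plus the model *.safetensors shards
echo "$HF_HOME"    # root of the Hugging Face cache, mounted into the container below
```
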
## Convert fp8 weights to bf16

Since the released model weights are fp8 and SFT requires bf16 weights, we use `fp8_cast_bf16.py` from the `DeepSeek-V3` repo to convert them.

Clone the DeepSeek V3 repo:
```bash
git clone https://github.com/deepseek-ai/DeepSeek-V3.git
```
Launch the conversion job on the GPU node:
```bash
srun \
    --container-image ../colossalai.sqsh \
    --container-mounts ./:/workdir,$HF_HOME:$HF_HOME \
    python /workdir/DeepSeek-V3/inference/fp8_cast_bf16.py \
    --input-fp8-hf-path $MODEL_PATH \
    --output-bf16-hf-path /workdir/$MODEL_NAME-bf16
```
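
When the job finishes, the converted checkpoint lives under the mounted working directory; an optional check (the exact file set depends on the conversion script):

```bash
# /workdir inside the container maps to ./ on the host, so the output lands here:
ls ./$MODEL_NAME-bf16   # expect bf16 *.safetensors shards plus an index file
```
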
## Launch LoRA finetuning

```bash
sbatch lora_finetune.sbatch
```
Check the logs:
```bash
tail -f -n +0 slurm-XXX.out
```
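
The `lora_finetune.sbatch` file is part of this commit but its diff is not shown above. For orientation only, a minimal sketch of a launcher in the same style as the conversion job; the node count, training script path, and flags are illustrative assumptions, not the committed file:

```bash
#!/bin/bash
#SBATCH --job-name=lora-finetune
#SBATCH --nodes=8                # illustrative; adjust to your cluster
#SBATCH --exclusive

# Hypothetical launcher: runs a LoRA finetuning entry point inside the enroot
# container on every allocated node; paths and flags below are assumptions.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)

srun --container-image ../colossalai.sqsh \
    --container-mounts ./:/workdir,$HF_HOME:$HF_HOME \
    torchrun --nnodes "$SLURM_NNODES" --nproc_per_node 8 \
    --rdzv_backend c10d --rdzv_endpoint "$MASTER_ADDR:29500" \
    /workdir/lora_finetune.py --pretrained /workdir/$MODEL_NAME-bf16
```
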