# Multi-Node Deployment

This example demonstrates how to launch a 32-NPU deployment across 2 machines (16 NPUs per machine).

Launching Services on the First Machine:

```shell
bash start_deepseek_machine_1.sh
```

The `start_deepseek_machine_1.sh` script is as follows:
```bash title="start_deepseek_machine_1.sh" linenums="1"
# 1. Environment variable setup
export PYTHON_INCLUDE_PATH="$(python3 -c 'from sysconfig import get_paths; print(get_paths()["include"])')"
export PYTHON_LIB_PATH="$(python3 -c 'import sysconfig; print(sysconfig.get_config_var("LIBDIR"))')" # Directory containing the Python shared library
export PYTORCH_NPU_INSTALL_PATH=/usr/local/libtorch_npu/ # NPU version PyTorch path
export PYTORCH_INSTALL_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))')" # PyTorch installation path
export LIBTORCH_ROOT="$PYTORCH_INSTALL_PATH" # LibTorch path
export LD_LIBRARY_PATH=/usr/local/libtorch_npu/lib:$LD_LIBRARY_PATH # Add NPU library path

# 2. Load NPU environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export ASDOPS_LOG_TO_STDOUT=1
export ASDOPS_LOG_LEVEL=ERROR
export ASDOPS_LOG_TO_FILE=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export NPU_MEMORY_FRACTION=0.98
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=3
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
export OMP_NUM_THREADS=12
export HCCL_CONNECT_TIMEOUT=7200
export INF_NAN_MODE_ENABLE=0

# 3. Clean up old core dump files
\rm -rf core.*

# 4. Start distributed service
MODEL_PATH="/path/to/your/DeepSeek-R1" # Model path
MASTER_NODE_ADDR="123.123.123.123:9748" # Master node address (must be identical on every machine)
LOCAL_HOST=123.123.123.123 # Local IP of this machine (machine 1, the master node)
START_PORT=18000 # Service starting port; process i listens on START_PORT + i
START_DEVICE=0 # Starting NPU logical device number
LOCAL_NODES=16 # Number of processes launched by this script (one per local NPU)
NNODES=32 # Total number of NPU processes across both machines (32 in this example)
LOG_DIR="./logs" # Per-process log directory
mkdir -p "$LOG_DIR"

export HCCL_IF_BASE_PORT=43432 # HCCL communication base port

for (( i=0; i<$LOCAL_NODES; i++ ))
do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  # Launch one xllm process per NPU; each gets its own port, device, and global rank
  ./xllm/build/xllm/core/server/xllm \
    --model $MODEL_PATH \
    --host $LOCAL_HOST \
    --port $PORT \
    --devices="npu:$DEVICE" \
    --master_node_addr=$MASTER_NODE_ADDR \
    --nnodes=$NNODES \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=40000 \
    --max_seqs_per_batch=256 \
    --enable_mla=true \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=false \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --rank_tablefile=./ranktable_2s_32p.json \
    --node_rank=$i > "$LOG_FILE" 2>&1 &
done
```
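Before launching the second machine, it can be useful to confirm that all 16 local processes came up and are waiting for the remaining ranks to join. The following is a minimal sketch; it assumes the `LOG_DIR=./logs` setting used in the script above, and the process count is obtained by matching the launched binary's command line:

```shell
# Should print 16 once every local process has started
pgrep -fc "xllm/core/server/xllm"

# Follow the log of the first local process (path assumes LOG_DIR=./logs)
tail -f ./logs/node_0.log
```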

Launching Services on the Second Machine:

```shell
bash start_deepseek_machine_2.sh
```

The `start_deepseek_machine_2.sh` script is as follows:
```bash title="start_deepseek_machine_2.sh" linenums="1"
# 1. Environment variable setup
export PYTHON_INCLUDE_PATH="$(python3 -c 'from sysconfig import get_paths; print(get_paths()["include"])')"
export PYTHON_LIB_PATH="$(python3 -c 'import sysconfig; print(sysconfig.get_config_var("LIBDIR"))')" # Directory containing the Python shared library
export PYTORCH_NPU_INSTALL_PATH=/usr/local/libtorch_npu/ # NPU version PyTorch path
export PYTORCH_INSTALL_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))')" # PyTorch installation path
export LIBTORCH_ROOT="$PYTORCH_INSTALL_PATH" # LibTorch path
export LD_LIBRARY_PATH=/usr/local/libtorch_npu/lib:$LD_LIBRARY_PATH # Add NPU library path

# 2. Load NPU environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export ASDOPS_LOG_TO_STDOUT=1
export ASDOPS_LOG_LEVEL=ERROR
export ASDOPS_LOG_TO_FILE=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export NPU_MEMORY_FRACTION=0.98
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=3
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
export OMP_NUM_THREADS=12
export HCCL_CONNECT_TIMEOUT=7200
export INF_NAN_MODE_ENABLE=0

# 3. Clean up old core dump files
\rm -rf core.*

# 4. Start distributed service
MODEL_PATH="/path/to/your/DeepSeek-R1" # Model path
MASTER_NODE_ADDR="123.123.123.123:9748" # Master node address (must be identical on every machine)
LOCAL_HOST=124.124.124.124 # Local IP of this machine (machine 2; replace with its real address)
START_PORT=18000 # Service starting port; process i listens on START_PORT + i
START_DEVICE=0 # Starting NPU logical device number
LOCAL_NODES=16 # Number of processes launched by this script (one per local NPU)
NNODES=32 # Total number of NPU processes across both machines (32 in this example)
LOG_DIR="./logs" # Per-process log directory
mkdir -p "$LOG_DIR"

export HCCL_IF_BASE_PORT=43432 # HCCL communication base port

for (( i=0; i<$LOCAL_NODES; i++ ))
do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  # Global ranks on this machine start at LOCAL_NODES (16), after the first machine's ranks 0-15
  ./xllm/build/xllm/core/server/xllm \
    --model $MODEL_PATH \
    --host $LOCAL_HOST \
    --port $PORT \
    --devices="npu:$DEVICE" \
    --master_node_addr=$MASTER_NODE_ADDR \
    --nnodes=$NNODES \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=40000 \
    --max_seqs_per_batch=256 \
    --enable_mla=true \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=false \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --rank_tablefile=./ranktable_2s_32p.json \
    --node_rank=$((i + LOCAL_NODES)) > "$LOG_FILE" 2>&1 &
done
```
This example uses 2 machines with 16 NPUs each. `--nnodes` sets the total number of NPU processes across all machines, and `--node_rank` assigns each process its global rank: 0–15 on the first machine and 16–31 on the second.

The `--rank_tablefile=./ranktable_2s_32p.json` parameter points to the configuration file required for establishing the NPU communication domain. For instructions on generating this file, refer to [Ranktable Generation](https://gitee.com/mindspore/models/blob/master/utils/hccl_tools/README.md).
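Once all 32 processes on both machines have started and joined the communication domain, the deployment can be smoke-tested by sending a request to the master machine's first service port. This is a minimal sketch: it assumes the service exposes an OpenAI-compatible chat completions endpoint and that the served model is addressable as `DeepSeek-R1`; adjust the host, port, path, and model name to your deployment.

```shell
curl http://123.123.123.123:18000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "DeepSeek-R1",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 32
      }'
```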