
Commit bb2628d

docs: add multi-machine deployment documentation. (#278)
1 parent 195fea1 commit bb2628d

2 files changed: +283 −0 lines changed

Lines changed: 141 additions & 0 deletions
# Multi-Node Deployment

This example demonstrates how to launch a 32-card (NPU) deployment across two machines.

Launching Services on the First Machine:

```shell
bash start_deepseek_machine_1.sh
```

The `start_deepseek_machine_1.sh` script is as follows:
```bash title="start_deepseek_machine_1.sh" linenums="1"
# 1. Environment variable setup
export PYTHON_INCLUDE_PATH="$(python3 -c 'from sysconfig import get_paths; print(get_paths()["include"])')"
export PYTHON_LIB_PATH="$(python3 -c 'import sysconfig; print(sysconfig.get_config_var("LIBDIR"))')" # Python library path
export PYTORCH_NPU_INSTALL_PATH=/usr/local/libtorch_npu/ # NPU version PyTorch path
export PYTORCH_INSTALL_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))')" # PyTorch installation path
export LIBTORCH_ROOT="$PYTORCH_INSTALL_PATH" # LibTorch path
export LD_LIBRARY_PATH=/usr/local/libtorch_npu/lib:$LD_LIBRARY_PATH # Add NPU library path

# 2. Load NPU environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export ASDOPS_LOG_TO_STDOUT=1
export ASDOPS_LOG_LEVEL=ERROR
export ASDOPS_LOG_TO_FILE=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export NPU_MEMORY_FRACTION=0.98
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=3
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
export OMP_NUM_THREADS=12
export HCCL_CONNECT_TIMEOUT=7200
export INF_NAN_MODE_ENABLE=0

# 3. Clean up old core dumps
\rm -rf core.*

# 4. Start distributed service
MODEL_PATH="/path/to/your/DeepSeek-R1"  # Model path
MASTER_NODE_ADDR="123.123.123.123:9748" # Master node address (must be globally consistent)
LOCAL_HOST=123.123.123.123              # Local IP for service launch
START_PORT=18000                        # Service starting port
START_DEVICE=0                          # Starting NPU logical device number
LOG_DIR="log"                           # Log directory
LOCAL_NODES=16                          # Number of local processes (this script launches 16 processes)
NNODES=32                               # Total number of NPUs (32 in this 2-machine example)

export HCCL_IF_BASE_PORT=43432          # HCCL communication base port

mkdir -p "$LOG_DIR"                     # Make sure the log directory exists

for (( i=0; i<$LOCAL_NODES; i++ ))
do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  ./xllm/build/xllm/core/server/xllm \
    --model "$MODEL_PATH" \
    --host "$LOCAL_HOST" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes=$NNODES \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=40000 \
    --max_seqs_per_batch=256 \
    --enable_mla=true \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=false \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --rank_tablefile=./ranktable_2s_32p.json \
    --node_rank=$i \
    > "$LOG_FILE" 2>&1 &               # Write each process's output to its own log file
done
```
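
The service becomes ready once the processes on both machines have joined the communication domain. After that, you can sanity-check the deployment by sending a request to any of the service ports. This is a minimal sketch, assuming the server exposes an OpenAI-compatible `/v1/chat/completions` endpoint and that the model name below is accepted; adjust the path and payload to match the API your xllm build actually serves.

```shell
# Hedged smoke test: assumes an OpenAI-compatible chat endpoint;
# the model name "DeepSeek-R1" is a placeholder.
curl http://123.123.123.123:18000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "DeepSeek-R1",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 32
      }'
```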

Launching Services on the Second Machine:

```shell
bash start_deepseek_machine_2.sh
```

The `start_deepseek_machine_2.sh` script is as follows:
```bash title="start_deepseek_machine_2.sh" linenums="1"
# 1. Environment variable setup
export PYTHON_INCLUDE_PATH="$(python3 -c 'from sysconfig import get_paths; print(get_paths()["include"])')"
export PYTHON_LIB_PATH="$(python3 -c 'import sysconfig; print(sysconfig.get_config_var("LIBDIR"))')" # Python library path
export PYTORCH_NPU_INSTALL_PATH=/usr/local/libtorch_npu/ # NPU version PyTorch path
export PYTORCH_INSTALL_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))')" # PyTorch installation path
export LIBTORCH_ROOT="$PYTORCH_INSTALL_PATH" # LibTorch path
export LD_LIBRARY_PATH=/usr/local/libtorch_npu/lib:$LD_LIBRARY_PATH # Add NPU library path

# 2. Load NPU environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export ASDOPS_LOG_TO_STDOUT=1
export ASDOPS_LOG_LEVEL=ERROR
export ASDOPS_LOG_TO_FILE=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export NPU_MEMORY_FRACTION=0.98
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=3
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
export OMP_NUM_THREADS=12
export HCCL_CONNECT_TIMEOUT=7200
export INF_NAN_MODE_ENABLE=0

# 3. Clean up old core dumps
\rm -rf core.*

# 4. Start distributed service
MODEL_PATH="/path/to/your/DeepSeek-R1"  # Model path
MASTER_NODE_ADDR="123.123.123.123:9748" # Master node address (must be globally consistent)
LOCAL_HOST=456.456.456.456              # Local IP for service launch
START_PORT=18000                        # Service starting port
START_DEVICE=0                          # Starting NPU logical device number
LOG_DIR="log"                           # Log directory
LOCAL_NODES=16                          # Number of local processes (this script launches 16 processes)
NNODES=32                               # Total number of NPUs (32 in this 2-machine example)

export HCCL_IF_BASE_PORT=43432          # HCCL communication base port

mkdir -p "$LOG_DIR"                     # Make sure the log directory exists

for (( i=0; i<$LOCAL_NODES; i++ ))
do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  ./xllm/build/xllm/core/server/xllm \
    --model "$MODEL_PATH" \
    --host "$LOCAL_HOST" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes=$NNODES \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=40000 \
    --max_seqs_per_batch=256 \
    --enable_mla=true \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=false \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --rank_tablefile=./ranktable_2s_32p.json \
    --node_rank=$((i + LOCAL_NODES)) \
    > "$LOG_FILE" 2>&1 &               # Write each process's output to its own log file
done
```
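
Once both machines are up, a quick way to confirm that every local process is listening is to probe each service port. This is a minimal sketch, assuming the ports from the scripts above and that `curl` is available; run it on each machine with that machine's `LOCAL_HOST`:

```shell
HOST=456.456.456.456   # this machine's LOCAL_HOST
for p in $(seq 18000 18015); do
  # Any HTTP response counts as "listening"; a connection failure usually
  # means the corresponding process did not start (check its log).
  if curl -s --max-time 2 -o /dev/null "http://$HOST:$p"; then
    echo "port $p: listening"
  else
    echo "port $p: not responding (check log/node_$((p - 18000)).log)"
  fi
done
```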

This example uses two machines. `--nnodes` sets the total number of NPUs in the deployment, and `--node_rank` gives each process its global rank ID: machine 1 launches ranks 0-15 and machine 2 launches ranks 16-31.

The `--rank_tablefile=./ranktable_2s_32p.json` parameter points to the configuration file required for establishing the NPU communication domain. For instructions on generating this file, refer to [Ranktable Generation](https://gitee.com/mindspore/models/blob/master/utils/hccl_tools/README.md).
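
For orientation, the generated file typically follows the Ascend HCCL ranktable v1.0 schema, along the lines of the sketch below. The server and device IPs are placeholders, only 2 of the 16 devices per machine are shown, and the real file should always come from the generation tool rather than being written by hand:

```json
{
  "version": "1.0",
  "status": "completed",
  "server_count": "2",
  "server_list": [
    {
      "server_id": "123.123.123.123",
      "device": [
        {"device_id": "0", "device_ip": "192.168.100.101", "rank_id": "0"},
        {"device_id": "1", "device_ip": "192.168.100.102", "rank_id": "1"}
      ]
    },
    {
      "server_id": "456.456.456.456",
      "device": [
        {"device_id": "0", "device_ip": "192.168.200.101", "rank_id": "16"},
        {"device_id": "1", "device_ip": "192.168.200.102", "rank_id": "17"}
      ]
    }
  ]
}
```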
Lines changed: 142 additions & 0 deletions
# Multi-Machine Deployment

This example launches a two-machine, 32-card deployment. Start the service on the first machine:

```shell
bash start_deepseek_machine_1.sh
```

The `start_deepseek_machine_1.sh` script is as follows:

```bash title="start_deepseek_machine_1.sh" linenums="1"
# 1. Environment variable setup
export PYTHON_INCLUDE_PATH="$(python3 -c 'from sysconfig import get_paths; print(get_paths()["include"])')"
export PYTHON_LIB_PATH="$(python3 -c 'import sysconfig; print(sysconfig.get_config_var("LIBDIR"))')" # Python library path
export PYTORCH_NPU_INSTALL_PATH=/usr/local/libtorch_npu/ # NPU version PyTorch path
export PYTORCH_INSTALL_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))')" # PyTorch installation path
export LIBTORCH_ROOT="$PYTORCH_INSTALL_PATH" # LibTorch path
export LD_LIBRARY_PATH=/usr/local/libtorch_npu/lib:$LD_LIBRARY_PATH # Add NPU library path

# 2. Load the environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export ASDOPS_LOG_TO_STDOUT=1
export ASDOPS_LOG_LEVEL=ERROR
export ASDOPS_LOG_TO_FILE=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export NPU_MEMORY_FRACTION=0.98
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=3
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
export OMP_NUM_THREADS=12
export HCCL_CONNECT_TIMEOUT=7200
export INF_NAN_MODE_ENABLE=0

# 3. Clean up old core dumps
\rm -rf core.*

# 4. Start the distributed service

MODEL_PATH="/path/to/your/DeepSeek-R1"  # Model path
MASTER_NODE_ADDR="123.123.123.123:9748" # Master node address (must be globally consistent)
LOCAL_HOST=123.123.123.123              # Local IP for the service
START_PORT=18000                        # Starting service port
START_DEVICE=0                          # Starting NPU logical device number
LOG_DIR="log"                           # Log directory
LOCAL_NODES=16                          # Processes per machine (this script launches 16)
NNODES=32                               # Total number of cards (2 machines, 32 cards in this example)

export HCCL_IF_BASE_PORT=43432          # HCCL communication base port

mkdir -p "$LOG_DIR"                     # Make sure the log directory exists

for (( i=0; i<$LOCAL_NODES; i++ ))
do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  ./xllm/build/xllm/core/server/xllm \
    --model "$MODEL_PATH" \
    --host "$LOCAL_HOST" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes=$NNODES \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=40000 \
    --max_seqs_per_batch=256 \
    --enable_mla=true \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=false \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --rank_tablefile=./ranktable_2s_32p.json \
    --node_rank=$i \
    > "$LOG_FILE" 2>&1 &               # Write each process's output to its own log file
done
```

Start the service on the second machine:

```shell
bash start_deepseek_machine_2.sh
```

The `start_deepseek_machine_2.sh` script is as follows:

```bash title="start_deepseek_machine_2.sh" linenums="1"
# 1. Environment variable setup
export PYTHON_INCLUDE_PATH="$(python3 -c 'from sysconfig import get_paths; print(get_paths()["include"])')"
export PYTHON_LIB_PATH="$(python3 -c 'import sysconfig; print(sysconfig.get_config_var("LIBDIR"))')" # Python library path
export PYTORCH_NPU_INSTALL_PATH=/usr/local/libtorch_npu/ # NPU version PyTorch path
export PYTORCH_INSTALL_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))')" # PyTorch installation path
export LIBTORCH_ROOT="$PYTORCH_INSTALL_PATH" # LibTorch path
export LD_LIBRARY_PATH=/usr/local/libtorch_npu/lib:$LD_LIBRARY_PATH # Add NPU library path

# 2. Load the environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export ASDOPS_LOG_TO_STDOUT=1
export ASDOPS_LOG_LEVEL=ERROR
export ASDOPS_LOG_TO_FILE=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export NPU_MEMORY_FRACTION=0.98
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=3
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
export OMP_NUM_THREADS=12
export HCCL_CONNECT_TIMEOUT=7200
export INF_NAN_MODE_ENABLE=0

# 3. Clean up old core dumps
\rm -rf core.*

# 4. Start the distributed service

MODEL_PATH="/path/to/your/DeepSeek-R1"  # Model path
MASTER_NODE_ADDR="123.123.123.123:9748" # Master node address (must be globally consistent)
LOCAL_HOST=456.456.456.456              # Local IP for the service
START_PORT=18000                        # Starting service port
START_DEVICE=0                          # Starting NPU logical device number
LOG_DIR="log"                           # Log directory
LOCAL_NODES=16                          # Processes per machine (this script launches 16)
NNODES=32                               # Total number of cards (2 machines, 32 cards in this example)

export HCCL_IF_BASE_PORT=43432          # HCCL communication base port

mkdir -p "$LOG_DIR"                     # Make sure the log directory exists

for (( i=0; i<$LOCAL_NODES; i++ ))
do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  ./xllm/build/xllm/core/server/xllm \
    --model "$MODEL_PATH" \
    --host "$LOCAL_HOST" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes=$NNODES \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=40000 \
    --max_seqs_per_batch=256 \
    --enable_mla=true \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=false \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --rank_tablefile=./ranktable_2s_32p.json \
    --node_rank=$((i + LOCAL_NODES)) \
    > "$LOG_FILE" 2>&1 &               # Write each process's output to its own log file
done
```

This example uses two machines. `--nnodes` sets the total number of cards, and `--node_rank` is each process's global rank ID. The `--rank_tablefile=./ranktable_2s_32p.json` file is required to build the NPU communication domain; see [Ranktable Generation](https://gitee.com/mindspore/models/blob/master/utils/hccl_tools/README.md) for how to generate it.
