
Build the device_map more automatically for multi-GPUs #472

Open · wants to merge 1 commit into main
utils.py: 7 additions & 38 deletions (45 changes)
@@ -5,43 +5,6 @@
from transformers import AutoModel


def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # transformer.word_embeddings counts as 1 layer
    # transformer.final_layernorm and lm_head together count as 1 layer
    # transformer.layers accounts for 28 layers
    # 30 layers in total, distributed across num_gpus GPUs
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # bugfix: on Linux, torch.embedding is called with weight and input on different devices, raising a RuntimeError
    # on Windows, model.device is set to transformer.word_embeddings.device
    # on Linux, model.device is set to lm_head.device
    # when chat or stream_chat is called, input_ids is placed on model.device
    # if transformer.word_embeddings.device differs from model.device, a RuntimeError is raised
    # so transformer.word_embeddings, transformer.final_layernorm and lm_head are all placed on the first GPU
    # this file comes from https://github.com/THUDM/ChatGLM-6B/blob/main/utils.py
    # with only minor changes here to support ChatGLM2
    device_map = {
        'transformer.embedding.word_embeddings': 0,
        'transformer.encoder.final_layernorm': 0,
        'transformer.output_layer': 0,
        'transformer.rotary_pos_emb': 0,
        'lm_head': 0
    }

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.encoder.layers.{i}'] = gpu_target
        used += 1

    return device_map


def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
    if num_gpus < 2 and device_map is None:
@@ -52,7 +15,13 @@ def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int =
    model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half()

    if device_map is None:
        device_map = auto_configure_device_map(num_gpus)
        from accelerate import infer_auto_device_map

        device_map = infer_auto_device_map(model, no_split_module_classes=["GLMBlock"])
        # e.g. use max_memory to cap how much memory each device may receive.
        # Hugging Face suggests leaving some memory headroom on GPU 0.
        # device_map = infer_auto_device_map(model, max_memory={0: "4GiB", 1: "10GiB", "cpu": "30GiB"}, no_split_module_classes=["GLMBlock"])
        # print(device_map)

    model = dispatch_model(model, device_map=device_map)

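For comparison, the fixed split that the removed auto_configure_device_map built for num_gpus=2 looks roughly like this (a sketch derived from the deleted loop above; old_style_map is an illustrative name, not part of the code):

# Sketch of the mapping the removed helper produced for two GPUs.
old_style_map = {
    'transformer.embedding.word_embeddings': 0,
    'transformer.encoder.final_layernorm': 0,
    'transformer.output_layer': 0,
    'transformer.rotary_pos_emb': 0,
    'lm_head': 0,
}
old_style_map.update({f'transformer.encoder.layers.{i}': 0 for i in range(13)})      # layers 0-12 on GPU 0
old_style_map.update({f'transformer.encoder.layers.{i}': 1 for i in range(13, 28)})  # layers 13-27 on GPU 1

With infer_auto_device_map, the split is instead derived from the actual module sizes and the available (or max_memory-capped) device memory.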
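A minimal usage sketch of the updated helper, assuming two visible GPUs and the official THUDM/chatglm2-6b checkpoint (the checkpoint name and prompt are illustrative, not taken from this PR):

from transformers import AutoTokenizer
from utils import load_model_on_gpus

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
# Let accelerate infer the per-GPU split instead of hand-writing a device_map.
model = load_model_on_gpus("THUDM/chatglm2-6b", num_gpus=2)
model = model.eval()

response, history = model.chat(tokenizer, "Hello", history=[])
print(response)

An explicit device_map can still be passed through the device_map argument to override the inferred split.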