Skip to content
This repository was archived by the owner on Oct 9, 2024. It is now read-only.

Commit a9bc5fd

Browse files
anselmwang and mayank31398
authored and committed
fix: deadlock in bloom-ds-inference.py (#40)
In `bloom-ds-inference.py`, rank 0 calls `dist.barrier()` while the other processes do not, which causes a permanent deadlock.
1 parent 4a316ba commit a9bc5fd

File tree

1 file changed

+5
-13
lines changed

1 file changed

+5
-13
lines changed

bloom-inference-scripts/bloom-ds-inference.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -151,18 +151,11 @@ def get_checkpoint_files(model_name_or_path):
151151
checkpoints_json = "checkpoints.json"
152152

153153

154-
def write_checkponts_json():
155-
156-
with io.open(checkpoints_json, "w", encoding="utf-8") as f:
157-
158-
# checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin")
159-
checkpoint_files = get_checkpoint_files(model_name)
160-
161-
# print("Checkpoint files:", checkpoint_files)
162-
154+
def write_checkpoints_json():
155+
checkpoint_files = get_checkpoint_files(model_name)
156+
if rank == 0:
163157
data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
164-
165-
json.dump(data, f)
158+
json.dump(data, open(checkpoints_json, "w"))
166159

167160

168161
if args.benchmark:
@@ -181,8 +174,7 @@ def write_checkponts_json():
181174
checkpoints_json = os.path.join(repo_root, "ds_inference_config.json")
182175
else:
183176
# for normal bloom repo we need to write the checkpoints config file
184-
if rank == 0:
185-
write_checkponts_json()
177+
write_checkpoints_json()
186178
dist.barrier()
187179

188180
# checkpoints_json=None

0 commit comments

Comments
 (0)