9 changes: 8 additions & 1 deletion torchrec/distributed/test_utils/multi_process.py
@@ -107,9 +107,16 @@ def __init__(
     ) -> None:
         super().__init__(methodName)
 
+        # In CUDA 12.8 we're seeing hangs from using forkserver, so we're
+        # switching to spawn.
         # AMD's HIP runtime doesn't seem to work with forkserver; hipMalloc will fail
         # Therefore we use spawn for HIP runtime until AMD fixes the issue
-        self._mp_init_mode: str = mp_init_mode if torch.version.hip is None else "spawn"
+        if (
+            torch.version.cuda is not None and torch.version.cuda >= "12.8"
+        ) or torch.version.hip is not None:
+            self._mp_init_mode: str = "spawn"
+        else:
+            self._mp_init_mode: str = mp_init_mode
         logging.info(f"Using {self._mp_init_mode} for multiprocessing")
 
     @seed_and_log
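
For context, a minimal sketch of how a start-method string like self._mp_init_mode is typically consumed by a multi-process test launcher. The launch and _run_rank helpers below are hypothetical illustrations, not the actual torchrec harness, which may wire things up differently:

import torch.multiprocessing as mp


def _run_rank(rank: int) -> None:
    # Per-process test body would go here (e.g. init_process_group + assertions).
    print(f"rank {rank} started")


def launch(world_size: int, mp_init_mode: str) -> None:
    # "spawn" starts fresh interpreter processes, sidestepping the CUDA 12.8
    # forkserver hangs and the hipMalloc failures described in the diff above.
    ctx = mp.get_context(mp_init_mode)
    procs = [ctx.Process(target=_run_rank, args=(rank,)) for rank in range(world_size)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()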