4 files changed: +9 -7 lines changed
Original file line number Diff line number Diff line change
99import threading
1010import time
1111from contextlib import contextmanager
12- from dataclasses import is_dataclass , field , replace
12+ from dataclasses import is_dataclass , replace
1313from datetime import datetime
1414from enum import IntEnum
1515from functools import lru_cache
@@ -206,7 +206,7 @@ class VllmConfig:
206206 """The configurations for event publishing."""
207207 ec_transfer_config : ECTransferConfig | None = None
208208 """The configurations for distributed EC cache transfer."""
209- fault_tolerance_config : FaultToleranceConfig = field (
209+ fault_tolerance_config : FaultToleranceConfig = Field (
210210 default_factory = FaultToleranceConfig
211211 )
212212 """The configurations for fault tolerance."""
Original file line number Diff line number Diff line change
@@ -346,9 +346,7 @@ def __init__(
346346 # processes through the CPU.
347347 with suppress_stdout ():
348348 if not enable_fault_tolerance :
349- cpu_group = torch .distributed .new_group (
350- ranks , backend = "gloo"
351- )
349+ cpu_group = torch .distributed .new_group (ranks , backend = "gloo" )
352350 else :
353351 cpu_group = torch .distributed .new_group (
354352 ranks , backend = "gloo" , timeout = gloo_comm_timeout
@@ -1151,6 +1149,7 @@ def get_pcp_group() -> GroupCoordinator:
11511149 assert _PCP is not None , "prefill context parallel group is not initialized"
11521150 return _PCP
11531151
1152+
11541153def get_all_model_groups () -> list [GroupCoordinator ]:
11551154 group_list = []
11561155 global _TP
@@ -1179,6 +1178,7 @@ def get_all_model_groups() -> list[GroupCoordinator]:
11791178
11801179 return group_list
11811180
1181+
11821182@contextmanager
11831183def graph_capture (device : torch .device ):
11841184 """
Original file line number Diff line number Diff line change
@@ -215,7 +215,7 @@ def close(self):
215215
216216 def start_engine_core_monitor (self ):
217217 sentinels = [proc .sentinel for proc in self .processes ]
218- while self . processes :
218+ while sentinels :
219219 died = multiprocessing .connection .wait (sentinels )
220220 for sentinel in died :
221221 died_proc = next (
@@ -239,7 +239,7 @@ def start_engine_core_monitor(self):
239239 sentinels .remove (sentinel )
240240 logger .error (
241241 "Engine core proc %s died unexpectedly" ,
242- died_proc ,
242+ died_proc . name ,
243243 )
244244
245245 def join_first (self ):
Original file line number Diff line number Diff line change
@@ -174,6 +174,7 @@ def _abort_nccl_comm(group: GroupCoordinator):
174174 if group .device_communicator is not None :
175175 device_comm = cast (CudaCommunicator , group .device_communicator )
176176 nccl_comm = device_comm .pynccl_comm
177+ assert nccl_comm is not None
177178 nccl_comm .nccl_abort_comm ()
178179
179180 def _abort_process_group (group : GroupCoordinator ):
@@ -223,6 +224,7 @@ def _set_device_communicator_status(self, active: bool):
223224 if group .device_communicator is not None :
224225 device_comm = cast (CudaCommunicator , group .device_communicator )
225226 nccl_comm = device_comm .pynccl_comm
227+ assert nccl_comm is not None
226228 nccl_comm .available = active
227229 nccl_comm .disabled = not active
228230
You can’t perform that action at this time.
0 commit comments