4545from vllm .v1 .core .sched .interface import SchedulerInterface
4646from vllm .v1 .core .sched .output import SchedulerOutput
4747from vllm .v1 .engine import (
48- EngineCoreOutput ,
4948 EngineCoreOutputs ,
5049 EngineCoreRequest ,
5150 EngineCoreRequestType ,
52- FinishReason ,
5351 ReconfigureDistributedRequest ,
5452 ReconfigureRankType ,
5553 UtilityOutput ,
@@ -180,13 +178,16 @@ def run(self) -> None:
180178 self .engine_running = False
181179 except queue .Empty :
182180 pass
183-
184- if self .client_cmd_socket .closed :
185- self .logger ("Client socket closed" , level = "info" )
181+ try :
182+ has_msg , _ , cmd_str = recv_router_dealer_message (
183+ self .client_cmd_socket ,
184+ use_poller = True ,
185+ poll_timeout = poll_timeout_ms ,
186+ )
187+ except zmq .ZMQError :
188+ self .logger ("Socket closed, terminating EngineCoreGuard" , level = "info" )
186189 break
187- has_msg , _ , cmd_str = recv_router_dealer_message (
188- self .client_cmd_socket , use_poller = True , poll_timeout = poll_timeout_ms
189- )
190+
190191 if has_msg :
191192 self .logger ("Received cmd: %s" , cmd_str , level = "info" )
192193 self ._execute_cmd (cmd_str )
@@ -205,7 +206,7 @@ def _execute_worker_method(self, method_name, timeout: int = 5, **kwargs) -> boo
205206 identities = set ()
206207 for tp_rank in range (self .tp_size ):
207208 for pp_rank in range (self .pp_size ):
208- identity = f"{ tp_rank } _{ pp_rank } " .encode ()
209+ identity = f"{ pp_rank } _{ tp_rank } " .encode ()
209210 identities .add (identity )
210211
211212 method_uuid = broadcast_instruction (
@@ -287,26 +288,30 @@ def pause(self, timeout: int = 1, soft_pause: bool = True) -> bool:
287288 success = True
288289 if not soft_pause :
289290 # abort the communicators
290- self ._stop_worker_execution (soft_pause = False , timeout = timeout )
291+ success = self ._stop_worker_execution (soft_pause = False , timeout = timeout )
291292 return success
292293
293- def retry (self , timeout : int = 1 ):
294+ def retry (self , new_stateless_dp_group_port : int , timeout : int = 1 ):
294295 """
295296 Handle the retry instruction from the ClientGuard.
296297 This instruction tells the EngineCore to continue its busy loop
297298 after being suspended due to an exception.
298299 """
299300 start_time = time .monotonic ()
300301
301- success = self ._execute_worker_method ("restart_worker " , timeout = timeout )
302+ success = self ._execute_worker_method ("restore_worker " , timeout = timeout )
302303 if not success :
303304 return success
304305
305306 if self .dp_size > 1 :
306307 # If the Gloo communication times out
307308 # the data parallel group (dp_group) needs to be reinitialized
308309 command = "reinit_dp_group_on_fault_tolerance"
309- self .cmd_q .put (serialize_method_call (command ))
310+ self .cmd_q .put (
311+ serialize_method_call (
312+ command , new_stateless_dp_group_port = new_stateless_dp_group_port
313+ )
314+ )
310315 else :
311316 self .cmd_q .put (None )
312317
@@ -1486,21 +1491,6 @@ def process_output_sockets(
14861491 # Limit the number of buffers to reuse.
14871492 reuse_buffers .append (buffer )
14881493
1489- def engine_finish_requests (self ):
1490- assert isinstance (self .scheduler , V1Scheduler )
1491- engine_finish_outputs = EngineCoreOutputs ()
1492- engine_finish_outputs .engine_index = self .engine_index
1493- for request_id in list (self .scheduler .requests .keys ()):
1494- self .scheduler .finish_requests (request_id , RequestStatus .FINISHED_ABORTED )
1495- engine_finish_outputs .outputs .append (
1496- EngineCoreOutput (
1497- request_id = request_id ,
1498- finish_reason = FinishReason .ABORT ,
1499- new_token_ids = [],
1500- )
1501- )
1502- self .output_queue .put ((0 , engine_finish_outputs ))
1503-
15041494 def shutdown (self ):
15051495 super ().shutdown ()
15061496 if self .vllm_config .fault_tolerance_config .enable_fault_tolerance :
@@ -1562,7 +1552,6 @@ def _init_data_parallel(self, vllm_config: VllmConfig):
15621552 self .dp_rank = dp_rank
15631553 self .dp_group = vllm_config .parallel_config .stateless_init_dp_group (
15641554 vllm_config .fault_tolerance_config .gloo_comm_timeout ,
1565- vllm_config .fault_tolerance_config .enable_fault_tolerance ,
15661555 )
15671556
15681557 def shutdown (self ):
@@ -1671,12 +1660,13 @@ def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
16711660
16721661 return ParallelConfig .has_unfinished_dp (self .dp_group , local_unfinished )
16731662
def reinit_dp_group_on_fault_tolerance(self, new_stateless_dp_group_port: int):
    """Replace the data-parallel process group during fault recovery.

    The current ``self.dp_group`` is destroyed, a new group is created via
    ``parallel_config.stateless_init_dp_group`` and stored back on
    ``self.dp_group``, and ``self.step_counter`` is reset to 0.

    Args:
        new_stateless_dp_group_port: Forwarded unchanged as the second
            positional argument of ``stateless_init_dp_group``.
            NOTE(review): presumably the port the rebuilt group should
            rendezvous on -- confirm against ``ParallelConfig``.
    """
    # Tear down the old group before building its replacement.
    stateless_destroy_torch_distributed_process_group(self.dp_group)

    config = self.vllm_config
    comm_timeout = config.fault_tolerance_config.gloo_comm_timeout
    rebuilt_group = config.parallel_config.stateless_init_dp_group(
        comm_timeout,
        new_stateless_dp_group_port,
    )
    self.dp_group = rebuilt_group

    # Restart step counting from zero now that the group has been rebuilt.
    self.step_counter = 0
16801670
16811671 def reinitialize_distributed (
16821672 self , reconfig_request : ReconfigureDistributedRequest
0 commit comments