4545from vllm .v1 .core .sched .interface import SchedulerInterface
4646from vllm .v1 .core .sched .output import SchedulerOutput
4747from vllm .v1 .engine import (
48- EngineCoreOutput ,
4948 EngineCoreOutputs ,
5049 EngineCoreRequest ,
5150 EngineCoreRequestType ,
52- FinishReason ,
5351 ReconfigureDistributedRequest ,
5452 ReconfigureRankType ,
5553 UtilityOutput ,
@@ -179,13 +177,16 @@ def run(self) -> None:
179177 self .engine_running = False
180178 except queue .Empty :
181179 pass
182-
183- if self .client_cmd_socket .closed :
184- self .logger ("Client socket closed" , level = "info" )
180+ try :
181+ has_msg , _ , cmd_str = recv_router_dealer_message (
182+ self .client_cmd_socket ,
183+ use_poller = True ,
184+ poll_timeout = poll_timeout_ms ,
185+ )
186+ except zmq .ZMQError :
187+ self .logger ("Socket closed, terminating EngineCoreGuard" , level = "info" )
185188 break
186- has_msg , _ , cmd_str = recv_router_dealer_message (
187- self .client_cmd_socket , use_poller = True , poll_timeout = poll_timeout_ms
188- )
189+
189190 if has_msg :
190191 self .logger ("Received cmd: %s" , cmd_str , level = "info" )
191192 self ._execute_cmd (cmd_str )
@@ -204,7 +205,7 @@ def _execute_worker_method(self, method_name, timeout: int = 5, **kwargs) -> boo
204205 identities = set ()
205206 for tp_rank in range (self .tp_size ):
206207 for pp_rank in range (self .pp_size ):
207- identity = f"{ tp_rank } _{ pp_rank } " .encode ()
208+ identity = f"{ pp_rank } _{ tp_rank } " .encode ()
208209 identities .add (identity )
209210
210211 method_uuid = broadcast_instruction (
@@ -286,26 +287,30 @@ def pause(self, timeout: int = 1, soft_pause: bool = True) -> bool:
286287 success = True
287288 if not soft_pause :
288289 # abort the communicators
289- self ._stop_worker_execution (soft_pause = False , timeout = timeout )
290+ success = self ._stop_worker_execution (soft_pause = False , timeout = timeout )
290291 return success
291292
292- def retry (self , timeout : int = 1 ):
293+ def retry (self , new_stateless_dp_group_port : int , timeout : int = 1 ):
293294 """
294295 Handle the retry instruction from the ClientGuard.
295296 This instruction tells the EngineCore to continue its busy loop
296297 after being suspended due to an exception.
297298 """
298299 start_time = time .monotonic ()
299300
300- success = self ._execute_worker_method ("restart_worker " , timeout = timeout )
301+ success = self ._execute_worker_method ("restore_worker " , timeout = timeout )
301302 if not success :
302303 return success
303304
304305 if self .dp_size > 1 :
305306 # If the Gloo communication times out
306307 # the data parallel group (dp_group) needs to be reinitialized
307308 command = "reinit_dp_group_on_fault_tolerance"
308- self .cmd_q .put (serialize_method_call (command ))
309+ self .cmd_q .put (
310+ serialize_method_call (
311+ command , new_stateless_dp_group_port = new_stateless_dp_group_port
312+ )
313+ )
309314 else :
310315 self .cmd_q .put (None )
311316
@@ -1473,21 +1478,6 @@ def process_output_sockets(
14731478 # Limit the number of buffers to reuse.
14741479 reuse_buffers .append (buffer )
14751480
1476- def engine_finish_requests (self ):
1477- assert isinstance (self .scheduler , V1Scheduler )
1478- engine_finish_outputs = EngineCoreOutputs ()
1479- engine_finish_outputs .engine_index = self .engine_index
1480- for request_id in list (self .scheduler .requests .keys ()):
1481- self .scheduler .finish_requests (request_id , RequestStatus .FINISHED_ABORTED )
1482- engine_finish_outputs .outputs .append (
1483- EngineCoreOutput (
1484- request_id = request_id ,
1485- finish_reason = FinishReason .ABORT ,
1486- new_token_ids = [],
1487- )
1488- )
1489- self .output_queue .put ((0 , engine_finish_outputs ))
1490-
14911481 def shutdown (self ):
14921482 super ().shutdown ()
14931483 if self .vllm_config .fault_tolerance_config .enable_fault_tolerance :
@@ -1549,7 +1539,6 @@ def _init_data_parallel(self, vllm_config: VllmConfig):
15491539 self .dp_rank = dp_rank
15501540 self .dp_group = vllm_config .parallel_config .stateless_init_dp_group (
15511541 vllm_config .fault_tolerance_config .gloo_comm_timeout ,
1552- vllm_config .fault_tolerance_config .enable_fault_tolerance ,
15531542 )
15541543
15551544 def shutdown (self ):
@@ -1658,12 +1647,13 @@ def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
16581647
16591648 return ParallelConfig .has_unfinished_dp (self .dp_group , local_unfinished )
16601649
1661- def reinit_dp_group_on_fault_tolerance (self ):
1650+ def reinit_dp_group_on_fault_tolerance (self , new_stateless_dp_group_port : int ):
16621651 stateless_destroy_torch_distributed_process_group (self .dp_group )
16631652 self .dp_group = self .vllm_config .parallel_config .stateless_init_dp_group (
16641653 self .vllm_config .fault_tolerance_config .gloo_comm_timeout ,
1665- self . vllm_config . fault_tolerance_config . enable_fault_tolerance ,
1654+ new_stateless_dp_group_port ,
16661655 )
1656+ self .step_counter = 0
16671657
16681658 def reinitialize_distributed (
16691659 self , reconfig_request : ReconfigureDistributedRequest
0 commit comments