Fix DT and zmq socket closing issues, update names per feedback, and reinitialize dp_group with new port

fangyuchu · fangyuchu · commit 3b203d6cb84f · 2025-11-18T17:36:23.000+08:00
Signed-off-by: fangyuchu <fangyuchu@qq.com>
diff --git a/tests/v1/engine/test_client_guard.py b/tests/v1/engine/test_client_guard.py
@@ -183,7 +183,7 @@ def test_shutdown_guard():
 @pytest.mark.asyncio
 async def test_handle_fault_async():
     engine_exception_q: asyncio.Queue[FaultInfo] = asyncio.Queue()
-    engine_status_dict = create_test_thread_safe_dict({1: "Unhealthy"})
+    engine_status_dict = create_test_thread_safe_dict({0: "Unhealthy"})
     guard = create_client_guard(engine_exception_q, engine_status_dict)
 
     time.sleep(0.1)
@@ -208,7 +208,7 @@ def response_cmd(cmd_socket):
         nonlocal uuid
         while uuid is None:
             time.sleep(0.1)
-        execute_result = {"engine_index": 1, "success": True, "method_uuid": uuid}
+        execute_result = {"engine_index": 0, "success": True, "method_uuid": uuid}
         cmd_socket.send_multipart([b"", json.dumps(execute_result).encode("utf-8")])
 
     threading.Thread(target=receive_cmd, args=(cmd_socket,)).start()
@@ -217,6 +217,6 @@ def response_cmd(cmd_socket):
     result = await guard.handle_fault("retry", 3)
 
     assert result is True
-    assert engine_status_dict[1] == "Healthy"
+    assert engine_status_dict[0] == "Healthy"
 
     guard.shutdown_guard()
diff --git a/tests/v1/engine/test_engine_core_guard.py b/tests/v1/engine/test_engine_core_guard.py
@@ -38,6 +38,7 @@ def create_engine_core_guard(
         guard_identity=GUARD_IDENTITY,
         tp_size=1,
         pp_size=1,
+        dp_size=1,
     )
 
 
@@ -101,6 +102,8 @@ def mock_worker_receiver(cmd_socket):
     param = {"timeout": 3}
     if instruction == "pause":
         param["soft_pause"] = True
+    elif instruction == "retry":
+        param["new_stateless_dp_group_port"] = 23456
     serial_instruction = serialize_method_call(instruction, **param)
     client_socket.send_multipart(
         [GUARD_IDENTITY, b"", serial_instruction.encode("utf-8")]
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
@@ -339,8 +339,8 @@ def get_next_dp_init_port(self) -> int:
 
     def stateless_init_dp_group(
         self,
-        gloo_comm_timeout: int = 30,
-        enable_fault_tolerance: bool = False,
+        gloo_comm_timeout: int | None = None,
+        dp_init_port: int | None = None,
     ) -> ProcessGroup:
         # NOTE: In high-concurrency scenarios multiple processes
         # can pick the same (currently free) port through a race
@@ -357,23 +357,25 @@ def stateless_init_dp_group(
 
         max_retries = 5
         last_exc: Exception | None = None
+        if dp_init_port is None:
+            dp_init_port = self.get_next_dp_init_port()
         for _ in range(max_retries):
             try:
                 # use gloo since the engine process might not have cuda device
                 return stateless_init_torch_distributed_process_group(
                     self.data_parallel_master_ip,
-                    self.get_next_dp_init_port(),
+                    dp_init_port,
                     self.data_parallel_rank,
                     self.data_parallel_size,
                     backend=current_platform.dist_backend,
                     gloo_comm_timeout=gloo_comm_timeout,
-                    enable_fault_tolerance=enable_fault_tolerance,
                 )
             except DistNetworkError as e:
                 # We only want to retry when the root cause is EADDRINUSE.
                 if "EADDRINUSE" in str(e):
                     logger.warning("Address already in use. Retrying with a new port.")
                     last_exc = e
+                    dp_init_port = self.get_next_dp_init_port()
                     continue  # try again with a new port
                 raise e
 
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
@@ -463,8 +463,7 @@ def stateless_init_torch_distributed_process_group(
     rank: int,
     world_size: int,
     backend: str,
-    gloo_comm_timeout: int,
-    enable_fault_tolerance: bool = False,
+    gloo_comm_timeout: int | None,
 ) -> ProcessGroup:
     """
     A replacement for `torch.distributed.init_process_group` that does not
@@ -499,10 +498,11 @@ def stateless_init_torch_distributed_process_group(
     """
     init_method = get_tcp_uri(host, port)
     backend = Backend(backend)  # it is basically string
-    if enable_fault_tolerance:
-        timeout = timedelta(seconds=gloo_comm_timeout)
-    else:
+
+    if gloo_comm_timeout is None:
         timeout = _get_default_timeout(backend)
+    else:
+        timeout = timedelta(seconds=gloo_comm_timeout)
 
     store, rank, world_size = next(
         rendezvous(init_method, rank, world_size, timeout=timeout)
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
@@ -68,8 +68,8 @@ def get_grammar_bitmask(
     def preempt_request(
         self,
         scheduled_timestamp: float | None = None,
-        preempted_req: Request | None = None,
-    ) -> Request:
+        preempted_req: Optional["Request"] = None,
+    ) -> "Request":
         """
         Preempt a running request and move it back to the waiting queue.
 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -45,11 +45,9 @@
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.engine import (
-    EngineCoreOutput,
     EngineCoreOutputs,
     EngineCoreRequest,
     EngineCoreRequestType,
-    FinishReason,
     ReconfigureDistributedRequest,
     ReconfigureRankType,
     UtilityOutput,
@@ -179,13 +177,16 @@ def run(self) -> None:
                 self.engine_running = False
             except queue.Empty:
                 pass
-
-            if self.client_cmd_socket.closed:
-                self.logger("Client socket closed", level="info")
+            try:
+                has_msg, _, cmd_str = recv_router_dealer_message(
+                    self.client_cmd_socket,
+                    use_poller=True,
+                    poll_timeout=poll_timeout_ms,
+                )
+            except zmq.ZMQError:
+                self.logger("Socket closed, terminating EngineCoreGuard", level="info")
                 break
-            has_msg, _, cmd_str = recv_router_dealer_message(
-                self.client_cmd_socket, use_poller=True, poll_timeout=poll_timeout_ms
-            )
+
             if has_msg:
                 self.logger("Received cmd: %s", cmd_str, level="info")
                 self._execute_cmd(cmd_str)
@@ -204,7 +205,7 @@ def _execute_worker_method(self, method_name, timeout: int = 5, **kwargs) -> boo
         identities = set()
         for tp_rank in range(self.tp_size):
             for pp_rank in range(self.pp_size):
-                identity = f"{tp_rank}_{pp_rank}".encode()
+                identity = f"{pp_rank}_{tp_rank}".encode()
                 identities.add(identity)
 
         method_uuid = broadcast_instruction(
@@ -286,26 +287,30 @@ def pause(self, timeout: int = 1, soft_pause: bool = True) -> bool:
             success = True
             if not soft_pause:
                 # abort the communicators
-                self._stop_worker_execution(soft_pause=False, timeout=timeout)
+                success = self._stop_worker_execution(soft_pause=False, timeout=timeout)
         return success
 
-    def retry(self, timeout: int = 1):
+    def retry(self, new_stateless_dp_group_port: int, timeout: int = 1):
         """
         Handle the retry instruction from the ClientGuard.
         This instruction tells the EngineCore to continue its busy loop
         after being suspended due to an exception.
         """
         start_time = time.monotonic()
 
-        success = self._execute_worker_method("restart_worker", timeout=timeout)
+        success = self._execute_worker_method("restore_worker", timeout=timeout)
         if not success:
             return success
 
         if self.dp_size > 1:
             # If the Gloo communication times out
             # the data parallel group (dp_group) needs to be reinitialized
             command = "reinit_dp_group_on_fault_tolerance"
-            self.cmd_q.put(serialize_method_call(command))
+            self.cmd_q.put(
+                serialize_method_call(
+                    command, new_stateless_dp_group_port=new_stateless_dp_group_port
+                )
+            )
         else:
             self.cmd_q.put(None)
 
@@ -1473,21 +1478,6 @@ def process_output_sockets(
                     # Limit the number of buffers to reuse.
                     reuse_buffers.append(buffer)
 
-    def engine_finish_requests(self):
-        assert isinstance(self.scheduler, V1Scheduler)
-        engine_finish_outputs = EngineCoreOutputs()
-        engine_finish_outputs.engine_index = self.engine_index
-        for request_id in list(self.scheduler.requests.keys()):
-            self.scheduler.finish_requests(request_id, RequestStatus.FINISHED_ABORTED)
-            engine_finish_outputs.outputs.append(
-                EngineCoreOutput(
-                    request_id=request_id,
-                    finish_reason=FinishReason.ABORT,
-                    new_token_ids=[],
-                )
-            )
-        self.output_queue.put((0, engine_finish_outputs))
-
     def shutdown(self):
         super().shutdown()
         if self.vllm_config.fault_tolerance_config.enable_fault_tolerance:
@@ -1549,7 +1539,6 @@ def _init_data_parallel(self, vllm_config: VllmConfig):
         self.dp_rank = dp_rank
         self.dp_group = vllm_config.parallel_config.stateless_init_dp_group(
             vllm_config.fault_tolerance_config.gloo_comm_timeout,
-            vllm_config.fault_tolerance_config.enable_fault_tolerance,
         )
 
     def shutdown(self):
@@ -1658,12 +1647,13 @@ def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
 
         return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished)
 
-    def reinit_dp_group_on_fault_tolerance(self):
+    def reinit_dp_group_on_fault_tolerance(self, new_stateless_dp_group_port: int):
         stateless_destroy_torch_distributed_process_group(self.dp_group)
         self.dp_group = self.vllm_config.parallel_config.stateless_init_dp_group(
             self.vllm_config.fault_tolerance_config.gloo_comm_timeout,
-            self.vllm_config.fault_tolerance_config.enable_fault_tolerance,
+            new_stateless_dp_group_port,
         )
+        self.step_counter = 0
 
     def reinitialize_distributed(
         self, reconfig_request: ReconfigureDistributedRequest
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
@@ -431,31 +431,35 @@ def fault_receiver(self):
         engine_core component. It is designed to run continuously to ensure no critical
         error information from the engine core is missed.
         """
-        while True:
-            _, sender_identity, message = recv_router_dealer_message(
-                self.fault_receiver_socket
-            )
-            if self.client_guard_dead:
-                self.logger("client guard dead, stop receiving fault")
-                break
-            assert message is not None, (
-                "message should not be None at fault tolerance scenario"
-            )
+        while not self.client_guard_dead:
+            try:
+                _, sender_identity, message = recv_router_dealer_message(
+                    self.fault_receiver_socket
+                )
+                assert message is not None, (
+                    "message should not be None at fault tolerance scenario"
+                )
 
-            fault_info = FaultInfo.from_json(message)
-            self.engine_exception_q.put_nowait(fault_info)
-            engine_status = "Dead" if "dead" in fault_info.type else "Unhealthy"
-            self.engine_status_dict[int(fault_info.engine_id)] = engine_status
-            self.fault_pub_socket.send_string(
-                f"vllm_fault|{json.dumps(self.engine_status_dict.to_dict())}"
-            )
-            # TODO Asynchronous issuance of pause commands and design of engine
-            #  core status
-            # Pause healthy engines on fault.
-            # Pause will be invoked again during fault-tolerance handling,
-            # so it's unnecessary to track whether all engines are currently
-            # paused.
-            self.fault_handler.submit_fault("pause", 5, soft_pause=False)
+                fault_info = FaultInfo.from_json(message)
+                self.engine_exception_q.put_nowait(fault_info)
+                engine_status = "Dead" if "dead" in fault_info.type else "Unhealthy"
+                self.engine_status_dict[int(fault_info.engine_id)] = engine_status
+                self.fault_pub_socket.send_string(
+                    f"vllm_fault|{json.dumps(self.engine_status_dict.to_dict())}"
+                )
+
+                # Pause healthy engines on fault.
+                # Pause will be invoked again during fault-tolerance handling,
+                # so it's unnecessary to track whether all engines are currently
+                # paused.
+                self.fault_handler.submit_fault("pause", 5, soft_pause=False)
+            except zmq.ZMQError:
+                # Socket was closed during polling, exit loop.
+                self.logger(
+                    "Fault receiver socket closed, stopping thread.", level="info"
+                )
+                break
+        self.logger("Fault receiver thread has stopped.")
 
     def shutdown_guard(self):
         self.client_guard_dead = True
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
@@ -27,6 +27,7 @@
 from vllm.ray.ray_env import get_env_vars_to_copy
 from vllm.utils.collection_utils import ThreadSafeDict
 from vllm.utils.network_utils import (
+    get_open_port,
     get_open_zmq_ipc_path,
     make_zmq_socket,
     recv_router_dealer_message,
@@ -36,7 +37,7 @@
 from vllm.v1.engine.coordinator import DPCoordinator
 from vllm.v1.engine.exceptions import FaultInfo
 from vllm.v1.executor import Executor
-from vllm.v1.serial_utils import serialize_method_call
+from vllm.v1.serial_utils import run_method, serialize_method_call
 from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
 
 if TYPE_CHECKING:
@@ -1399,40 +1400,44 @@ async def _dispatcher(self):
                 if fut:
                     fut.set_exception(e)
 
-    async def _handle_fault_internal(
-        self, instruction: str, timeout: int, **kwargs
-    ) -> bool:
-        if instruction == "retry" and "Dead" in self.engine_status_dict.values():
+    def retry(self, **kwargs):
+        if "Dead" in self.engine_status_dict.values():
             self.logger(
                 "engine_core dead unexpectedly, retry is impossible,"
                 "shutdown will be performed",
                 level="info",
             )
-            return False
+            return False, set(), kwargs
+
+        target_engines = set(self.engine_identity_to_index.keys())
+        kwargs["new_stateless_dp_group_port"] = get_open_port()
+        return True, target_engines, kwargs
+
+    def pause(self, **kwargs):
+        self.logger(
+            "Pause operation is best-effort only. Due to the complexity of "
+            "collective communications (e.g., timing dependencies and "
+            "synchronization barriers), pausing may not always succeed. If "
+            "the process remains unresponsive or collective operations "
+            "cannot be interrupted, consider shutting down and restarting "
+            "the instance.",
+            level="warning",
+        )
 
-        if instruction == "pause":
-            logger.warning(
-                "Pause operation is best-effort only. Due to the complexity of "
-                "collective communications (e.g., timing dependencies and "
-                "synchronization barriers), pausing may not always succeed. If "
-                "the process remains unresponsive or collective operations "
-                "cannot be interrupted, consider shutting down and restarting "
-                "the instance."
-            )
+        alive_engines = {
+            identity
+            for identity, index in self.engine_identity_to_index.items()
+            if self.engine_status_dict.get(index) != "Dead"
+        }
+        return True, alive_engines, kwargs
 
-            dead_engine_indices = {
-                index
-                for index, status in self.engine_status_dict.items()
-                if status == "Dead"
-            }
-
-            target_engines = {
-                identity
-                for identity, index in self.engine_identity_to_index.items()
-                if index not in dead_engine_indices
-            }
-        else:
-            target_engines = set(self.engine_identity_to_index.keys())
+    async def _handle_fault_internal(
+        self, instruction: str, timeout: int, **kwargs
+    ) -> bool:
+        success, target_engines, kwargs = run_method(self, instruction, (), kwargs)
+
+        if not success:
+            return False
 
         if timeout is not None:
             kwargs["timeout"] = timeout
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py