Improve documentation and logging in API server

fangyuchu · fangyuchu · commit 9f496ca85201 · 2025-11-12T14:33:48.000+08:00
Signed-off-by: fangyuchu <fangyuchu@qq.com>
diff --git a/tests/v1/engine/test_client_guard.py b/tests/v1/engine/test_client_guard.py
@@ -219,4 +219,5 @@ def response_cmd(cmd_socket):
     assert result is True
     assert engine_status_dict[0] == "Healthy"
 
+    cmd_socket.close()
     guard.shutdown_guard()
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
@@ -15,7 +15,7 @@
 from collections.abc import AsyncGenerator
 from typing import Any
 
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
 import vllm.envs as envs
@@ -57,23 +57,38 @@ async def generate(request: Request) -> Response:
 
 
 @app.post("/fault_tolerance/apply")
-async def send_fault_tolerance_instruction(request: Request) -> Response:
-    """Generate completion for the request.
+async def process_fault_tolerance_instruction(request: Request) -> Response:
+    """Apply fault tolerance instructions to the engine.
+
+    This endpoint handles fault recovery operations (for example, a retry).
 
     The request should be a JSON object with the following fields:
-    - prompt: the prompt to use for the generation.
-    - stream: whether to stream the results or not.
-    - other fields: the sampling parameters (See `SamplingParams` for details).
+    - fault_tolerance_instruction: The name of the fault tolerance method to apply.
+    - fault_tolerance_timeout: Timeout in seconds for the operation to complete.
+    - fault_tolerance_params: Optional dict of extra keyword arguments passed to
+      the fault tolerance operation. Defaults to an empty dict.
     """
     request_dict = await request.json()
 
     fault_tolerance_instruction = request_dict.get("fault_tolerance_instruction")
     fault_tolerance_timeout = request_dict.get("fault_tolerance_timeout")
-    kwargs = request_dict.get("kwargs", {})
+    kwargs = request_dict.get("fault_tolerance_params", {})
     assert engine is not None
-    return await engine.handle_fault(
+    success = await engine.handle_fault(
         fault_tolerance_instruction, fault_tolerance_timeout, **kwargs
     )
+    if success:
+        return JSONResponse(
+            status_code=200,
+            content={"message": "Instruction executed successfully."},
+        )
+
+    logger.error("Fault tolerance operation failed. Shutting down the engine.")
+    engine.shutdown()
+    raise HTTPException(
+        status_code=400,
+        detail="Instruction execution failed.",
+    )
 
 
 @app.get("/fault_tolerance/status")
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
@@ -1232,7 +1232,7 @@ async def is_scaling_elastic_ep(raw_request: Request):
         HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
     },
 )
-async def send_fault_tolerance_instruction(raw_request: Request):
+async def process_fault_tolerance_instruction(raw_request: Request):
     try:
         body = await raw_request.json()
     except json.JSONDecodeError as e:
@@ -1247,50 +1247,51 @@ async def send_fault_tolerance_instruction(raw_request: Request):
     if fault_tolerance_instruction is None or fault_tolerance_timeout is None:
         raise HTTPException(
             status_code=400,
-            detail="fault_tolerance_instruction and"
-            " fault_tolerance_timeout is required",
+            detail="Both 'fault_tolerance_instruction' and "
+            "'fault_tolerance_timeout' are required.",
         )
 
     if not isinstance(fault_tolerance_instruction, str):
         raise HTTPException(
-            status_code=400, detail="fault_tolerance_instruction must be a str"
+            status_code=400, detail="'fault_tolerance_instruction' must be a string."
         )
-    # Currently, only two types of instructions are supported: [pause, retry].
-    # Additional descaling instructions will be supported in future updates.
+    # Supported instructions: ["pause", "retry"].
+    # More instruction types may be added in future updates.
     elif fault_tolerance_instruction not in ["pause", "retry"]:
         raise HTTPException(
-            status_code=400, detail="not a valid fault_tolerance_instruction"
+            status_code=400, detail="Invalid 'fault_tolerance_instruction' value."
         )
 
     if not isinstance(fault_tolerance_timeout, int) or fault_tolerance_timeout <= 0:
         raise HTTPException(
-            status_code=400, detail="fault_tolerance_timeout must be a positive integer"
+            status_code=400,
+            detail="'fault_tolerance_timeout' must be a positive integer.",
         )
     try:
-        execute_result = await client.handle_fault(
+        success = await client.handle_fault(
             fault_tolerance_instruction,
             fault_tolerance_timeout,
             **dynamic_fault_tolerance_params,
         )
-        if execute_result:
+        if success:
             return JSONResponse(
                 {
-                    "message": "instruction has been executed successfully",
+                    "message": "Instruction executed successfully.",
                 }
             )
         else:
-            logger.error("Fault tolerance failed, shutdown the app.")
+            logger.error("Fault tolerance failed. Shutting down the application.")
             client.shutdown()
             raise HTTPException(
                 status_code=400,
                 detail="Instruction execution failed.",
             )
 
     except Exception as e:
-        logger.error("Handle fault failed: %s", e)
+        logger.error("Failed to handle fault: %s", e)
         raise HTTPException(
             status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-            detail="Handle fault failed",
+            detail="Failed to handle fault.",
         ) from e