InternLM
diff --git a/‎lmdeploy/serve/async_engine.py‎
Lines changed: 57 additions & 5 deletions b/‎lmdeploy/serve/async_engine.py‎
Lines changed: 57 additions & 5 deletions
@@ -32,6 +32,60 @@
 logger = get_logger('lmdeploy')
 
 
+def _merge_message_content(msg: Dict) -> Dict:
+    """Merge multimodal content blocks and ensure content field exists.
+
+    This function normalizes message content to match vLLM's behavior:
+    1. Missing content field -> add content=None
+    2. None content -> keep as None
+    3. String content -> return as-is
+    4. List content (multimodal) -> merge all text blocks with newline separator
+
+    Args:
+        msg: A message dict with 'role' and optionally 'content' field
+
+    Returns:
+        A message dict with 'content' field guaranteed to exist
+
+    Note:
+        This implementation is based on vLLM's content processing logic.
+        vLLM uses "\n".join() to merge multiple text blocks from multimodal content.
+
+    References:
+        - vLLM content normalization:
+          https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/chat_utils.py
+          See _parse_chat_message_content() and _parse_chat_message_content_parts()
+        - vLLM text merging logic:
+          text_prompt = "\n".join(texts)
+    """
+    # If content is missing, add it with None value (e.g., assistant with tool_calls only)
+    if 'content' not in msg:
+        result = dict(msg)
+        result['content'] = None
+        return result
+
+    # If content is None, keep it as None
+    if msg['content'] is None:
+        return msg
+
+    # If content is already a string, return as-is
+    if isinstance(msg['content'], str):
+        return msg
+
+    # If content is a list, merge all text blocks into a single string
+    # This matches vLLM's behavior: text_prompt = "\n".join(texts)
+    content_parts = []
+    for block in msg['content']:
+        if isinstance(block, dict) and block.get('type') == 'text':
+            content_parts.append(block.get('text', ''))
+    merged_content = '\n'.join(content_parts)
+
+    # Preserve all other fields in the message (e.g., tool_calls)
+    result = dict(msg)
+    result['content'] = merged_content
+    return result
+
+
 @dataclasses.dataclass
 class GenOut:
     """Pack all response information together."""
@@ -609,11 +663,9 @@ async def _get_prompt_input(self,
         # Change multimodal data to openai text messages, i.e.,
         # [{'role': 'user', 'content': [{'type': 'text', 'text': 'hi'}]}] ->
         # [{'role': 'user', 'content': 'hi']
-        if isinstance(prompt, list) and any(isinstance(msg['content'], list) for msg in prompt):
-            prompt = [
-                msg if isinstance(msg['content'], str) else dict(role=msg['role'], content=msg['content'][0]['text'])
-                for msg in prompt
-            ]
+        # Also ensure all messages have 'content' field (set to None if missing, e.g., assistant with tool_calls)
+        if isinstance(prompt, list):
+            prompt = [_merge_message_content(msg) for msg in prompt]
         if do_preprocess:
             # use adapter's chat template if possible
             chat_template = self.chat_template