
Commit 8edcc4e

Guardrails API - add streaming support (#17400)
* fix(initial-commit): adding a way to get the right response type based on the api route
* feat(unified_guardrail.py): support streaming guardrails
* test: update tests
* fix: fix linting errors
* test: update tests
1 parent 74ba18d commit 8edcc4e

File tree

21 files changed: +1134 -379 lines changed

litellm/batches/main.py

Lines changed: 0 additions & 2 deletions
@@ -18,8 +18,6 @@

 import httpx
 from openai.types.batch import BatchRequestCounts
-from openai.types.batch import Metadata
-from openai.types.batch import Metadata as OpenAIBatchMetadata

 import litellm
 from litellm._logging import verbose_logger

litellm/constants.py

Lines changed: 13 additions & 5 deletions
@@ -262,7 +262,9 @@
262262
QDRANT_SCALAR_QUANTILE = float(os.getenv("QDRANT_SCALAR_QUANTILE", 0.99))
263263
QDRANT_VECTOR_SIZE = int(os.getenv("QDRANT_VECTOR_SIZE", 1536))
264264
CACHED_STREAMING_CHUNK_DELAY = float(os.getenv("CACHED_STREAMING_CHUNK_DELAY", 0.02))
265-
AUDIO_SPEECH_CHUNK_SIZE = 8192 # chunk_size for audio speech streaming. Balance between latency and memory usage
265+
AUDIO_SPEECH_CHUNK_SIZE = int(
266+
os.getenv("AUDIO_SPEECH_CHUNK_SIZE", 8192)
267+
) # chunk_size for audio speech streaming. Balance between latency and memory usage
266268
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = int(
267269
os.getenv("MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB", 512)
268270
)
@@ -285,10 +287,16 @@
285287
MAX_LANGFUSE_INITIALIZED_CLIENTS = int(
286288
os.getenv("MAX_LANGFUSE_INITIALIZED_CLIENTS", 50)
287289
)
288-
LOGGING_WORKER_CONCURRENCY = int(os.getenv("LOGGING_WORKER_CONCURRENCY", 100)) # Must be above 0
290+
LOGGING_WORKER_CONCURRENCY = int(
291+
os.getenv("LOGGING_WORKER_CONCURRENCY", 100)
292+
) # Must be above 0
289293
LOGGING_WORKER_MAX_QUEUE_SIZE = int(os.getenv("LOGGING_WORKER_MAX_QUEUE_SIZE", 50_000))
290-
LOGGING_WORKER_MAX_TIME_PER_COROUTINE = float(os.getenv("LOGGING_WORKER_MAX_TIME_PER_COROUTINE", 20.0))
291-
LOGGING_WORKER_CLEAR_PERCENTAGE = int(os.getenv("LOGGING_WORKER_CLEAR_PERCENTAGE", 50)) # Percentage of queue to clear (default: 50%)
294+
LOGGING_WORKER_MAX_TIME_PER_COROUTINE = float(
295+
os.getenv("LOGGING_WORKER_MAX_TIME_PER_COROUTINE", 20.0)
296+
)
297+
LOGGING_WORKER_CLEAR_PERCENTAGE = int(
298+
os.getenv("LOGGING_WORKER_CLEAR_PERCENTAGE", 50)
299+
) # Percentage of queue to clear (default: 50%)
292300
MAX_ITERATIONS_TO_CLEAR_QUEUE = int(os.getenv("MAX_ITERATIONS_TO_CLEAR_QUEUE", 200))
293301
MAX_TIME_TO_CLEAR_QUEUE = float(os.getenv("MAX_TIME_TO_CLEAR_QUEUE", 5.0))
294302
LOGGING_WORKER_AGGRESSIVE_CLEAR_COOLDOWN_SECONDS = float(
@@ -866,7 +874,7 @@
866874
"deepseek_r1",
867875
"qwen3",
868876
"twelvelabs",
869-
"openai"
877+
"openai",
870878
]
871879

872880
BEDROCK_EMBEDDING_PROVIDERS_LITERAL = Literal[
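
The constants.py change converts AUDIO_SPEECH_CHUNK_SIZE and the logging-worker limits from hard-coded values into environment-variable overrides read at import time. A minimal sketch of how such an override behaves, assuming litellm.constants is imported after the variable is set (the 4096 value is purely illustrative):

import os

# The override must be in place before litellm.constants is imported,
# because os.getenv() is evaluated once at module import time.
os.environ["AUDIO_SPEECH_CHUNK_SIZE"] = "4096"  # illustrative value

from litellm import constants

print(constants.AUDIO_SPEECH_CHUNK_SIZE)  # expected: 4096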

litellm/integrations/custom_guardrail.py

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@
     GuardrailEventHooks,
     LitellmParams,
     Mode,
-    PiiEntityType,
 )
 from litellm.types.llms.openai import AllMessageValues
 from litellm.types.proxy.guardrails.guardrail_hooks.base import GuardrailConfigModel

litellm/litellm_core_utils/README.md

Lines changed: 1 addition & 0 deletions
@@ -9,4 +9,5 @@ Core files:
 - `default_encoding.py`: code for loading the default encoding (tiktoken)
 - `get_llm_provider_logic.py`: code for inferring the LLM provider from a given model name.
 - `duration_parser.py`: code for parsing durations - e.g. "1d", "1mo", "10s"
+- `api_route_to_call_types.py`: mapping of API routes to their corresponding CallTypes (e.g., `/chat/completions` -> [acompletion, completion])

litellm/litellm_core_utils/api_route_to_call_types.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+"""
+Dictionary mapping API routes to their corresponding CallTypes in LiteLLM.
+
+This dictionary maps each API endpoint to the CallTypes that can be used for that route.
+Each route can have both async (prefixed with 'a') and sync call types.
+"""
+
+from litellm.types.utils import API_ROUTE_TO_CALL_TYPES, CallTypes
+
+
+def get_call_types_for_route(route: str) -> list:
+    """
+    Get the list of CallTypes for a given API route.
+
+    Args:
+        route: API route path (e.g., "/chat/completions")
+
+    Returns:
+        List of CallTypes for that route, or empty list if route not found
+    """
+    return API_ROUTE_TO_CALL_TYPES.get(route, [])
+
+
+def get_routes_for_call_type(call_type: CallTypes) -> list:
+    """
+    Get all routes that use a specific CallType.
+
+    Args:
+        call_type: The CallType to search for
+
+    Returns:
+        List of routes that use this CallType
+    """
+    routes = []
+    for route, types in API_ROUTE_TO_CALL_TYPES.items():
+        if call_type in types:
+            routes.append(route)
+    return routes
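
The new helper module gives the proxy a lookup from an API route to the CallTypes that handle it, which is what the commit message describes as getting the right response type based on the api route. A hedged usage sketch, assuming the module lives at litellm.litellm_core_utils.api_route_to_call_types and that API_ROUTE_TO_CALL_TYPES contains the "/chat/completions" mapping mentioned in the README entry above:

from litellm.litellm_core_utils.api_route_to_call_types import (
    get_call_types_for_route,
    get_routes_for_call_type,
)
from litellm.types.utils import CallTypes

# Forward lookup: which call types can serve this route?
call_types = get_call_types_for_route("/chat/completions")
print(call_types)  # e.g. [CallTypes.acompletion, CallTypes.completion]

# Reverse lookup: every route registered against the async completion call type.
routes = get_routes_for_call_type(CallTypes.acompletion)
print(routes)  # e.g. ["/chat/completions", ...]

# Unknown routes fall back to an empty list rather than raising.
print(get_call_types_for_route("/not-a-real-route"))  # []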

litellm/llms/base_llm/guardrail_translation/base_translation.py

Lines changed: 14 additions & 0 deletions
@@ -84,3 +84,17 @@ async def process_output_response(
             user_api_key_dict: User API key metadata (passed separately since response doesn't contain it)
         """
         pass
+
+    async def process_output_streaming_response(
+        self,
+        response: Any,
+        guardrail_to_apply: "CustomGuardrail",
+        litellm_logging_obj: Optional["LiteLLMLoggingObj"] = None,
+        user_api_key_dict: Optional["UserAPIKeyAuth"] = None,
+    ) -> Any:
+        """
+        Process output streaming response with guardrails.
+
+        Optional to override in subclasses.
+        """
+        return response
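
The base class now exposes process_output_streaming_response as an optional hook: by default it returns the streaming chunk unchanged, and provider-specific translations can override it to run guardrails over streamed output. A rough, hypothetical sketch of such an override (MyStreamingTranslation and redact_text are illustrative names, not part of the commit):

from typing import Any, Optional

from litellm.llms.base_llm.guardrail_translation.base_translation import BaseTranslation


async def redact_text(text: str) -> str:
    """Hypothetical stand-in for whatever check a concrete guardrail performs."""
    return text.replace("secret", "[REDACTED]")


class MyStreamingTranslation(BaseTranslation):
    """Illustrative subclass: rewrite the text of each streamed delta."""

    async def process_output_streaming_response(
        self,
        response: Any,
        guardrail_to_apply: Any,
        litellm_logging_obj: Optional[Any] = None,
        user_api_key_dict: Optional[Any] = None,
    ) -> Any:
        # Walk each streamed choice and pass its delta text through the check.
        for choice in getattr(response, "choices", []):
            delta = getattr(choice, "delta", None)
            if delta is not None and isinstance(getattr(delta, "content", None), str):
                delta.content = await redact_text(delta.content)
        return response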

litellm/llms/openai/chat/guardrail_translation/handler.py

Lines changed: 82 additions & 16 deletions
@@ -14,16 +14,16 @@
 This pattern can be replicated for other message formats (e.g., Anthropic).
 """

-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, cast
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast

 import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.llms.base_llm.guardrail_translation.base_translation import BaseTranslation
-from litellm.types.utils import Choices
+from litellm.types.utils import Choices, StreamingChoices

 if TYPE_CHECKING:
     from litellm.integrations.custom_guardrail import CustomGuardrail
-    from litellm.types.utils import ModelResponse
+    from litellm.types.utils import ModelResponse, ModelResponseStream


 class OpenAIChatCompletionsHandler(BaseTranslation):
@@ -241,21 +241,79 @@ async def process_output_response(

         return response

-    def _has_text_content(self, response: "ModelResponse") -> bool:
+    async def process_output_streaming_response(
+        self,
+        response: "ModelResponseStream",
+        guardrail_to_apply: "CustomGuardrail",
+        litellm_logging_obj: Optional[Any] = None,
+        user_api_key_dict: Optional[Any] = None,
+    ) -> Any:
+        """
+        Process output streaming response by applying guardrails to text content.
+
+        Args:
+            response: LiteLLM ModelResponseStream object
+            guardrail_to_apply: The guardrail instance to apply
+            litellm_logging_obj: Optional logging object
+            user_api_key_dict: User API key metadata to pass to guardrails
+
+        Returns:
+            Modified response with guardrail applied to content
+
+        Response Format Support:
+            - String content: choice.message.content = "text here"
+            - List content: choice.message.content = [{"type": "text", "text": "text here"}, ...]
+        """
+
+        # Step 0: Check if response has any text content to process
+        if not self._has_text_content(response):
+            return response
+
+        texts_to_check: List[str] = []
+        images_to_check: List[str] = []
+        task_mappings: List[Tuple[int, Optional[int]]] = []
+        # Track (choice_index, content_index) for each text
+
+        # Step 1: Extract all text content and images from response choices
+        for choice_idx, choice in enumerate(response.choices):
+
+            self._extract_output_text_and_images(
+                choice=choice,
+                choice_idx=choice_idx,
+                texts_to_check=texts_to_check,
+                images_to_check=images_to_check,
+                task_mappings=task_mappings,
+            )
+
+    def _has_text_content(
+        self, response: Union["ModelResponse", "ModelResponseStream"]
+    ) -> bool:
         """
         Check if response has any text content to process.

         Override this method to customize text content detection.
         """
-        for choice in response.choices:
-            if isinstance(choice, litellm.Choices):
-                if choice.message.content and isinstance(choice.message.content, str):
-                    return True
+        from litellm.types.utils import ModelResponse, ModelResponseStream
+
+        if isinstance(response, ModelResponse):
+            for choice in response.choices:
+                if isinstance(choice, litellm.Choices):
+                    if choice.message.content and isinstance(
+                        choice.message.content, str
+                    ):
+                        return True
+        elif isinstance(response, ModelResponseStream):
+            for choice in response.choices:
+                if isinstance(choice, litellm.Choices):
+                    if choice.message.content and isinstance(
+                        choice.message.content, str
+                    ):
+                        return True
         return False

     def _extract_output_text_and_images(
         self,
-        choice: Any,
+        choice: Union[Choices, StreamingChoices],
         choice_idx: int,
         texts_to_check: List[str],
         images_to_check: List[str],
@@ -266,21 +324,29 @@ def _extract_output_text_and_images(

         Override this method to customize text/image extraction logic.
         """
-        if not isinstance(choice, litellm.Choices):
-            return
-
         verbose_proxy_logger.debug(
             "OpenAI Chat Completions: Processing choice: %s", choice
         )

-        if choice.message.content and isinstance(choice.message.content, str):
+        # Determine content source based on choice type
+        content = None
+        if isinstance(choice, litellm.Choices):
+            content = choice.message.content
+        elif isinstance(choice, litellm.StreamingChoices):
+            content = choice.delta.content
+        else:
+            # Unknown choice type, skip processing
+            return
+
+        # Process content if it exists
+        if content and isinstance(content, str):
             # Simple string content
-            texts_to_check.append(choice.message.content)
+            texts_to_check.append(content)
             task_mappings.append((choice_idx, None))

-        elif choice.message.content and isinstance(choice.message.content, list):
+        elif content and isinstance(content, list):
             # List content (e.g., multimodal response)
-            for content_idx, content_item in enumerate(choice.message.content):
+            for content_idx, content_item in enumerate(content):
                 # Extract text
                 content_text = content_item.get("text")
                 if content_text:
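
The handler change hinges on where the text lives: a full ModelResponse keeps it at choice.message.content, while a streamed chunk keeps it at choice.delta.content, so _extract_output_text_and_images now branches on the choice type. A small sketch of that distinction using the types from litellm.types.utils (the constructor keyword arguments shown are assumptions, not taken from this diff):

from litellm.types.utils import (
    Choices,
    Delta,
    Message,
    ModelResponseStream,
    StreamingChoices,
)

# Non-streaming: the text sits on choice.message.content.
full_choice = Choices(index=0, message=Message(role="assistant", content="final answer"))
print(full_choice.message.content)  # "final answer"

# Streaming: each chunk carries a delta, and the text sits on choice.delta.content.
chunk = ModelResponseStream(
    choices=[StreamingChoices(index=0, delta=Delta(content="partial "))]
)
print(chunk.choices[0].delta.content)  # "partial "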

litellm/model_prices_and_context_window_backup.json

Lines changed: 65 additions & 0 deletions
@@ -269,6 +269,71 @@
         "supports_response_schema": true,
         "supports_vision": true
     },
+    "amazon.nova-2-lite-v1:0": {
+        "input_cost_per_token": 3e-07,
+        "litellm_provider": "bedrock_converse",
+        "max_input_tokens": 1000000,
+        "max_output_tokens": 64000,
+        "max_tokens": 64000,
+        "mode": "chat",
+        "output_cost_per_token": 2.5e-06,
+        "supports_function_calling": true,
+        "supports_pdf_input": true,
+        "supports_prompt_caching": true,
+        "supports_reasoning": true,
+        "supports_response_schema": true,
+        "supports_video_input": true,
+        "supports_vision": true
+    },
+    "apac.amazon.nova-2-lite-v1:0": {
+        "input_cost_per_token": 6e-08,
+        "litellm_provider": "bedrock_converse",
+        "max_input_tokens": 1000000,
+        "max_output_tokens": 64000,
+        "max_tokens": 64000,
+        "mode": "chat",
+        "output_cost_per_token": 2.75e-06,
+        "supports_function_calling": true,
+        "supports_pdf_input": true,
+        "supports_prompt_caching": true,
+        "supports_reasoning": true,
+        "supports_response_schema": true,
+        "supports_video_input": true,
+        "supports_vision": true
+    },
+    "eu.amazon.nova-2-lite-v1:0": {
+        "input_cost_per_token": 6e-08,
+        "litellm_provider": "bedrock_converse",
+        "max_input_tokens": 1000000,
+        "max_output_tokens": 64000,
+        "max_tokens": 64000,
+        "mode": "chat",
+        "output_cost_per_token": 2.75e-06,
+        "supports_function_calling": true,
+        "supports_pdf_input": true,
+        "supports_prompt_caching": true,
+        "supports_reasoning": true,
+        "supports_response_schema": true,
+        "supports_video_input": true,
+        "supports_vision": true
+    },
+    "us.amazon.nova-2-lite-v1:0": {
+        "input_cost_per_token": 6e-08,
+        "litellm_provider": "bedrock_converse",
+        "max_input_tokens": 1000000,
+        "max_output_tokens": 64000,
+        "max_tokens": 64000,
+        "mode": "chat",
+        "output_cost_per_token": 2.75e-06,
+        "supports_function_calling": true,
+        "supports_pdf_input": true,
+        "supports_prompt_caching": true,
+        "supports_reasoning": true,
+        "supports_response_schema": true,
+        "supports_video_input": true,
+        "supports_vision": true
+    },
+
     "amazon.nova-micro-v1:0": {
         "input_cost_per_token": 3.5e-08,
         "litellm_provider": "bedrock_converse",
