
Commit 8f4bd5f

fix: conditionally enable LlamaRAMCache (#83)
1 parent: b19963d

1 file changed: 3 additions, 1 deletion


src/raglite/_litellm.py

Lines changed: 3 additions & 1 deletion
@@ -29,6 +29,7 @@
     CreateChatCompletionStreamResponse,
     Llama,
     LlamaRAMCache,
+    llama_supports_gpu_offload,
 )
 
 from raglite._chatml_function_calling import chatml_function_calling_with_streaming
@@ -126,7 +127,8 @@ def llm(model: str, **kwargs: Any) -> Llama:
         **kwargs,
     )
     # Enable caching.
-    llm.set_cache(LlamaRAMCache())
+    if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 8:  # noqa: PLR2004
+        llm.set_cache(LlamaRAMCache())
     # Register the model info with LiteLLM.
     model_info = {
         repo_id_filename: {
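
For context, here is the patched caching logic shown in isolation: a minimal sketch assuming the llama_cpp Python bindings (Llama, LlamaRAMCache, llama_supports_gpu_offload) that the file imports. The helper name maybe_enable_cache and its min_cpu_count parameter are illustrative only and not part of raglite; the default of 8 mirrors the threshold in the diff.

import os

from llama_cpp import Llama, LlamaRAMCache, llama_supports_gpu_offload


def maybe_enable_cache(llm: Llama, min_cpu_count: int = 8) -> None:
    # Illustrative helper (not in raglite): attach a RAM prompt cache only when
    # GPU offloading is supported or the host has at least min_cpu_count CPU cores.
    if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= min_cpu_count:
        llm.set_cache(LlamaRAMCache())

The (os.cpu_count() or 1) guard keeps the comparison well defined on platforms where os.cpu_count() returns None.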
