LlamaRAMCache
1 parent b19963d commit 8f4bd5f
src/raglite/_litellm.py
@@ -29,6 +29,7 @@
     CreateChatCompletionStreamResponse,
     Llama,
     LlamaRAMCache,
+    llama_supports_gpu_offload,
 )
 
 from raglite._chatml_function_calling import chatml_function_calling_with_streaming
@@ -126,7 +127,8 @@ def llm(model: str, **kwargs: Any) -> Llama:
         **kwargs,
     )
     # Enable caching.
-    llm.set_cache(LlamaRAMCache())
+    if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 8:  # noqa: PLR2004
+        llm.set_cache(LlamaRAMCache())
     # Register the model info with LiteLLM.
     model_info = {
         repo_id_filename: {
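In plain terms, the commit stops enabling LlamaRAMCache unconditionally: the prompt cache is now attached only when the llama.cpp build supports GPU offload or the host has at least 8 logical CPU cores, presumably to spare low-end CPU-only machines the cache's overhead. Below is a minimal, self-contained sketch of that gating logic against llama-cpp-python; the wrapper function maybe_enable_cache and its min_cpus parameter are illustrative names, not part of raglite's API.

import os

from llama_cpp import Llama, LlamaRAMCache, llama_supports_gpu_offload


def maybe_enable_cache(llm: Llama, min_cpus: int = 8) -> None:
    """Attach a RAM prompt cache only on machines that can likely afford it."""
    # Mirror the commit's condition: a GPU-offload-capable build, or a host
    # with at least `min_cpus` logical cores (os.cpu_count() may return None).
    if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= min_cpus:
        llm.set_cache(LlamaRAMCache())

Called right after constructing the Llama instance, this reproduces the behaviour of the patched llm() helper shown in the diff above.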