LlamaRAMCache
1 parent b19963d commit 8f4bd5f
src/raglite/_litellm.py
@@ -29,6 +29,7 @@
     CreateChatCompletionStreamResponse,
     Llama,
     LlamaRAMCache,
+    llama_supports_gpu_offload,
 )
 
 from raglite._chatml_function_calling import chatml_function_calling_with_streaming
@@ -126,7 +127,8 @@ def llm(model: str, **kwargs: Any) -> Llama:
         **kwargs,
     )
     # Enable caching.
-    llm.set_cache(LlamaRAMCache())
+    if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 8:  # noqa: PLR2004
+        llm.set_cache(LlamaRAMCache())
     # Register the model info with LiteLLM.
     model_info = {
         repo_id_filename: {
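In plain terms, the commit stops enabling LlamaRAMCache unconditionally: the prompt cache is now attached only when the llama.cpp build supports GPU offload or the host has at least 8 logical CPU cores, presumably to spare low-end CPU-only machines the cache's overhead. Below is a minimal, self-contained sketch of that gating logic against llama-cpp-python; the wrapper function maybe_enable_cache and its min_cpus parameter are illustrative names, not part of raglite's API.

import os

from llama_cpp import Llama, LlamaRAMCache, llama_supports_gpu_offload


def maybe_enable_cache(llm: Llama, min_cpus: int = 8) -> None:
    """Attach a RAM prompt cache only on machines that can likely afford it."""
    # Mirror the commit's condition: a GPU-offload-capable build, or a host
    # with at least `min_cpus` logical cores (os.cpu_count() may return None).
    if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= min_cpus:
        llm.set_cache(LlamaRAMCache())

Called right after constructing the Llama instance, this reproduces the behaviour of the patched llm() helper shown in the diff above.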