From ad95ff3c5edb2bb3f78838b3443c59d24520bb3c Mon Sep 17 00:00:00 2001
From: Lukas Piwowarski
Date: Fri, 9 May 2025 16:21:22 +0200
Subject: [PATCH] Add config options for max context size

We use some constants to estimate the max context size in characters
for the generative and embedding model. Let's make these configurable.
---
 src/chat.py   |  6 ++----
 src/config.py | 21 ++++++++++++++++++++-
 src/prompt.py | 10 ++++------
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/chat.py b/src/chat.py
index cbed50a..2a3c6e4 100644
--- a/src/chat.py
+++ b/src/chat.py
@@ -140,11 +140,9 @@ async def check_message_length(message_content: str) -> tuple[bool, str]:
         return False, "We've encountered an issue. Please try again later ..."
 
     if num_required_tokens > config.embeddings_llm_max_context:
-        # On average, a single token corresponds to approximately 4 characters.
-        # Because logs often require more tokens to process, we estimate 3
-        # characters per token.
+        # Calculate the maximum character limit estimation for the embedding model.
         approx_max_chars = round(
-            config.embeddings_llm_max_context * 3, -2)
+            config.embeddings_llm_max_context * config.chars_per_token_estimation, -2)
 
         error_message = (
             "⚠️ **Your input is too lengthy!**\n We can process inputs of up "
diff --git a/src/config.py b/src/config.py
index 9b0b9ef..ed9df0a 100644
--- a/src/config.py
+++ b/src/config.py
@@ -51,6 +51,8 @@ class Config:
     prompt_header: str
     welcome_message: str
    jira_formatting_syntax_prompt: str
+    chars_per_token_estimation: int
+    generative_model_max_context_percentage: float
 
     @classmethod
     def from_env(cls) -> 'Config':
@@ -131,8 +133,25 @@ def from_env(cls) -> 'Config':
             # The maximum number of points we pass to the generative model after
             # reranking.
             rerank_top_n=int(os.environ.get("RERANK_TOP_N", 5)),
-        )
 
+            # The maximum percentage of the full context of the generative model
+            # that we can use. The idea is that we do not want to use the full
+            # available context to prevent decreased quality of the responses.
+            generative_model_max_context_percentage=float(os.environ.get(
+                "GENERATIVE_MODEL_MAX_CONTEXT_PERCENTAGE",
+                0.75,
+            )),
+
+            # The estimated number of characters per token we should use in our
+            # internal computations. For example, we use this value to estimate
+            # the maximum number of characters the generative model can process.
+            #
+            # On average, a single token corresponds to approximately 4 characters.
+            # Because logs often require more tokens to process, we estimate 3
+            # characters per token.
+            chars_per_token_estimation=int(os.environ.get(
+                "CHARS_PER_TOKEN_ESTIMATION", 3)),
+        )
 
 # Initialize the configuration
 config = Config.from_env()
diff --git a/src/prompt.py b/src/prompt.py
index 7434b97..28007fe 100644
--- a/src/prompt.py
+++ b/src/prompt.py
@@ -80,12 +80,10 @@ async def build_prompt(
 
     full_prompt_len += len(str(full_prompt))
 
-    # NOTE: On average, a single token corresponds to approximately 4 characters.
-    # Because logs often require more tokens to process, we estimate 3
-    # characters per token. Also, we do not want to use the full context
-    # of the model as trying to use the full context of the model might lead
-    # to decreased performance (0.75 constant).
-    approx_max_chars = config.generative_model_max_context * 3 * 0.75
+    # Calculate the maximum character limit estimation for the generative model.
+    approx_max_chars = (config.generative_model_max_context *
+                        config.generative_model_max_context_percentage *
+                        config.chars_per_token_estimation)
 
     # If no information was retrieved from the vector database, end the generation
     # of the prompt.
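As a sanity check on the arithmetic, here is a minimal standalone sketch of how
the two new settings combine into the character budget computed in src/prompt.py.
The environment variable names and defaults are the ones added in this patch;
the 128,000-token context window and the local variable names are illustrative
assumptions, not values taken from the change:

    import os

    # Assumed context window of the generative model, in tokens (not part of this patch).
    generative_model_max_context = 128_000

    # Defaults mirror the fallbacks introduced in src/config.py.
    max_context_percentage = float(os.environ.get(
        "GENERATIVE_MODEL_MAX_CONTEXT_PERCENTAGE", 0.75))
    chars_per_token = int(os.environ.get("CHARS_PER_TOKEN_ESTIMATION", 3))

    # Same formula as the new code in src/prompt.py.
    approx_max_chars = (generative_model_max_context *
                        max_context_percentage *
                        chars_per_token)

    print(approx_max_chars)  # 288000.0 with the defaults: 128000 * 0.75 * 3

With the defaults unchanged, behaviour matches the old hard-coded constants
(3 characters per token, 0.75 of the context); operators can now tune both
knobs per deployment through the environment.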