6 changes: 2 additions & 4 deletions src/chat.py
@@ -140,11 +140,9 @@ async def check_message_length(message_content: str) -> tuple[bool, str]:
         return False, "We've encountered an issue. Please try again later ..."
 
     if num_required_tokens > config.embeddings_llm_max_context:
-        # On average, a single token corresponds to approximately 4 characters.
-        # Because logs often require more tokens to process, we estimate 3
-        # characters per token.
+        # Calculate the maximum character limit estimation for the embedding model.
         approx_max_chars = round(
-            config.embeddings_llm_max_context * 3, -2)
+            config.embeddings_llm_max_context * config.chars_per_token_estimation, -2)
 
         error_message = (
             "⚠️ **Your input is too lengthy!**\n We can process inputs of up "
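For reference, a minimal sketch of what the new embedding-side limit works out to, assuming a hypothetical `embeddings_llm_max_context` of 8192 tokens and the default `CHARS_PER_TOKEN_ESTIMATION` of 3 (the context size is invented for illustration, not taken from this PR):

```python
# Illustrative values only: 8192 is an assumed embedding context size.
embeddings_llm_max_context = 8192    # max tokens the embedding model accepts
chars_per_token_estimation = 3       # conservative characters-per-token estimate

# Same computation as in check_message_length: a rough character budget,
# rounded to the nearest hundred for a friendlier error message.
approx_max_chars = round(embeddings_llm_max_context * chars_per_token_estimation, -2)
print(approx_max_chars)  # 24600 (8192 * 3 = 24576, rounded to the nearest hundred)
```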
21 changes: 20 additions & 1 deletion src/config.py
@@ -51,6 +51,8 @@ class Config:
     prompt_header: str
     welcome_message: str
     jira_formatting_syntax_prompt: str
+    chars_per_token_estimation: int
+    generative_model_max_context_percentage: float
 
     @classmethod
     def from_env(cls) -> 'Config':
@@ -131,8 +133,25 @@ def from_env(cls) -> 'Config':
             # The maximum number of points we pass to the generative model after
             # reranking.
             rerank_top_n=int(os.environ.get("RERANK_TOP_N", 5)),
-        )
 
+            # The maximum percentage of the full context of the generative model
+            # that we can use. The idea is that we do not want to use the full
+            # available context to prevent decreased quality of the responses.
+            generative_model_max_context_percentage=float(os.environ.get(
+                "GENERATIVE_MODEL_MAX_CONTEXT_PERCENTAGE",
+                0.75,
+            )),
+
+            # The estimated number of characters per token we should use in our
+            # internal computations. For example, we use this value to estimate
+            # the maximum number of characters the generative model can process.
+            #
+            # On average, a single token corresponds to approximately 4 characters.
+            # Because logs often require more tokens to process, we estimate 3
+            # characters per token.
+            chars_per_token_estimation=int(os.environ.get(
+                "CHARS_PER_TOKEN_ESTIMATION", 3)),
+        )
 
 # Initialize the configuration
 config = Config.from_env()
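As a rough sketch of how the two new settings behave at load time, the snippet below mirrors the `os.environ.get` pattern from `from_env`; the override values are purely illustrative:

```python
import os

# Defaults apply when the variables are unset, as in Config.from_env above.
pct = float(os.environ.get("GENERATIVE_MODEL_MAX_CONTEXT_PERCENTAGE", 0.75))
cpt = int(os.environ.get("CHARS_PER_TOKEN_ESTIMATION", 3))
print(pct, cpt)  # 0.75 3

# Both knobs can be tuned via the environment without touching code,
# e.g. to leave more headroom in the generative model's context.
os.environ["GENERATIVE_MODEL_MAX_CONTEXT_PERCENTAGE"] = "0.6"
os.environ["CHARS_PER_TOKEN_ESTIMATION"] = "4"
pct = float(os.environ.get("GENERATIVE_MODEL_MAX_CONTEXT_PERCENTAGE", 0.75))
cpt = int(os.environ.get("CHARS_PER_TOKEN_ESTIMATION", 3))
print(pct, cpt)  # 0.6 4
```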
10 changes: 4 additions & 6 deletions src/prompt.py
@@ -80,12 +80,10 @@ async def build_prompt(

     full_prompt_len += len(str(full_prompt))
 
-    # NOTE: On average, a single token corresponds to approximately 4 characters.
-    # Because logs often require more tokens to process, we estimate 3
-    # characters per token. Also, we do not want to use the full context
-    # of the model as trying to use the full context of the model might lead
-    # to decreased performance (0.75 constant).
-    approx_max_chars = config.generative_model_max_context * 3 * 0.75
+    # Calculate the maximum character limit estimation for the generative model.
+    approx_max_chars = (config.generative_model_max_context *
+                        config.generative_model_max_context_percentage *
+                        config.chars_per_token_estimation)
 
     # If no information was retrieved from the vector database, end the generation
     # of the prompt.
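To see what the refactored expression yields, here is a small worked example with an assumed `generative_model_max_context` of 32768 tokens and the defaults added in `config.py` (the context window size is hypothetical):

```python
# Hypothetical context window; the real value comes from the deployment config.
generative_model_max_context = 32768
generative_model_max_context_percentage = 0.75  # keep 25% of the context as headroom
chars_per_token_estimation = 3                  # conservative characters per token

# Same formula as in build_prompt: the character budget for the prompt.
approx_max_chars = (generative_model_max_context *
                    generative_model_max_context_percentage *
                    chars_per_token_estimation)
print(approx_max_chars)  # 73728.0 (32768 * 0.75 = 24576 tokens, ~3 chars each)
```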