From ad95ff3c5edb2bb3f78838b3443c59d24520bb3c Mon Sep 17 00:00:00 2001
From: Lukas Piwowarski
Date: Fri, 9 May 2025 16:21:22 +0200
Subject: [PATCH] Add config options for max context size

We use some constants to estimate the max context size in characters
for the generative and embedding model. Let's make these configurable.
---
 src/chat.py   |  6 ++----
 src/config.py | 21 ++++++++++++++++++++-
 src/prompt.py | 10 ++++------
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/chat.py b/src/chat.py
index cbed50a..2a3c6e4 100644
--- a/src/chat.py
+++ b/src/chat.py
@@ -140,11 +140,9 @@ async def check_message_length(message_content: str) -> tuple[bool, str]:
         return False, "We've encountered an issue. Please try again later ..."
 
     if num_required_tokens > config.embeddings_llm_max_context:
-        # On average, a single token corresponds to approximately 4 characters.
-        # Because logs often require more tokens to process, we estimate 3
-        # characters per token.
+        # Calculate the maximum character limit estimation for the embedding model.
         approx_max_chars = round(
-            config.embeddings_llm_max_context * 3, -2)
+            config.embeddings_llm_max_context * config.chars_per_token_estimation, -2)
 
         error_message = (
             "⚠️ **Your input is too lengthy!**\n We can process inputs of up "
diff --git a/src/config.py b/src/config.py
index 9b0b9ef..ed9df0a 100644
--- a/src/config.py
+++ b/src/config.py
@@ -51,6 +51,8 @@ class Config:
     prompt_header: str
     welcome_message: str
    jira_formatting_syntax_prompt: str
+    chars_per_token_estimation: int
+    generative_model_max_context_percentage: float
 
     @classmethod
     def from_env(cls) -> 'Config':
@@ -131,8 +133,25 @@ def from_env(cls) -> 'Config':
             # The maximum number of points we pass to the generative model after
             # reranking.
             rerank_top_n=int(os.environ.get("RERANK_TOP_N", 5)),
-        )
 
+            # The maximum percentage of the full context of the generative model
+            # that we can use. The idea is that we do not want to use the full
+            # available context to prevent decreased quality of the responses.
+            generative_model_max_context_percentage=float(os.environ.get(
+                "GENERATIVE_MODEL_MAX_CONTEXT_PERCENTAGE",
+                0.75,
+            )),
+
+            # The estimated number of characters per token we should use in our
+            # internal computations. For example, we use this value to estimate
+            # the maximum number of characters the generative model can process.
+            #
+            # On average, a single token corresponds to approximately 4 characters.
+            # Because logs often require more tokens to process, we estimate 3
+            # characters per token.
+            chars_per_token_estimation=int(os.environ.get(
+                "CHARS_PER_TOKEN_ESTIMATION", 3)),
+        )
 
 # Initialize the configuration
 config = Config.from_env()
diff --git a/src/prompt.py b/src/prompt.py
index 7434b97..28007fe 100644
--- a/src/prompt.py
+++ b/src/prompt.py
@@ -80,12 +80,10 @@ async def build_prompt(
 
     full_prompt_len += len(str(full_prompt))
 
-    # NOTE: On average, a single token corresponds to approximately 4 characters.
-    # Because logs often require more tokens to process, we estimate 3
-    # characters per token. Also, we do not want to use the full context
-    # of the model as trying to use the full context of the model might lead
-    # to decreased performance (0.75 constant).
-    approx_max_chars = config.generative_model_max_context * 3 * 0.75
+    # Calculate the maximum character limit estimation for the generative model.
+    approx_max_chars = (config.generative_model_max_context *
+                        config.generative_model_max_context_percentage *
+                        config.chars_per_token_estimation)
 
     # If no information was retrieved from the vector database, end the generation
     # of the prompt.
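As a sanity check on the arithmetic, here is a minimal standalone sketch of how
the two new settings combine into the character budget computed in src/prompt.py.
The environment variable names and defaults are the ones added in this patch;
the 128,000-token context window and the local variable names are illustrative
assumptions, not values taken from the change:

    import os

    # Assumed context window of the generative model, in tokens (not part of this patch).
    generative_model_max_context = 128_000

    # Defaults mirror the fallbacks introduced in src/config.py.
    max_context_percentage = float(os.environ.get(
        "GENERATIVE_MODEL_MAX_CONTEXT_PERCENTAGE", 0.75))
    chars_per_token = int(os.environ.get("CHARS_PER_TOKEN_ESTIMATION", 3))

    # Same formula as the new code in src/prompt.py.
    approx_max_chars = (generative_model_max_context *
                        max_context_percentage *
                        chars_per_token)

    print(approx_max_chars)  # 288000.0 with the defaults: 128000 * 0.75 * 3

With the defaults unchanged, behaviour matches the old hard-coded constants
(3 characters per token, 0.75 of the context); operators can now tune both
knobs per deployment through the environment.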