From d726ecd7895abaa4475b931350d14db361823835 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1ir=C3=ADn=20Duffy?=
Date: Wed, 8 Oct 2025 23:40:54 -0400
Subject: [PATCH 1/5] Add MMLU-style multiple choice evaluation metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements two evaluation metrics for MMLU-style multiple choice
questions:
- mmlu_exact_match: Flexible letter extraction with regex patterns
- mmlu_strict_match: Strict single-letter exact matching

The metrics handle various response formats:
- Direct letter answers: "B"
- Sentence responses: "The answer is B"
- Formatted responses: "B) Code can survive..."

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
Signed-off-by: Máirín Duffy
---
 .../core/metrics/mmlu_style_eval.py | 215 ++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py

diff --git a/src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py b/src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py
new file mode 100644
index 00000000..adc680ee
--- /dev/null
+++ b/src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py
@@ -0,0 +1,215 @@
+"""MMLU-style multiple choice evaluation metrics."""
+
+import re
+from typing import Any, Optional
+
+from lightspeed_evaluation.core.models import EvaluationScope, TurnData
+
+
+class MultipleChoiceExactMatch:  # pylint: disable=too-few-public-methods
+    """Exact match metric for multiple choice questions (MMLU-style scoring).
+
+    Returns 1.0 for correct answer, 0.0 for incorrect.
+    """
+
+    def __init__(self, threshold: float = 1.0) -> None:
+        """Initialize metric.
+
+        Args:
+            threshold: Score threshold for passing (default: 1.0, meaning must be exact).
+        """
+        self.threshold = threshold
+
+    def evaluate(  # pylint: disable=unused-argument
+        self, response: str, expected_response: str, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Evaluate if the AI response matches the expected answer.
+
+        Args:
+            response: The AI's generated response.
+            expected_response: The correct answer (e.g., "A", "B", "C", or "D").
+            **kwargs: Additional arguments (ignored).
+
+        Returns:
+            Dict with 'score' (1.0 or 0.0) and 'reason' (explanation).
+        """
+        # Clean inputs
+        response_clean = response.strip().upper()
+        expected_clean = expected_response.strip().upper()
+
+        # Extract letter from response using regex
+        # Handles cases like:
+        # - "B"
+        # - "The answer is B"
+        # - "B) Code can survive..."
+        # - "I think B is correct"
+        letter_match = re.search(r"\b([ABCD])\b", response_clean)
+
+        if letter_match:
+            response_letter = letter_match.group(1)
+        else:
+            # No clear letter found, try first character
+            response_letter = response_clean[0] if response_clean else ""
+
+        # Compare
+        is_correct = response_letter == expected_clean
+        score = 1.0 if is_correct else 0.0
+
+        # Build explanation; truncate long responses in the preview so the
+        # expected/extracted details are always included in the reason.
+        preview = f"{response[:100]}..." if len(response) > 100 else response
+        reason = (
+            f"Expected: {expected_clean} | "
+            f"Extracted: {response_letter} | "
+            f"Result: {'✓ CORRECT' if is_correct else '✗ INCORRECT'} | "
+            f"Full response: '{preview}'"
+        )
+
+        return {"score": score, "reason": reason}
+
+
+class MultipleChoiceStrictMatch:  # pylint: disable=too-few-public-methods
+    """Stricter version requiring response to be exactly A, B, C, or D."""
+
+    def __init__(self, threshold: float = 1.0) -> None:
+        """Initialize metric.
+
+        Args:
+            threshold: Score threshold for passing (default: 1.0).
+ """ + self.threshold = threshold + + def evaluate( # pylint: disable=unused-argument + self, response: str, expected_response: str, **kwargs: Any + ) -> dict[str, Any]: + """Evaluate if response exactly matches expected answer. + + Args: + response: The AI's generated response. + expected_response: The correct answer (single letter). + **kwargs: Additional arguments (ignored). + + Returns: + Dict with 'score' (1.0 or 0.0) and 'reason' (explanation). + """ + response_clean = response.strip().upper() + expected_clean = expected_response.strip().upper() + + # Must be exactly one letter + is_correct = response_clean == expected_clean and len(response_clean) == 1 + score = 1.0 if is_correct else 0.0 + + return { + "score": score, + "reason": f"Expected exactly '{expected_clean}', got '{response_clean}'", + } + + +class MMLUMetrics: # pylint: disable=too-few-public-methods + """Custom MMLU-style metrics integrated with the evaluation framework.""" + + def __init__(self) -> None: + """Initialize MMLU metrics.""" + self.exact_match = MultipleChoiceExactMatch() + self.strict_match = MultipleChoiceStrictMatch() + + self.supported_metrics = { + "mmlu_exact_match": self._evaluate_exact_match, + "mmlu_strict_match": self._evaluate_strict_match, + } + + def evaluate( + self, + metric_name: str, + conv_data: Any, + scope: EvaluationScope, + ) -> tuple[Optional[float], str]: + """Evaluate an MMLU-style metric. + + Args: + metric_name: Name of the metric to evaluate. + conv_data: Conversation data (unused for MMLU metrics). + scope: Evaluation scope containing turn data. + + Returns: + Tuple of (score, reason) where score is between 0.0 and 1.0. + """ + if metric_name not in self.supported_metrics: + return None, f"Unsupported MMLU metric: {metric_name}" + + try: + return self.supported_metrics[metric_name]( + conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation + ) + except (ValueError, AttributeError, KeyError) as e: + return None, f"MMLU {metric_name} evaluation failed: {str(e)}" + + def _evaluate_exact_match( + self, + _conv_data: Any, + _turn_idx: Optional[int], + turn_data: Optional[TurnData], + is_conversation: bool, + ) -> tuple[Optional[float], str]: + """Evaluate using exact match with flexible letter extraction. + + Args: + _conv_data: Conversation data (unused). + _turn_idx: Turn index (unused). + turn_data: Turn data containing response and expected response. + is_conversation: Whether this is conversation-level evaluation. + + Returns: + Tuple of (score, reason). + """ + if is_conversation: + return None, "MMLU exact match is a turn-level metric" + + if turn_data is None: + return None, "TurnData is required for MMLU evaluation" + + if not turn_data.response: + return None, "Response is required for MMLU evaluation" + + if not turn_data.expected_response: + return None, "Expected response is required for MMLU evaluation" + + result = self.exact_match.evaluate( + turn_data.response, turn_data.expected_response + ) + return result["score"], result["reason"] + + def _evaluate_strict_match( + self, + _conv_data: Any, + _turn_idx: Optional[int], + turn_data: Optional[TurnData], + is_conversation: bool, + ) -> tuple[Optional[float], str]: + """Evaluate using strict exact match (single letter only). + + Args: + _conv_data: Conversation data (unused). + _turn_idx: Turn index (unused). + turn_data: Turn data containing response and expected response. + is_conversation: Whether this is conversation-level evaluation. + + Returns: + Tuple of (score, reason). 
+ """ + if is_conversation: + return None, "MMLU strict match is a turn-level metric" + + if turn_data is None: + return None, "TurnData is required for MMLU evaluation" + + if not turn_data.response: + return None, "Response is required for MMLU evaluation" + + if not turn_data.expected_response: + return None, "Expected response is required for MMLU evaluation" + + result = self.strict_match.evaluate( + turn_data.response, turn_data.expected_response + ) + return result["score"], result["reason"] From 0cdee14326e06f9468f42ca0b086670a70ebb6af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ir=C3=ADn=20Duffy?= Date: Thu, 9 Oct 2025 05:24:54 -0400 Subject: [PATCH 2/5] Register MMLU metrics with evaluation framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrates the MMLU-style multiple choice metrics into the CustomMetrics handler and adds metric definitions to system configuration. Changes: - Import MMLUMetrics in CustomMetrics class - Register multiple_choice_exact and multiple_choice_strict metrics - Add wrapper methods to delegate to MMLUMetrics evaluator - Update metric names from mmlu_* to multiple_choice_* for consistency - Add metric metadata to system.yaml for validation The metrics are now accessible via: - custom:multiple_choice_exact (flexible letter extraction) - custom:multiple_choice_strict (exact single-letter matching) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Signed-off-by: Máirín Duffy --- config/system.yaml | 31 ++++++++++++------- .../core/metrics/custom/custom.py | 4 +++ .../core/metrics/mmlu_style_eval.py | 4 +-- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/config/system.yaml b/config/system.yaml index bc45a079..52a23aed 100644 --- a/config/system.yaml +++ b/config/system.yaml @@ -2,8 +2,8 @@ # LLM as a judge configuration llm: - provider: "openai" # LLM Provider (openai, watsonx, gemini, hosted_vllm etc..) - model: "gpt-4o-mini" # Model name for the provider + provider: "gemini" # LLM Provider (openai, watsonx, gemini, hosted_vllm etc..) + model: "gemini-2.5-flash" # Model name for the provider temperature: 0.0 # Generation temperature max_tokens: 512 # Maximum tokens in response timeout: 300 # Request timeout in seconds @@ -11,29 +11,28 @@ llm: cache_dir: ".caches/llm_cache" # Directory with LLM cache cache_enabled: true # Is LLM cache enabled? -# Default embedding (for LLM as a judge) configuration: +# Default embedding configuration: embedding: - provider: "openai" - model: "text-embedding-3-small" + provider: "huggingface" + model: "sentence-transformers/all-MiniLM-L6-v2" provider_kwargs: {} cache_dir: ".caches/embedding_cache" cache_enabled: true - # API Configuration # To get real time data. Currently it supports lightspeed-stack API. # But can be easily integrated with other APIs with minimal change. 
api: enabled: true # Enable API calls instead of using pre-filled data - api_base: http://localhost:8080 # Base API URL - endpoint_type: streaming # Use "streaming" or "query" endpoint + api_base: http://localhost:8000 # Base API URL + endpoint_type: query # Use "streaming" or "query" endpoint timeout: 300 # API request timeout in seconds # API input configuration - provider: "openai" # LLM provider for queries - model: "gpt-4o-mini" # Model to use for queries - no_tools: null # Whether to bypass tools and MCP servers (optional) - system_prompt: null # System prompt (default None) +# provider: "openai" # LLM provider for queries +# model: "qwen2.5:7b-instruct" # Model to use for queries +# no_tools: null # Whether to bypass tools and MCP servers (optional) +# system_prompt: null # System prompt (default None) cache_dir: ".caches/api_cache" # Directory with lightspeed-stack cache cache_enabled: true # Is lightspeed-stack cache enabled? @@ -83,6 +82,14 @@ metrics_metadata: "custom:tool_eval": description: "Tool call evaluation comparing expected vs actual tool calls" + "custom:multiple_choice_exact": + threshold: 1.0 + description: "MMLU-style multiple choice exact match with flexible letter extraction" + + "custom:multiple_choice_strict": + threshold: 1.0 + description: "MMLU-style multiple choice strict match (single letter only)" + # Script-based metrics "script:action_eval": description: "Script-based evaluation for infrastructure/environment validation" diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py index 292ec335..d810e968 100644 --- a/src/lightspeed_evaluation/core/metrics/custom/custom.py +++ b/src/lightspeed_evaluation/core/metrics/custom/custom.py @@ -26,11 +26,14 @@ def __init__(self, llm_manager: LLMManager): self.llm = BaseCustomLLM( llm_manager.get_model_name(), llm_manager.get_llm_params() ) + self.mmlu_metrics = MMLUMetrics() self.supported_metrics = { "answer_correctness": self._evaluate_answer_correctness, "intent_eval": self._evaluate_intent, "tool_eval": self._evaluate_tool_calls, + "multiple_choice_exact": self._evaluate_mmlu_exact, + "multiple_choice_strict": self._evaluate_mmlu_strict, } print(f"✅ Custom Metrics initialized: {self.llm.model_name}") @@ -241,3 +244,4 @@ def _evaluate_intent( return score, reason except LLMError as e: return None, f"Intent evaluation failed: {str(e)}" + diff --git a/src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py b/src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py index adc680ee..4cd1506a 100644 --- a/src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py +++ b/src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py @@ -114,8 +114,8 @@ def __init__(self) -> None: self.strict_match = MultipleChoiceStrictMatch() self.supported_metrics = { - "mmlu_exact_match": self._evaluate_exact_match, - "mmlu_strict_match": self._evaluate_strict_match, + "multiple_choice_exact": self._evaluate_exact_match, + "multiple_choice_strict": self._evaluate_strict_match, } def evaluate( From 2083f5f0a7c9afddeba26218013e1cf6590d5b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ir=C3=ADn=20Duffy?= Date: Thu, 9 Oct 2025 05:27:08 -0400 Subject: [PATCH 3/5] Add MMLU evaluation example YAML MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provides example configuration for MMLU-style multiple choice evaluations demonstrating the custom:multiple_choice_exact metric usage. 
Example includes: - Multi-turn conversation with Red Hat training questions - Questions covering Vim editor and file management - Expected responses (A, B, C, D format) - Response field set to null for API-based evaluation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Signed-off-by: Máirín Duffy --- config/mmlu_example.yaml | 48 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 config/mmlu_example.yaml diff --git a/config/mmlu_example.yaml b/config/mmlu_example.yaml new file mode 100644 index 00000000..751f366f --- /dev/null +++ b/config/mmlu_example.yaml @@ -0,0 +1,48 @@ +- conversation_group_id: rh199_9.0_ch01 + description: Vim editor and file management + turns: + - turn_id: 849c1899-e729-4387-b5e4-7b87f4bfadc9 + query: 'What is the difference between the `vim-minimal` and `vim-enhanced` packages in Red Hat Enterprise Linux, and what features do they provide for editing text-based files? + + + A) `vim-enhanced` provides a more comprehensive set of features, an online help system, and a tutorial program, while `vim-minimal` offers a lightweight installation with core features and the basic `vi` command. + + B) `vim-enhanced` and `vim-minimal` provide the same set of features with slight variations in the online help system and tutorial program. + + C) `vim-minimal` provides a more comprehensive set of features, an online help system, and a tutorial program, while `vim-enhanced` offers a lightweight installation with core features and the basic `vi` command. + + D) `vim-enhanced` provides a lightweight installation with core features and the basic `vi` command, while `vim-minimal` offers a more comprehensive set of features with additional plugins.' + expected_response: A + response: null + turn_metrics: + - custom:multiple_choice_exact + - turn_id: 4320ff94-bf75-4dfc-b7d9-38f704ccf47d + query: 'How can you open a file for editing using the `vi` and `vim` commands in Red Hat Enterprise Linux? + + + A) vi --help or vim --help + + B) vi -r filename or vim -r filename + + C) vi filename or vim filename + + D) vi -w filename or vim -w filename' + expected_response: C + response: null + turn_metrics: + - custom:multiple_choice_exact + - turn_id: 68ddb5b7-c16c-4de2-afb6-a736c548fd52 + query: 'What are the different modes of operation in the Vim editor, and how can you move between them? + + + A) Vim has command mode, insert mode, visual mode, and extended command mode. You can move between them using ''i'' to enter insert mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode. + + B) Vim has command mode, insert mode, select mode, and extended command mode. You can move between them using ''i'' to enter select mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode. + + C) Vim has edit mode, insert mode, visual mode, and extended command mode. You can move between them using ''e'' to enter edit mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode. + + D) Vim has command mode, insert mode, visual mode, and search mode. You can move between them using ''i'' to enter insert mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and ''/'' to enter search mode from command mode.' 
+ expected_response: A + response: null + turn_metrics: + - custom:multiple_choice_exact From 09e83556b67f084de64d3e5ffc64390943c93070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ir=C3=ADn=20Duffy?= Date: Fri, 10 Oct 2025 13:31:28 -0400 Subject: [PATCH 4/5] Integrate mmlu-style with custom metric updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-rebase of mmlu-style-eval with custom metric updates. Signed-off-by: Máirín Duffy --- .../core/metrics/custom/custom.py | 47 +++++++++++++++++++ .../metrics/{ => custom}/mmlu_style_eval.py | 0 2 files changed, 47 insertions(+) rename src/lightspeed_evaluation/core/metrics/{ => custom}/mmlu_style_eval.py (100%) diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py index d810e968..4073bb2f 100644 --- a/src/lightspeed_evaluation/core/metrics/custom/custom.py +++ b/src/lightspeed_evaluation/core/metrics/custom/custom.py @@ -5,6 +5,7 @@ from lightspeed_evaluation.core.llm.custom import BaseCustomLLM from lightspeed_evaluation.core.llm.manager import LLMManager +from lightspeed_evaluation.core.metrics.custom.mmlu_style_eval import MMLUMetrics from lightspeed_evaluation.core.metrics.custom.prompts import ( ANSWER_CORRECTNESS_PROMPT, INTENT_EVALUATION_PROMPT, @@ -245,3 +246,49 @@ def _evaluate_intent( except LLMError as e: return None, f"Intent evaluation failed: {str(e)}" + def _evaluate_mmlu_exact( + self, + conv_data: Any, + turn_idx: Optional[int], + turn_data: Optional[TurnData], + is_conversation: bool, + ) -> tuple[Optional[float], str]: + """Evaluate using MMLU exact match metric. + + Args: + conv_data: Conversation data. + turn_idx: Turn index. + turn_data: Turn data containing response and expected response. + is_conversation: Whether this is conversation-level evaluation. + + Returns: + Tuple of (score, reason). + """ + scope = EvaluationScope( + turn_idx=turn_idx, turn_data=turn_data, is_conversation=is_conversation + ) + return self.mmlu_metrics.evaluate("multiple_choice_exact", conv_data, scope) + + def _evaluate_mmlu_strict( + self, + conv_data: Any, + turn_idx: Optional[int], + turn_data: Optional[TurnData], + is_conversation: bool, + ) -> tuple[Optional[float], str]: + """Evaluate using MMLU strict match metric. + + Args: + conv_data: Conversation data. + turn_idx: Turn index. + turn_data: Turn data containing response and expected response. + is_conversation: Whether this is conversation-level evaluation. + + Returns: + Tuple of (score, reason). + """ + scope = EvaluationScope( + turn_idx=turn_idx, turn_data=turn_data, is_conversation=is_conversation + ) + return self.mmlu_metrics.evaluate("multiple_choice_strict", conv_data, scope) + diff --git a/src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py b/src/lightspeed_evaluation/core/metrics/custom/mmlu_style_eval.py similarity index 100% rename from src/lightspeed_evaluation/core/metrics/mmlu_style_eval.py rename to src/lightspeed_evaluation/core/metrics/custom/mmlu_style_eval.py From c2d53e38ffa5670d02d52e452984a58c67457949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ir=C3=ADn=20Duffy?= Date: Fri, 10 Oct 2025 18:15:47 -0400 Subject: [PATCH 5/5] Update system.yaml with clean defaults and MMLU metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reset config to upstream defaults (openai provider, standard settings) and add MMLU multiple choice metric definitions. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Signed-off-by: Máirín Duffy --- config/system.yaml | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/config/system.yaml b/config/system.yaml index 52a23aed..1c7045f8 100644 --- a/config/system.yaml +++ b/config/system.yaml @@ -2,8 +2,8 @@ # LLM as a judge configuration llm: - provider: "gemini" # LLM Provider (openai, watsonx, gemini, hosted_vllm etc..) - model: "gemini-2.5-flash" # Model name for the provider + provider: "openai" # LLM Provider (openai, watsonx, gemini, hosted_vllm etc..) + model: "gpt-4o-mini" # Model name for the provider temperature: 0.0 # Generation temperature max_tokens: 512 # Maximum tokens in response timeout: 300 # Request timeout in seconds @@ -11,28 +11,29 @@ llm: cache_dir: ".caches/llm_cache" # Directory with LLM cache cache_enabled: true # Is LLM cache enabled? -# Default embedding configuration: +# Default embedding (for LLM as a judge) configuration: embedding: - provider: "huggingface" - model: "sentence-transformers/all-MiniLM-L6-v2" + provider: "openai" + model: "text-embedding-3-small" provider_kwargs: {} cache_dir: ".caches/embedding_cache" cache_enabled: true + # API Configuration # To get real time data. Currently it supports lightspeed-stack API. # But can be easily integrated with other APIs with minimal change. api: enabled: true # Enable API calls instead of using pre-filled data - api_base: http://localhost:8000 # Base API URL - endpoint_type: query # Use "streaming" or "query" endpoint + api_base: http://localhost:8080 # Base API URL + endpoint_type: streaming # Use "streaming" or "query" endpoint timeout: 300 # API request timeout in seconds # API input configuration -# provider: "openai" # LLM provider for queries -# model: "qwen2.5:7b-instruct" # Model to use for queries -# no_tools: null # Whether to bypass tools and MCP servers (optional) -# system_prompt: null # System prompt (default None) + provider: "openai" # LLM provider for queries + model: "gpt-4o-mini" # Model to use for queries + no_tools: null # Whether to bypass tools and MCP servers (optional) + system_prompt: null # System prompt (default None) cache_dir: ".caches/api_cache" # Directory with lightspeed-stack cache cache_enabled: true # Is lightspeed-stack cache enabled?
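
A minimal usage sketch, not part of the patches above, showing how the two
metric classes from PATCH 1 score the response formats called out in its
commit message. It assumes the post-rename module path from PATCH 4 and that
the `lightspeed_evaluation` package is importable; adjust the import if your
layout differs.

```python
# Hypothetical quick check of the MMLU-style metrics; the import path assumes
# the post-rename location introduced in PATCH 4.
from lightspeed_evaluation.core.metrics.custom.mmlu_style_eval import (
    MultipleChoiceExactMatch,
    MultipleChoiceStrictMatch,
)

exact = MultipleChoiceExactMatch()
strict = MultipleChoiceStrictMatch()

# Flexible metric: extracts the first standalone A/B/C/D from the response.
for response in ["B", "The answer is B", "B) Code can survive..."]:
    result = exact.evaluate(response, "B")
    print(f"{response!r}: score={result['score']}")  # 1.0 for all three

# Strict metric: only a bare single letter counts (case-insensitive).
print(strict.evaluate("b", "B")["score"])                # 1.0
print(strict.evaluate("The answer is B", "B")["score"])  # 0.0
```

In evaluation data, the same checks run through the registered names
`custom:multiple_choice_exact` and `custom:multiple_choice_strict` from
PATCH 2, as demonstrated by config/mmlu_example.yaml in PATCH 3.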