From a77e81faadf686f0e54dd9d868372217a89428bd Mon Sep 17 00:00:00 2001 From: ashah-aanya Date: Tue, 29 Jul 2025 17:03:47 -0700 Subject: [PATCH 01/10] Integrating the Re-prompting pipeline into the SDK --- aimon/reprompting_api/_init_.py | 0 aimon/reprompting_api/config.py | 59 ++ aimon/reprompting_api/pipeline.py | 543 ++++++++++++++++++ aimon/reprompting_api/reprompter.py | 218 +++++++ aimon/reprompting_api/runner.py | 76 +++ aimon/reprompting_api/telemetry.py | 75 +++ aimon/reprompting_api/tests/_init_.py | 0 .../tests/test_reprompting_cases.py | 172 ++++++ .../tests/test_reprompting_failures.py | 123 ++++ .../tests/test_reprompting_success.py | 106 ++++ aimon/reprompting_api/utils.py | 185 ++++++ 11 files changed, 1557 insertions(+) create mode 100644 aimon/reprompting_api/_init_.py create mode 100644 aimon/reprompting_api/config.py create mode 100644 aimon/reprompting_api/pipeline.py create mode 100644 aimon/reprompting_api/reprompter.py create mode 100644 aimon/reprompting_api/runner.py create mode 100644 aimon/reprompting_api/telemetry.py create mode 100644 aimon/reprompting_api/tests/_init_.py create mode 100644 aimon/reprompting_api/tests/test_reprompting_cases.py create mode 100644 aimon/reprompting_api/tests/test_reprompting_failures.py create mode 100644 aimon/reprompting_api/tests/test_reprompting_success.py create mode 100644 aimon/reprompting_api/utils.py diff --git a/aimon/reprompting_api/_init_.py b/aimon/reprompting_api/_init_.py new file mode 100644 index 0000000..e69de29 diff --git a/aimon/reprompting_api/config.py b/aimon/reprompting_api/config.py new file mode 100644 index 0000000..ba2bae8 --- /dev/null +++ b/aimon/reprompting_api/config.py @@ -0,0 +1,59 @@ +import os +from typing import Optional +from dataclasses import dataclass +import random +import string + +def generate_random_string(length: int) -> str: + """Generates a random string of letters and digits.""" + if not isinstance(length, int) or length <= 0: + raise ValueError("Length 
must be a positive integer.") + characters = string.ascii_letters + string.digits + return ''.join(random.choice(characters) for i in range(length)) + +class StopReasons: + ALL_INSTRUCTIONS_ADHERED = "all_instructions_adhered" + MAX_ITERATIONS_REACHED = "max_iterations_reached" + CONTINUE = "instructions_failed_continue_reprompting" + CONTINUE_TOXICITY = "toxicity_detect_continue_reprompting" + + ## limits + LATENCY_LIMIT_EXCEEDED = "latency_limit_exceeded" + + ##errors + REPROMPTING_FAILED = "reprompting_failed" + UNKNOWN_ERROR = "unknown_error" + +@dataclass +class RepromptingConfig: + """ + Configuration for the automated re-prompting pipeline. + + Attributes: + publish (bool): Whether to publish results to app.aimon.ai. + max_iterations (int): Maximum number of re-prompting iterations (1 initial + N retries). + aimon_api_key (Optional[str]): API key for AIMon integration. Defaults to "AIMON_API_KEY" env var. + model_name (Optional[str]): Model identifier for telemetry. Defaults to "aimon-react-model-{rand}". + application_name (Optional[str]): Application identifier for telemetry. Defaults to "aimon-react-application-{rand}". + return_telemetry (bool): Whether to include per-iteration telemetry in the response. + return_aimon_summary (bool): Whether to include a human-readable caption summarizing re-prompting. (e.g.: 2 iterations, 0 failed instructions) + latency_limit_ms (Optional[int]): Maximum cumulative latency (ms) before aborting. None = no limit. + user_model_max_retries (Optional[int]): Max retries for user model calls. Defaults to 2. + feedback_model_max_retries (Optional[int]): Max retries for feedback model calls. Defaults to 2. 
+ """ + publish: bool = False + max_iterations: int = 2 + if max_iterations < 1: + raise ValueError("Max iterations must be greater than 0") + aimon_api_key: Optional[str] = os.getenv("AIMON_API_KEY") or "default_api_key" + if aimon_api_key == "default_api_key": + raise ValueError("AIMON_API_KEY environment variable is not set and no fallback value is provided.") + model_name: Optional[str] = "aimon-react-model-" + generate_random_string(5) + application_name: Optional[str] = "aimon-react-application-" + generate_random_string(5) + return_telemetry: bool = False + return_aimon_summary: bool = False + latency_limit_ms: Optional[int] = None + user_model_max_retries: Optional[int] = 2 + feedback_model_max_retries: Optional[int] = 2 + + \ No newline at end of file diff --git a/aimon/reprompting_api/pipeline.py b/aimon/reprompting_api/pipeline.py new file mode 100644 index 0000000..61de0e8 --- /dev/null +++ b/aimon/reprompting_api/pipeline.py @@ -0,0 +1,543 @@ +from aimon.reprompting_api.config import RepromptingConfig, StopReasons +from aimon.reprompting_api.telemetry import TelemetryLogger +from aimon.reprompting_api.reprompter import Reprompter +from aimon.reprompting_api.utils import toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score +from aimon import Detect +import time +import random +from string import Template +import logging + +logger = logging.getLogger(__name__) + +class RepromptingPipeline: + """ + A pipeline for iterative re-prompting of LLM responses using AIMon evaluation. + + This pipeline orchestrates: + - Initial prompt generation for a given query, context, and user instructions. + - Interaction with a black-box LLM to generate responses. + - Evaluation of responses using AIMon detectors (instruction adherence, groundedness, toxicity). + - Iterative corrective re-prompting until stopping conditions are met. + - Collection and emission of telemetry for all iterations. 
+ + **Expected LLM function signature**: + llm_fn(recommended_prompt_template: Template, system_prompt: str, context: str, user_query: str) -> str + + Attributes: + llm_fn (callable): Function to call the LLM. Must be a Callable with + -recommended_prompt_template: Template + -system_prompt: str + -context: str + -user_query: str + + config (RepromptingConfig): Configuration object with API keys and iteration limits. + reprompter (Reprompter): Utility for generating corrective prompts based on evaluation feedback. + telemetry (TelemetryLogger): Logger for capturing telemetry data. + detect (Detect): AIMon detection client for evaluating model responses. + + Returns: + dict: + { + "best_response" (str): Best model response across all iterations. + "telemetry" (list, optional): Iteration-level telemetry if enabled. + "summary" (str, optional): Human-readable run summary if enabled. + } + """ + def __init__(self, llm_fn, config): + """ + Initialize pipeline with LLM callable and RepromptingConfig. + + Args: + llm_fn (callable): Function to call the LLM. + Signature: llm_fn(recommended_prompt_template: Template, system_prompt: str, context: str, user_query: str) -> str + config (RepromptingConfig): Configuration object with API keys and limits. 
+ + """ + self.llm_fn = llm_fn + self.config = config or RepromptingConfig() + + # Utilities for reprompting, telemetry, and scoring + self.reprompter = Reprompter() + self.telemetry = TelemetryLogger() + + # Initialize AIMon Detect for response evaluation + self.detect = Detect( + values_returned=['user_query', 'instructions', 'generated_text', 'context'], + config={ + "instruction_adherence": { + "detector_name": "default", + "explain": True, + "extract_from_system": False + }, + "groundedness": { + "detector_name": "default", + "explain": True + }, + "toxicity": { + "detector_name": "default", + "explain": True + } + }, + api_key=self.config.aimon_api_key, + application_name = self.config.application_name, + model_name = self.config.model_name, + publish=self.config.publish + ) + + def run(self, system_prompt: str, context: str, user_query: str, user_instructions): + """ + Execute the full re-prompting pipeline. + + Process: + 1. Build an initial prompt with query, context, and instructions. + 2. Call the LLM to generate a response. + 3. Evaluate the response with AIMon detectors. + 4. If violations are found, iteratively generate corrective prompts and re-prompt the LLM. + 5. Stop when all instructions are followed or iteration limits are reached. + 6. Return the best response (lowest residual error) along with telemetry and a summary if configured. + + Args: + user_query (str): The user's query or instruction. + context (str): Contextual information to include in the prompt. + user_instructions (list[str]): Instructions the model must follow. + + Returns: + dict: + { + "best_response" (str): Best model response from all iterations. + "telemetry" (list, optional): Telemetry for all iterations if enabled. + "summary" (str, optional): Summary of the process if enabled. 
+ } + """ + logger.info("Starting RepromptingPipeline run") + logger.debug(f"Inputs - System Prompt: {system_prompt}, Context: {context}, User Query: {user_query}, Instructions: {user_instructions}") + iteration_outputs = {} # key: iteration number → dict(response_text, residual_error_score, failed_instructions_count) + pipeline_start = time.time() + iteration_num = 1 + + curr_prompt = self._build_original_prompt() + logger.debug(f"Initial prompt template built: {curr_prompt.template}") + + + # First LLM call + curr_generated_text = self._call_llm(curr_prompt,self.config.user_model_max_retries, system_prompt, context, user_query) + logger.debug(f"Initial LLM response: {curr_generated_text}") + + + # Evaluate response with AIMon + curr_payload = self._build_aimon_payload(context, user_query, user_instructions, curr_generated_text, system_prompt) + curr_result = self._detect_aimon_response(curr_payload, self.config.feedback_model_max_retries) + logger.debug(f"AIMon evaluation result: {curr_result}") + + # Get scores and detailed feedback on failured instructions + scores, feedback = self.get_response_feedback(curr_result) + self._record_iteration_output(iteration_outputs, iteration_num, curr_generated_text, curr_result) + + # Iteratively re-prompt until conditions are met or limits reached + stop_reason = None + while True: + should_stop, stop_reason = self._should_stop_reprompting(curr_result, iteration_num, pipeline_start) + logger.info(f"Iteration {iteration_num}: Stop decision: {should_stop}, Reason: {stop_reason}") + if should_stop: + break + + # Emit telemetry for this iteration + self._emit_iteration_telemetry( + iteration_num, + pipeline_start, + scores, + feedback, + curr_result, + stop_reason or StopReasons.CONTINUE, + curr_prompt, + curr_generated_text, + ) + + # Generate corrective prompt + curr_prompt = self._build_corrective_prompt(curr_payload, curr_result) + + # Retry LLM call with corrective prompt + curr_generated_text = 
self._call_llm(curr_prompt,self.config.user_model_max_retries, system_prompt, context, user_query) + # Re-evaluate the new response + curr_payload = self._build_aimon_payload(context, user_query, user_instructions, curr_generated_text, system_prompt) + curr_result = self._detect_aimon_response(curr_payload, self.config.feedback_model_max_retries) + + # Extract updated scores and feedback + scores, feedback = self.get_response_feedback(curr_result) + iteration_num += 1 + self._record_iteration_output(iteration_outputs, iteration_num, curr_generated_text, curr_result) + + # Final telemetry after loop exit + self._emit_iteration_telemetry( + iteration_num, + pipeline_start, + scores, + feedback, + curr_result, + stop_reason or StopReasons.UNKNOWN_ERROR, + curr_prompt, + curr_generated_text, + ) + + # Select best response across all iterations + best_output, best_failed_count = self._select_best_iteration(iteration_outputs) + + # Build final response payload + response = {"best_response": best_output} + if self.config.return_telemetry: + response["telemetry"] = self.telemetry.get_all() + if self.config.return_aimon_summary: + response["summary"] = self._gen_summary(iteration_num, best_failed_count) + + logger.info("RepromptingPipeline run completed") + logger.info(f"Best response selected with {best_failed_count} failed instructions remaining.") + + return response + + def _build_original_prompt(self) -> Template: + """ + Build a reusable template for combining system_prompt, context, and user_query. + This returns a string.Template object so the caller can safely substitute values. + + Placeholders: + - system_prompt + - context + - user_query + + Returns: + Template: A string.Template for building the base LLM prompt. 
+ """ + template_str = ( + "System:\n${system_prompt}\n\n" + "Context:\n${context}\n\n" + "User Query:\n${user_query}" + ) + return Template(template_str) + + def _build_aimon_payload(self, context, user_query, user_instructions, generated_text, system_prompt): + """ + Constructs AIMon input payload. + + Args: + context (str): Context for the LLM. + user_query (str): The user's query. + user_instructions (list[str]): Instructions for the model. + generated_text (str): The model's generated response. + + Returns: + dict: Payload for AIMon evaluation. + """ + if not isinstance(user_instructions, list): + user_instructions = [] + payload = { + 'context': context, + 'user_query': user_query, + 'generated_text': generated_text, + 'instructions': user_instructions, + 'system_prompt' : system_prompt + } + return payload + + def _call_llm(self, prompt_template: Template, max_attempts, system_prompt=None, context=None, user_query=None, base_delay=1): + """ + Calls the LLM with exponential backoff. Retries if the LLM call fails + OR returns a non-string value. Raises an exception if all retries fail. + + Args: + prompt_template (Template): Prompt template for the LLM. + max_attempts (int): Max retry attempts. + base_delay (float): Initial delay in seconds before backoff. + + Returns: + str: LLM response text. + + Raises: + RuntimeError: If the LLM call fails or returns an invalid type after all retries. 
+ """ + last_exception = None + for attempt in range(max_attempts): + try: + logger.debug(f"LLM call attempt {attempt+1} with prompt template.") + result = self.llm_fn(prompt_template, system_prompt, context, user_query) + # Validate type + if not isinstance(result, str): + raise TypeError(f"LLM returned invalid type {type(result).__name__}, expected str.") + return result + except Exception as e: + last_exception = e + logger.warning(f"LLM call failed on attempt {attempt+1}: {e}") + wait_time = base_delay * (2 ** attempt) + random.uniform(0, 0.1) + time.sleep(wait_time) + raise RuntimeError(f"LLM call failed or returned invalid type after maximum retries. Last error: {last_exception}") + + def _detect_aimon_response(self, payload, max_attempts, base_delay=1): + """ + Calls AIMon Detect with exponential backoff and returns the detection result. + + This method wraps the AIMon evaluation call, retrying if it fails due to transient + errors (e.g., network issues or temporary service unavailability). It retries up to + `max_attempts` times with exponential backoff before raising a RuntimeError. + + Args: + payload (dict): A dictionary containing 'context', 'user_query', + 'instructions', and 'generated_text' for evaluation. + max_attempts (int): Maximum number of retry attempts. + base_delay (float): Initial delay in seconds before exponential backoff. + + Returns: + object: The AIMon detection result containing evaluation scores and feedback. + + Raises: + RuntimeError: If AIMon Detect fails after all retry attempts. 
+ """ + aimon_context = f"{payload['context']}\n\nUser Query:\n{payload['user_query']}" + aimon_query = f"{payload['user_query']}\n\nInstructions:\n{payload['instructions']}" + + @self.detect + def run_detection(query, instructions, generated_text, context): + return query, instructions, generated_text, context + + for attempt in range(max_attempts): + try: + logger.debug(f"AIMon detect attempt {attempt+1} with payload: {payload}") + _, _, _, _, result = run_detection( + aimon_query, + payload['instructions'], + payload['generated_text'], + aimon_context + ) + return result + except Exception as e: + logger.debug(f"AIMon detect failed on attempt {attempt+1}: {e}") + wait_time = base_delay * (2 ** attempt) + random.uniform(0, 0.1) + time.sleep(wait_time) + raise RuntimeError("AIMon detect call failed after maximum retries.") + + def get_response_feedback(self, result): + """ + Extract groundedness and instruction adherence scores and failed instructions. + + Args: + result (object): AIMon detection result. + + Returns: + tuple: (scores (dict), failed_instructions (list)) + """ + scores = { + "groundedness": result.detect_response.groundedness.get("score", 0.0), + "instruction_adherence": result.detect_response.instruction_adherence.get("score", 0.0) + } + feedback = get_failed_instructions(result) + return scores, feedback + + def _build_corrective_prompt(self, payload, result): + """ + Generate a corrective prompt using AIMon evaluation results. + + Args: + payload (dict): AIMon input payload. + result (object): AIMon detection result. + + Returns: + str: A corrective prompt for re-prompting the LLM. + """ + return self.reprompter.create_corrective_prompt(result, payload) + + def _should_stop_reprompting(self, result, iteration_num, pipeline_start): + """ + Determine whether to stop re-prompting. + + Stopping conditions: + - Max iterations reached. + - Latency budget 75% depleted + - All instructions are adhered to. 
+ - Otherwise, continue if violations or toxicity remain. + + Args: + result (object): AIMon detection result. + iteration_num (int): Current iteration number. + + Returns: + tuple: + (should_stop (bool), stop_reason (str or None)) + """ + # Max iterations reached + if iteration_num >= self.config.max_iterations: + return True, StopReasons.MAX_ITERATIONS_REACHED + + latency_limit_ms = self.config.latency_limit_ms + if latency_limit_ms is not None: + cumulative_latency = self._get_cumulative_latency(pipeline_start) + if cumulative_latency > ((0.75) * latency_limit_ms): + return True, StopReasons.LATENCY_LIMIT_EXCEEDED + + # Continue if toxicity is detected + if toxicity_check(result): + return False, StopReasons.CONTINUE_TOXICITY + + # Continue if there are still failed instructions + if get_failed_instructions_count(result) > 0: + return False, StopReasons.CONTINUE + + # All instructions followed + return True, StopReasons.ALL_INSTRUCTIONS_ADHERED + + def _select_best_iteration(self, iteration_outputs): + """ + Selects the best iteration based on the lowest residual error score. + + Args: + iteration_outputs (dict): Mapping of iteration_num -> iteration data. + + Returns: + tuple: (best_output (str), best_failed_count (int)) + """ + valid_iterations = [ + entry for entry in iteration_outputs.values() + if isinstance(entry.get("residual_error_score"), (int, float)) + ] + if not valid_iterations: + return "[ERROR: No valid response]", None + + best_iteration = min(valid_iterations, key=lambda x: x["residual_error_score"]) + return best_iteration["response_text"], best_iteration["failed_instructions_count"] + + def _gen_summary(self, iteration_num, best_failed_count): + """ + Generate a human-readable summary for the pipeline run. + e.g.: "2 iterations, 0 failed instructions remaining" + + Args: + iteration_num (int): Number of iterations performed. + best_failed_count (int): Number of failed instructions in the best response. + + Returns: + str: Summary. 
+ """ + iteration_word = "iteration" if iteration_num == 1 else "iterations" + summary = f"{iteration_num} {iteration_word}, {best_failed_count} failed instructions remaining" + return summary + + def _build_telemetry_entry( + self, + iteration, + cumulative_latency, + scores, + feedback, + residual_error, + failed_count, + stop_reason, + prompt, + response_text, + ): + """ + Build a structured telemetry entry for an iteration. + + Args: + iteration (int): Iteration number. + cumulative_latency (float): Total latency in milliseconds so far. + scores (dict): Evaluation scores. + feedback (list): Failed instruction feedback. + residual_error (float): Residual error score. + failed_count (int): Number of failed instructions. + stop_reason (str): Reason for stopping. + prompt (str): Prompt used for this iteration. + response_text (str): Model's response. + + Returns: + dict: Structured telemetry entry. + """ + return { + "iteration": iteration, + "cumulative_latency_ms": cumulative_latency, + "scores": scores, + "response_feedback": feedback, + "residual_error": residual_error, + "failed_instructions_count": failed_count, + "stop_reason": stop_reason, + "prompt": prompt, + "response_text": response_text, + } + + def _emit_iteration_telemetry( + self, + iteration_num, + pipeline_start, + scores, + feedback, + curr_result, + stop_reason, + curr_prompt, + curr_generated_text, + ): + """ + Build and emit telemetry for an iteration. Calculates cumulative latency. + + Args: + iteration_num (int): Current iteration number. + pipeline_start (float): Start time of the pipeline (epoch). + scores (dict): Evaluation scores. + feedback (list): Failed instruction feedback. + curr_result (object): AIMon detection result. + stop_reason (str): Reason for stopping or continuing. + curr_prompt (str): Prompt used. + curr_generated_text (str): Model response text. + + Returns: + dict: The telemetry entry. 
+ """ + cumulative_latency_ms = self._get_cumulative_latency(pipeline_start) + + residual_error = get_residual_error_score(curr_result) if curr_result else None + failed_count = get_failed_instructions_count(curr_result) if curr_result else None + + prompt_text = curr_prompt.template + + entry = self._build_telemetry_entry( + iteration_num, + cumulative_latency_ms, + scores, + feedback, + residual_error, + failed_count, + stop_reason, + prompt_text, + curr_generated_text, + ) + try: + self.telemetry.emit(**entry) + except Exception as e: + logger.warning(f"[Warning] Telemetry emission failed: {e}") + return entry + + def _get_cumulative_latency(self, pipeline_start): + """ + Calculate cumulative latency since pipeline start. + + Args: + pipeline_start (float): Start time of the pipeline (epoch). + + Returns: + float: Cumulative latency in milliseconds. + """ + return (time.time() - pipeline_start) * 1000 + + def _record_iteration_output(self, iteration_outputs, iteration_num, generated_text, result): + """ + Record iteration outputs for later selection of the best response. + + Args: + iteration_outputs (dict): Stores outputs per iteration. + iteration_num (int): Current iteration number. + generated_text (str): Model's generated response. + result (object): AIMon detection result. 
+ + Returns: + None + """ + iteration_outputs[iteration_num] = { + "response_text": generated_text, + "residual_error_score": get_residual_error_score(result), + "failed_instructions_count": get_failed_instructions_count(result) + } \ No newline at end of file diff --git a/aimon/reprompting_api/reprompter.py b/aimon/reprompting_api/reprompter.py new file mode 100644 index 0000000..e229621 --- /dev/null +++ b/aimon/reprompting_api/reprompter.py @@ -0,0 +1,218 @@ +from aimon.reprompting_api.utils import get_failed_instructions_count, get_failed_instructions, get_failed_toxicity_instructions +from string import Template +import logging + +logger = logging.getLogger(__name__) + +class Reprompter: + """ + Generates a template for corrective reprompting for improving LLM responses + based on AIMon evaluation results. This class combines failed instruction + feedback and background information to trigger iterative + improvement prompts for stateless LLMs. + The template is designed to accept substitutions for system_prompt, user_query, and context. + + Designed for use in open-source contexts where developers may want to + customize the prompt structure or language. + """ + + def create_corrective_prompt(self, result, aimon_payload: dict) -> Template: + """ + Build a corrective prompt **template** for the next LLM response. + + Placeholders: + {system_prompt} – The original system prompt + {user_query} – The user query + {context} – The context string + + Args: + result: AIMon detection result object. + aimon_payload (dict): Original payload containing: + - 'system_prompt' (str) + - 'user_query' (str) + - 'context' (str) + - 'generated_text' (str) + - 'instructions' (list[str]) + + Returns: + Template: A string.Template object (with placeholders for substitution). 
+ """ + try: + failed_instructions = get_failed_instructions(result) + failed_count = get_failed_instructions_count(result) + logger.debug(f"Failed instructions ({failed_count}): {failed_instructions}") + + tone = self.determine_tone(failed_count) + toxicity_feedback = self.get_toxicity_reprompt(result) + failed_instructions_reprompt = self.format_failed_instructions(failed_instructions, toxicity_feedback) + passed_instructions = self.format_passed_instructions(self.get_passed_instructions(result, aimon_payload)) + generated_text = aimon_payload.get('generated_text', '') + + # Build template string (placeholders for substitution) + template_str = ( + "Original system prompt:\n" + "${system_prompt}\n\n" + "Revise your previous response to this query:\n" + "${user_query}\n\n" + "Context:\n" + "${context}\n\n" + "Previous response:\n" + f"{generated_text}\n\n" + f"{tone}\n\n" + f"{failed_instructions_reprompt}\n\n" + "Preserve correct content. Return only the revised output with no extra explanation.\n" + f"{passed_instructions}\n" + ) + logger.debug(f"Generated corrective prompt template:\n{template_str}") + return Template(template_str) + except Exception as e: + logger.error(f"Error generating corrective prompt: {e}") + raise RuntimeError( + f"Corrective prompt template generation failed: {type(e).__name__} — {e}" + ) from e + + def get_toxicity_reprompt(self, result) -> str: + """ + Generate feedback for detected toxicity failures in the following format: + Your reply contained toxic content. Remove any harmful, abusive, or unsafe language. + 1. We are X% confident that your response had the following issue: + → Violation: "..." + → Explanation: "..." + + Args: + result: AIMon detection result. + + Returns: + str: Toxicity-specific feedback, or None if no toxicity detected. 
+ """ + try: + failed_instructions = get_failed_toxicity_instructions(result) + if not failed_instructions: + return "" + logger.info(f"Toxicity violations detected: {len(failed_instructions)}") + lines = ["Your reply contained toxic content. Remove any harmful, abusive, or unsafe language."] + for i, failed_instruction in enumerate(failed_instructions, start=1): + confidence = failed_instruction.get("score", 0.0) * 100 + confidence_str = f"{confidence:.2f}%" + lines.append( + f"{i}. We are {confidence_str} confident that your response had the following issue:\n" + f"→ Violation: \"{failed_instruction.get('instruction', '[Unknown]')}\"\n" + f"→ Explanation: {failed_instruction.get('explanation', '[No explanation provided]')}\n" + ) + return "\n\n".join(lines) + except Exception as e: + logger.error(f"Error generating toxicity feedback: {e}") + return "" + + def get_reprompt_per_instruction(self, failed_instruction): + """ + Corrective feedback for a single failed instruction in the following format: + 1. We are X% confident that the following instruction was not followed: + → Violated Instruction: "..." + → Explanation: "..." + + Args: + failed_instruction (dict): Failed instruction data containing: + - 'instruction' (str) + - 'score' (float) + - 'explanation' (str) + + Returns: + str: Formatted feedback for the failed instruction. 
+ """ + try: + confidence = (1.0 - failed_instruction.get("score", 0.0)) * 100 + confidence_str = f"{confidence:.2f}%" + return ( + f" We are {confidence_str} confident that the following instruction was not followed:\n" + f"→ Violated Instruction: \"{failed_instruction.get('instruction', '[Unknown]')}\"\n" + f"→ Explanation: {failed_instruction.get('explanation', '[No explanation provided]')}\n" + ) + except Exception as e: + logger.error(f"Error formatting failed instruction: {e}") + raise RuntimeError( + f"Corrective prompt generation failed: Unexpected error of type {type(e).__name__} — {e}" + ) from e + + def format_failed_instructions(self, failed_instructions, toxicity_feedback: str = None): + """ + Combine toxicity feedback with general failed instructions into a formatted block. + + Args: + failed_instructions (list): List of failed instruction dictionaries. + toxicity_feedback (str, optional): Pre-generated toxicity feedback block. + + Returns: + str: Combined formatted feedback string. + """ + lines = [] + if toxicity_feedback: + lines.append(toxicity_feedback) + if failed_instructions: + lines.append("Fix the following:") + for i, error in enumerate(failed_instructions, start=1): + lines.append(f"{i}. {self.get_reprompt_per_instruction(error)}") + if not lines: + return "No major issues." + return "\n\n".join(lines) + + def get_passed_instructions(self, result, aimon_payload): + """ + Retrieve instructions that passed all adherence and groundedness checks. + + Args: + result: AIMon detection result. + aimon_payload (dict): Original payload containing the full instruction list. + + Returns: + list: Passed instruction strings. 
+ """ + try: + all_instructions = aimon_payload.get("instructions", []) + failed_instructions = {item["instruction"] for item in get_failed_instructions(result)} + return [instr for instr in all_instructions if instr not in failed_instructions] + except Exception as e: + logger.error(f"Error determining passed instructions: {e}") + return [] + + def format_passed_instructions(self, passed_instructions) -> str: + """ + Format passed instructions to reinforce adherence in the next iteration. + + Args: + passed_instructions (list): Passed instruction strings. + + Returns: + str: Formatted reminder block for passed instructions. + """ + if not passed_instructions: + return "" + return ( + "You did well on these instructions. It is important that you continue to follow these instructions:\n" + + "\n".join(f"- {instr}" for instr in passed_instructions) + ) + + def determine_tone(self, failed_count: int) -> str: + """ + Decide the corrective prompt tone based on failure severity: + + if failed instructions >= 3: + Your reply had major issues. Fix all points below. + if failed instructions between 2 and 3: + Some parts were off. Improve using the notes below. + if less than 2: + Almost there! Just a few small fixes needed. + + Args: + failed_count (int): Total number of failed instructions. + + Returns: + str: Tone-setting string for the corrective prompt. + """ + if failed_count >= 3: + return "Your reply had major issues. Fix all points below." + elif failed_count >= 2: + return "Some parts were off. Improve using the notes below." + else: + return "Almost there. Just a few small fixes needed." + diff --git a/aimon/reprompting_api/runner.py b/aimon/reprompting_api/runner.py new file mode 100644 index 0000000..0116cca --- /dev/null +++ b/aimon/reprompting_api/runner.py @@ -0,0 +1,76 @@ +""" +runner.py — This module provides a high-level function (`run_reprompting_pipeline`) +for executing AIMon's iterative re-prompting workflow. 
def run_reprompting_pipeline(
    llm_fn,
    user_query: str,
    system_prompt: str = None,
    context: str = None,
    user_instructions: List[str] = None,
    reprompting_config: RepromptingConfig = None,
) -> dict:
    """
    Validate and normalize inputs, then run a `RepromptingPipeline`.

    Steps, in order:
        1. Use `reprompting_config` when it is truthy, otherwise build a default
           `RepromptingConfig()`.
        2. Reject a non-callable `llm_fn` and an empty or non-string `user_query`.
        3. Replace an empty, missing or non-string `context` / `system_prompt`
           with the placeholders `"[no context provided]"` and
           `"[no system prompt provided]"`.
        4. Build the pipeline and return whatever its `run(...)` returns.

    Args:
        llm_fn: Callable used by the pipeline to query the LLM. Only callability
            is checked here.
        user_query (str): The user's query. Must be a non-empty string.
        system_prompt (str, optional): System-level instruction string.
        context (str, optional): Supplemental context for the LLM.
        user_instructions (List[str], optional): Instructions for the model to
            follow. A falsy value is passed on as an empty list.
        reprompting_config (RepromptingConfig, optional): Pipeline configuration.

    Returns:
        dict: The value returned by `RepromptingPipeline.run`.

    Raises:
        TypeError: If `llm_fn` is not callable.
        ValueError: If `user_query` is empty or not a string.
    """
    # The config is resolved before any validation, so a default config is
    # constructed even when the later checks reject the call.
    config = reprompting_config if reprompting_config else RepromptingConfig()

    if not callable(llm_fn):
        raise TypeError("llm_fn must be a callable that returns a string.")

    if not (user_query and isinstance(user_query, str)):
        raise ValueError("user_query must be a non-empty string.")

    # Anything that is not a non-empty string is swapped for a placeholder.
    if not (context and isinstance(context, str)):
        context = "[no context provided]"
    if not (system_prompt and isinstance(system_prompt, str)):
        system_prompt = "[no system prompt provided]"

    instructions = user_instructions or []

    pipeline = RepromptingPipeline(llm_fn=llm_fn, config=config)
    return pipeline.run(
        system_prompt=system_prompt,
        context=context,
        user_query=user_query,
        user_instructions=instructions,
    )
class TelemetryLogger:
    """
    A lightweight logger for recording telemetry events during re-prompting pipeline execution.

    Entries are kept in an in-memory list (`memory_store`) and handed back by
    `get_all`. Each logger instance gets its own random `session_id`.
    """

    def __init__(self):
        """Initialize an in-memory telemetry logger with a fresh session id."""
        self.session_id = str(uuid.uuid4())
        self.memory_store = []

    def emit(
        self,
        iteration: int,
        cumulative_latency_ms: float,
        scores: dict,
        response_feedback: dict,
        residual_error: float,
        failed_instructions_count: int,
        stop_reason: str,
        response_text: str,
        prompt: str = "",
    ):
        """
        Append a single telemetry entry to the in-memory store.

        Args:
            iteration (int): The iteration number of the pipeline.
            cumulative_latency_ms (float): Total latency from pipeline start (ms).
            scores (dict): Evaluation scores.
            response_feedback (dict): Feedback for failed instructions.
            residual_error (float): Residual error score.
            failed_instructions_count (int): Count of instructions not followed.
            stop_reason (str): Reason for stopping or continuing.
            response_text (str): The raw text response from the LLM.
            prompt (str): The prompt text used for this iteration.
        """
        # Imported locally so this block does not depend on a module-level import change.
        from datetime import timezone

        # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
        # "now" and drop tzinfo so the rendered string keeps the exact
        # "<naive ISO timestamp>Z" shape that was emitted before.
        utc_now = datetime.now(timezone.utc).replace(tzinfo=None)

        telemetry = {
            # not returned (keys starting with "_" are stripped by get_all by default)
            "_timestamp": utc_now.isoformat() + "Z",
            "_session_id": self.session_id,
            # returned
            "iteration": iteration,
            "cumulative_latency_ms": cumulative_latency_ms,
            "scores": scores,
            "response_feedback": response_feedback,
            "residual_error": residual_error,
            "failed_instructions_count": failed_instructions_count,
            "stop_reason": stop_reason,
            # NOTE(review): key is spelled "promp_template" (sic). Left unchanged
            # because consumers of the telemetry may already read this key.
            "promp_template": prompt,
            "response_text": response_text,
        }
        self.memory_store.append(telemetry)

    def get_all(self, include_meta=False):
        """
        Return all recorded telemetry entries.

        Args:
            include_meta (bool): If True, the stored entries are returned as-is,
                including the "_timestamp" and "_session_id" keys. Defaults to False.

        Returns:
            list: The internal list itself when `include_meta` is True; otherwise
            a new list of copies with every key starting with "_" removed.
        """
        if include_meta:
            return self.memory_store
        # Strip out keys starting with "_" for external return
        return [
            {key: value for key, value in entry.items() if not key.startswith("_")}
            for entry in self.memory_store
        ]
@pytest.fixture
def config_iteration_limit():
    """
    Build a RepromptingConfig with telemetry and summary enabled and
    ``max_iterations`` set to -1.

    NOTE(review): -1 presumably exercises handling of an out-of-range
    iteration limit; confirm against RepromptingConfig / the pipeline.
    """
    settings = {
        "aimon_api_key": AIMON_API_KEY,
        "publish": False,
        "return_telemetry": True,
        "return_aimon_summary": True,
        "application_name": "api_test",
        "max_iterations": -1,
    }
    return RepromptingConfig(**settings)
@pytest.mark.integration
def test_with_system_prompt(my_llm, base_config):
    """Run the pipeline with an explicit system prompt and check all result keys are present."""
    instructions = [
        "use the letter e only",
        "do not use the letter e",
        "use a neutral tone",
    ]
    result = run_reprompting_pipeline(
        llm_fn=my_llm,
        user_query="What's the policy?",
        system_prompt="this is a system prompt",
        context="Context for telemetry disabled run",
        user_instructions=instructions,
        reprompting_config=base_config,
    )
    # base_config enables both telemetry and summary, so all three keys are expected.
    for expected_key in ("best_response", "telemetry", "summary"):
        assert expected_key in result
def llm_fn_failure(prompt_template: Template, system_prompt=None, context=None, user_query=None) -> str:
    """Stand-in LLM function that never returns: every call raises RuntimeError."""
    # All arguments are ignored; the failure is unconditional.
    failure = RuntimeError("LLM call failed intentionally for testing")
    raise failure
@pytest.mark.integration
def test_empty_query():
    """An empty user_query must be rejected with a ValueError."""
    # Built outside the `raises` block so only the pipeline call is under test.
    config = get_config()
    expected_message = "user_query must be a non-empty string"
    with pytest.raises(ValueError, match=expected_message):
        run_reprompting_pipeline(
            llm_fn=my_llm,
            user_query="",
            context="",
            user_instructions=[],
            reprompting_config=config,
        )
def my_llm(recommended_prompt_template: Template, system_prompt=None, context=None, user_query=None) -> str:
    """
    Example LLM function.

    Fills the given `string.Template` with `system_prompt`, `context` and
    `user_query` (each falling back to an empty string when falsy), sends the
    result as a single user message through the module-level Together client,
    and returns the text of the first choice. A 200-character preview of the
    prompt and of the response is logged at INFO level.
    """
    substitutions = {
        "system_prompt": system_prompt or "",
        "context": context or "",
        "user_query": user_query or "",
    }
    filled_prompt = recommended_prompt_template.substitute(**substitutions)
    logger.info(f"[LLM] Sending prompt to model: {filled_prompt[:200]}...")  # Log preview

    chat_messages = [{"role": "user", "content": filled_prompt}]
    response = client.chat.completions.create(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        messages=chat_messages,
        max_tokens=256,
        temperature=0,
    )

    output = response.choices[0].message.content
    logger.info(f"[LLM] Received response: {output[:200]}...")
    return output
logger.info("[Pipeline] Config prepared.") + + user_query = "what are the drug tiers?" + context = "[SECTION] 📘 BlueShield Rx Policy Addendum: 2023–2025 ... (truncated for brevity)" + user_instructions = [ + "Avoid overly technical or robotic phrasing; keep the tone human and accessible.", + "Ensure the response is direct and professional, with minimal informal tone.", + "Translate or simplify technical details from the context into accurate, user-friendly explanations.", + "Don't use the letter e", + "only use the letter e" + ] + logger.info(f"[Pipeline] User query: {user_query}") + logger.info(f"[Pipeline] Context: {context[:100]}...") + logger.info(f"[Pipeline] Instructions: {user_instructions}") + + # Run pipeline + result = run_reprompting_pipeline( + llm_fn=my_llm, + user_query=user_query, + system_prompt="here is a system prompt", + context=context, + user_instructions=user_instructions, + reprompting_config=config + ) + + # Log each part of the result + logger.info("[Pipeline] Run complete.") + logger.info(f"[Pipeline] Best Response: {result['best_response']}") + logger.info(f"[Pipeline] Telemetry: {result.get('telemetry')}") + logger.info(f"[Pipeline] Summary: {result.get('summary')}") + + # Print outputs for inspection + print("\n=== BEST RESPONSE ===") + print(result["best_response"]) + print("\n=== TELEMETRY ===") + print(result.get("telemetry")) + print("\n=== SUMMARY ===") + print(result.get("summary")) + +# --- Entry Point --- +if __name__ == "__main__": + test_successful_run() diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py new file mode 100644 index 0000000..380ef87 --- /dev/null +++ b/aimon/reprompting_api/utils.py @@ -0,0 +1,185 @@ +""" +utils.py — Utility functions for processing AIMon reprompting detector results. + +This module provides helper functions for: +- Extracting failed instructions across instruction adherence, groundedness, and toxicity detectors. 
# toxicity threshold for AIMon detection; Follow probabilities above this are considered failures
TOXICITY_THRESHOLD = 0.25


def _count_toxicity_failures(result) -> int:
    """
    Count the toxicity instructions whose follow probability exceeds the threshold.

    A missing or None `toxicity` section, or a missing or None
    `instructions_list`, is treated as "no toxicity instructions" and
    yields 0 instead of raising.

    Args:
        result: AIMon detection result; `result.detect_response.toxicity` is
            read as a dict with an optional "instructions_list".

    Returns:
        int: Number of entries with follow_probability > TOXICITY_THRESHOLD.
    """
    # getattr with a default mirrors how the other detector sections are read;
    # `or {}` additionally covers a section that is present but None.
    toxicity = getattr(result.detect_response, "toxicity", None) or {}
    instructions = toxicity.get("instructions_list") or []
    return sum(
        1
        for inst in instructions
        # Strictly greater than: a probability equal to the threshold passes.
        if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD
    )


def toxicity_check(result) -> bool:
    """
    Check whether any toxicity instruction exceeds the threshold.

    Args:
        result: AIMon detection result (see `_count_toxicity_failures`).

    Returns:
        bool: True if at least one toxicity instruction exceeds
        TOXICITY_THRESHOLD, False otherwise (including when the toxicity
        section is absent).
    """
    return _count_toxicity_failures(result) > 0
def get_failed_instructions(result) -> List[dict]:
    """
    Extract failed instructions from the instruction-adherence and groundedness detectors.

    An instruction counts as failed when its "label" is present and falsy
    (a missing label is treated as passed). The toxicity section is not read
    by this function. A detector section that is missing or None, or whose
    "instructions_list" is missing or None, contributes no entries.

    Args:
        result: AIMon detection result; `result.detect_response` is read for the
            `instruction_adherence` and `groundedness` attributes.

    Returns:
        List[dict]: Failed instructions, sorted by score in descending order, each with:
            - type (str): "instruction_adherence_failure" or "groundedness_failure".
            - source (str): "instruction_adherence" or "groundedness".
            - instruction (str): The instruction text ("" if absent).
            - score (float): Follow probability (0.0 if absent).
            - explanation (str): Explanation for the failure ("" if absent).
    """
    failed = []
    for source in ("instruction_adherence", "groundedness"):
        # `or {}` / `or []` cover sections that exist but are None, which a
        # plain getattr default does not.
        section = getattr(result.detect_response, source, None) or {}
        for inst in section.get("instructions_list") or []:
            if inst.get("label", True):
                continue
            failed.append({
                "type": f"{source}_failure",
                "source": source,
                "instruction": inst.get("instruction", ""),
                "score": inst.get("follow_probability", 0.0),
                "explanation": inst.get("explanation", ""),
            })
    # Highest follow probability first; the sort is stable, so ties keep
    # detector order (adherence before groundedness).
    failed.sort(key=lambda item: item["score"], reverse=True)
    return failed
def penalized_average(probs: List[float]) -> float:
    """
    Compute a penalized average of follow probabilities.

    Each probability p contributes a penalty of (1 - p); the penalty is
    doubled when p < 0.5. The result is the mean of these penalties, so it
    lies in [0, 2] for probabilities in [0, 1] and is not clamped here.

    Args:
        probs (List[float]): A list of follow probabilities.

    Returns:
        float: Penalized average, or 0.0 for an empty list (no instructions
        means no penalty) instead of dividing by zero.
    """
    if not probs:
        return 0.0
    total_penalty = sum(
        (1 - p) if p >= 0.5 else (1 - p) * 2  # heavier penalty below 0.5
        for p in probs
    )
    return total_penalty / len(probs)
print("\n==== TELEMETRY ====", flush=True) + print(result.get("telemetry"), flush=True) + print("\n==== SUMMARY ====", flush=True) + print(result.get("summary"), flush=True) + print("===== END OF RESULT =====\n", flush=True) + # --- Tests --- @pytest.mark.integration def test_low_latency_limit(my_llm, config_low_latency): + """Test stopping behavior when latency limit is very low (100ms).""" result = run_reprompting_pipeline( - user_query="Test latency limit termination return", - context="Context", + user_query="We just received a Cal/OSHA citation for emergency workplace safety violations with a 15-day correction deadline. What are our options to appeal or resolve this without facing business closure?", + context="(Form DOSH-TRN-2025) [SECTION] Section 4: Consequences of Non-Compliance [SECTION] Failing to act may result in: [SECTION] - Daily penalties of $500 per violation after deadline [SECTION] - Business closure orders for willful violations [SECTION] - 300% penalty enhancements for repeat offenses [SECTION] Section 5: Recommended Resolution Path [SECTION] For fastest resolution: [SECTION] 1. Correct all hazards within 10 days [SECTION] 2. Submit Form DOSH-RESP-2025 with evidence [SECTION] 3. Request compliance verification inspection [SECTION] Remember: Appeal filings don’t suspend correction deadlines—address hazards immediately while preserving your appeal rights.[SECTION] California Business Emergency Preparedness Compliance Guide – 2025 Edition [SECTION] Document Ref: CA-BEP-2025-09 / Effective July 1, 2025 [SECTION] Section 1: Mandatory Earthquake Preparedness Protocol [SECTION] All California businesses with 10+ employees must maintain an approved earthquake readiness plan under CA Labor Code §6401.7. Don't worry—we'll walk you through each requirement step by step. [SECTION] 1. 
Plan Submission: [SECTION] - Complete Form BEP-22 (Earthquake Preparedness Certification) [SECTION] - Submit via CalOES Business Portal or mail to: [SECTION] Office of Emergency Services [SECTION] Business Compliance Division [SECTION] P.O. Box 419047 [SECTION] Sacramento, CA 95841 [SECTION] - Deadline: Within 30 days of plan creation or update [SECTION] 2. Employee Training: [SECTION] - Conduct quarterly drills using state-approved materials (Reference Guide BEP-TM-2025) [SECTION] - Maintain signed attendance records (Form BEP-23) for 3 years [SECTION] - New hires must complete training within 14 days of employment [SECTION] 3. Emergency Supplies: [SECTION] - Minimum 3-day water supply (1 gallon per person per day) [SECTION] - First aid kits meeting ANSI/ISEA Z308.1-2025 standards [SECTION] - Emergency lighting for all exits (tested monthly) [SECTION] - Battery-powered NOAA weather radio [SECTION] Section 2: Compliance Verification Process [SECTION] To schedule verification: [SECTION] 1. Create account on CalOES Business Portal (portal.caloes.ca.gov) [SECTION] 2. Submit Inspection Request Form BEP-30 [SECTION] 3. Pay verification fee: [SECTION] - $150 standard fee [SECTION] - $75 for businesses with 10-24 employees [SECTION] - Waived for non-profits with Form BEP-31 [SECTION] 4. 
Prepare for inspection: [SECTION] - Printed copy of Form BEP-22 submission receipt [SECTION] - Training records (Forms BEP-23 and BEP-23A) [SECTION] - Current supply inventory log (Form BEP-24) [SECTION] Section 3: Non-Compliance Penalties [SECTION] - First violation: $500 fine + mandatory corrective action plan (Form BEP-40) [SECTION] - Subsequent violations: $1,000 per occurrence [SECTION] - Willful neglect: Up to $5,000 + business license suspension [SECTION] - False certification: Misdemeanor charge under CA Business Code §17200 [SECTION] Section 6: Penalties for Non-Compliance [SECTION] Failure to properly report may result in: [SECTION] - Fines up to $25,000 per violation under Labor Code §6429. [SECTION] - Business license suspension for repeat offenses. [SECTION] - Potential criminal charges for willful violations causing harm. [SECTION] Section 7: Additional Resources [SECTION] - Cal/OSHA Spill Response Guidelines: dir.ca.gov/dosh/chemical-spills.html [SECTION] - 24/7 Emergency Consultation: 1-800-963-9424 [SECTION] - Small Business Compliance Assistance: 1-800-400-7115[SECTION] California Business Energy Efficiency Installation Guide – 2025 Edition [SECTION] ## Section 1: Understanding Your CBESP Installation Requirements [SECTION] You're asking about installing energy efficiency equipment under the CBESP program—let's walk through this step by step. The process is straightforward when you follow these guidelines. We recommend starting with a certified site assessment to maximize your benefits. [SECTION] ## Section 2: Pre-Installation Checklist [SECTION] Before installation begins: [SECTION] 1. Verify Equipment Eligibility: Check the CBESP-approved equipment list (available at cbesp.ca.gov/equipment) for your business category. For example, commercial kitchens must use NSF-certified refrigeration units. [SECTION] 2. 
Schedule Site Assessment: Contact a CBESP-certified assessor within 5 business days—this helps identify the best equipment placement and potential rebates. [SECTION] 3. Submit Installation Plan: Complete Form CBESP-IP-2025 with details like equipment specifications and safety protocols. [SECTION] ## Section 3: Installation Protocol [SECTION] Follow these key steps: [SECTION] 1. Power Isolation: Shut off circuits following standard safety procedures (typically 1-2 hours for most facilities). [SECTION] 2. Equipment Mounting: Use manufacturer-specified brackets with proper bracing in seismic zones. [SECTION] 3. Wiring Compliance: Follow all electrical safety requirements—consult an electrician if unsure. [SECTION] ## Section 4: Post-Installation Process [SECTION] 1. Functional Testing: Run diagnostic cycles for 24-48 hours to confirm everything works properly. [SECTION] 2. Documentation: Submit Form ICF-2025 within 3 business days of completion. [SECTION] 3. Inspector Scheduling: Request CBESP verification within 14 calendar days to avoid reinspection fees. [SECTION] ## Section 5: Important Deadlines [SECTION] - Assessment scheduling: Within 5 business days of equipment purchase [SECTION] - Form submission: 3 business days post-installation [SECTION] - Verification request: Within 14 calendar days of completion [SECTION] ## Section 6: Non-Compliance Consequences [SECTION] Not following procedures may result in:SECTION] 3. Referral to the California Franchise Tax Board for collection [SECTION] Section 5: Urgent Support Resources [SECTION] For time-sensitive applications: [SECTION] 1. Call the Disaster Relief Hotline: 1-800-CA-BIZ-AID (option 2 for expedited processing) [SECTION] 2. Email emergency@ca-bfa.gov with subject line URGENT: [Your Business Name] [SECTION] 3. 
Visit designated disaster recovery centers (list at ca-bfa.gov/locations) [SECTION] Note: Applications submitted after the 14-day window will be processed under standard timelines (4–6 weeks).[SECTION] California Business Licensing Compliance Guide – 2025 Edition [SECTION] Document Ref: CA-BLC-2025-12 / Effective July 1, 2025 [SECTION] Section 1: Renewal Notice Verification [SECTION] 1. Submit Form BLC-ADDR-2025 through our online portal or by mail to verify your address on file. Processing takes 3 business days. [SECTION] 2. If your address is correct but the notice hasn't arrived, request a duplicate notice by calling 1-800-CA-BIZLIC or visiting any Regional Business License Center. [SECTION] 3. For urgent renewals, submit Form BLC-RENEW-2025 with a written explanation of the missing notice. [SECTION] Section 2: Required Renewal Documentation [SECTION] To complete your renewal, you must provide: [SECTION] 1. Form BLC-RENEW-2025 with Sections A, C, and E completed [SECTION] 2. Your current business license number from prior correspondence [SECTION] 3. Proof of local zoning compliance (Form ZON-2025 or equivalent) [SECTION] Section 3: Late Renewal Procedures [SECTION] Renewals submitted within 30 days of expiration require: [SECTION] 1. Payment of $50 late fee via Form BLC-LATE-2025 [SECTION] 2. Submission of all standard renewal documents [SECTION] 3. Written justification for delay [SECTION] After 30 days, you must submit Form BLC-NEW-2025 for full reapplication. [SECTION] Section 4: Operating Without Renewal [SECTION] Continuing business operations with an expired license may result in: [SECTION] 1. Fines of $500 per violation [SECTION] 2. Mandatory compliance review (Form BLC-COMP-2025) [SECTION] 3. 
Potential license suspension for repeated violations", llm_fn=my_llm, reprompting_config=config_low_latency, - user_instructions=[] + user_instructions=["Use sentence structures that sound natural in spoken English, avoiding overly formal or stilted constructions.","Do not suggest actions that bypass or undermine official company processes (e.g., “You could just submit a new request under a different name”); mentioning formal appeal or exception procedures is acceptable if supported by context."] ) + print_result("Low Latency Limit Test (100ms)", result) assert "best_response" in result @pytest.mark.integration def test_latency_limit(my_llm, config_high_latency): + """Test behavior with a high latency limit and contradictory instructions.""" result = run_reprompting_pipeline( - user_query="What is context?", - context="Context", + user_query="I manage data compliance for a healthcare startup using CloudSync's Enterprise tier. We need to ensure all patient-related files are automatically deleted after 7 years to meet HIPAA requirements, while maintaining the ability to recover accidentally deleted files for at least 30 days. Can CloudSync support this workflow, and what configuration changes would we need to make?", + context="[SECTION] # Data Retention and Privacy Policy for CloudSync Services [SECTION] ## Overview [SECTION] This document outlines the data retention, storage, and privacy practices for CloudSync, a cloud-based file synchronization service. It applies to all users of the Free, Pro, and Enterprise tiers unless superseded by a signed Enterprise Agreement. [SECTION] ## Data Retention Periods [SECTION] - **Active Accounts:** User data is retained indefinitely unless manually deleted by the user or via automated compliance workflows (e.g., legal hold expiration). [SECTION] - **Inactive Accounts:** Accounts with no login activity for 24 months are flagged for review. 
After 30 days in review status, data is permanently deleted unless an extension is requested via Form CS-102. [SECTION] - **Deleted Files:** Files removed from the trash bin are retained in a recoverable state for 30 days. After this period, they are irreversibly purged from all systems. [SECTION] ## Privacy Controls [SECTION] Users can configure: [SECTION] - Automatic deletion of files older than a specified age (1, 3, or 5 years) [SECTION] - Geographic restrictions for data storage (US, EU, or APAC regions) [SECTION] - Access logs retention (default 90 days, extendable to 365 days for Enterprise) [SECTION] ## Legal and Compliance [SECTION] - Data may be retained beyond standard periods if subject to litigation hold, subpoena, or regulatory investigation. Users will be notified unless prohibited by law. [SECTION] - Third-party audits are conducted annually for SOC 2 Type II compliance. Reports available upon request for Enterprise customers. [SECTION] - Data sovereignty requirements are enforced at the storage layer; metadata may transit through global networks for synchronization purposes. [SECTION] ## Data Recovery [SECTION] - Enterprise accounts can request point-in-time recovery for entire accounts (14-day window) or individual files (30-day window). [SECTION] - Recovery operations incur a $150 base fee plus $0.03/GB for data restored. [SECTION] ## Policy Changes [SECTION] - Notice of material changes will be provided 60 days in advance via email and in-app notifications. 
[SECTION] - Continued use of the service after changes take effect constitutes acceptance of the new terms.", llm_fn=my_llm, reprompting_config=config_high_latency, user_instructions=["Do not use the letter e", "Only use the letter e"] ) + print_result("High Latency Limit Test (5000ms)", result) assert "best_response" in result @pytest.mark.integration def test_iteration_limit(my_llm, config_iteration_limit): + """Test behavior when max_iterations is unlimited (-1).""" result = run_reprompting_pipeline( - user_query="What's the policy?", - context="hi", + user_query="Our systems are showing vulnerability alerts but we can't find the patch file in the vendor portal. What should we do?", + system_prompt="Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or emojis)", + context="[SECTION] # Emergency Software Patch Installation Guide [SECTION] ## Critical Security Patch Notification – CVE-2025-1234 [SECTION] This notice applies to all systems running versions prior to 3.2.1. Immediate action is required under Cybersecurity Policy CP-2025-07 to address authentication vulnerabilities. [SECTION] ## Patch Acquisition Procedures [SECTION] 1. Verify your system meets the requirements using Form LIC-VER-15 [SECTION] 2. Access the Emergency Patch Portal at epp.vendor.com/alert/CVE-2025-1234 [SECTION] 3. If the patch isn't available, submit Form PATCH-REQ-22 for manual distribution [SECTION] ## Installation Process [SECTION] We understand urgent updates can be stressful—here's how to proceed safely: [SECTION] 1. First, create a complete system backup [SECTION] 2. Run the patch installer with administrator privileges [SECTION] 3. 
Check the system logs to confirm successful installation [SECTION] ## Post-Installation Steps [SECTION] - Submit Form EP-ACK-22 within 24 hours [SECTION] - Retain installation records for 90 days [SECTION] - Schedule a security scan within 7 days [SECTION] ## Support Options [SECTION] - 24/7 Technical Support: 1-800-PATCH-HELP [SECTION] - Priority assistance: Submit Form IRF-89 with CRITICAL flag", llm_fn=my_llm, reprompting_config=config_iteration_limit, - user_instructions=[] + user_instructions=["do not use the letter e","only use the letter e"] ) + print_result("Iteration Limit Test (-1 = unlimited)", result) assert "best_response" in result @pytest.mark.integration def test_empty_context_and_instructions(my_llm, base_config): + """Ensure pipeline works with no context, instructions, or system prompt.""" result = run_reprompting_pipeline( - user_query="What's the policy?", + user_query="Testing with empty context, instructions, and system prompt", context="", llm_fn=my_llm, reprompting_config=base_config, user_instructions=[] ) + print_result("Empty Context & Instructions Test", result) assert "best_response" in result @pytest.mark.integration def test_no_telemetry(my_llm, config_without_telemetry): + """Confirm telemetry and summary are excluded when disabled in config.""" result = run_reprompting_pipeline( - user_query="What's the policy?", - context="Context for telemetry disabled run", + user_query="I keep getting 401 errors when trying to connect to your API. 
What should I check first?", + context="[SECTION] API Integration Troubleshooting Guide – Version 2.1 [SECTION] Document Ref: API-TS-2025-07 / Issued March 2025 [SECTION] Step 1: Verify Authentication Details [SECTION] To resolve 401 errors, first check these key items: [SECTION] - Ensure your API key has exactly 32 characters [SECTION] - Confirm the key is active in your Developer Portal account [SECTION] - Check that your IP address is whitelisted if required [SECTION] Step 2: Gather Required Information [SECTION] For support cases, prepare: [SECTION] - Screenshot of the error message [SECTION] - Recent API call logs [SECTION] - Your account ID and integration details [SECTION] Step 3: Submit Support Request [SECTION] You can submit your request through: [SECTION] - The Developer Portal ticket system (fastest response) [SECTION] - Email to api-support@company.com with 401 Error in subject [SECTION] Typical response time is 2 business days. [SECTION] Step 4: After Resolution [SECTION] Once fixed: [SECTION] - Update your integration settings [SECTION] - Keep records of the troubleshooting process [SECTION] For immediate help: [SECTION] Call 1-800-API-HELP (24/7 for Priority customers) [SECTION] Email: api-support@company.com[SECTION] API Rate Limit and Throttling Policy – 2025 Update [SECTION] Document Ref: API-POL-2025-07 / Effective March 2025 [SECTION] Section 1: Standard Rate Limits [SECTION] The following rate limits apply to all API endpoints unless otherwise specified in your service tier agreement: [SECTION] - Free Tier: 100 requests per minute, 1,000 requests per day [SECTION] - Business Tier: 500 requests per minute, 10,000 requests per day [SECTION] - Enterprise Tier: Custom limits negotiated per contract [SECTION] Section 2: Throttling Behavior [SECTION] When limits are exceeded: [SECTION] 1. First violation: API returns HTTP 429 (Too Many Requests) with Retry-After header [SECTION] 2. 
Subsequent violations within 24 hours: Temporary suspension for 1 hour [SECTION] 3. Chronic violations (3+ in 7 days): Account review and potential permanent rate reduction [SECTION] Section 3: Best Practices for Avoiding Throttling [SECTION] To maintain optimal API performance: [SECTION] 1. Implement exponential backoff when receiving 429 responses [SECTION] 2. Cache responses where possible (ETag headers supported on all GET endpoints) [SECTION] 3. Use batch endpoints instead of individual calls for bulk operations [SECTION] 4. Monitor usage via the X-RateLimit-Remaining header [SECTION] Section 4: Consequences of Policy Violations [SECTION] Repeated throttling may result in: [SECTION] 1. Temporary API key revocation [SECTION] 2. Mandatory migration to higher service tier [SECTION] 3. Suspension of account privileges pending review [SECTION] Section 5: Monitoring and Alerts [SECTION] Configure usage alerts through: [SECTION] 1. Dashboard notifications (available in Account Settings) [SECTION] 2. Webhook integrations (documented in API Guide Section 12.4) [SECTION] 3. Email warnings at 75% and 90% of daily limits[SECTION] API Rate Limit and Throttling Policy – Version 2.1 [SECTION] Effective Date: March 2025 [SECTION] This document outlines the rate limits, throttling policies, and escalation procedures for the Enterprise API tier. All API calls are subject to these limits unless otherwise specified in a signed Service Level Agreement (SLA). 
[SECTION] ## Rate Limit Tiers [SECTION] - **Standard Tier:** 1,000 requests per minute (RPM) across all endpoints [SECTION] - **High-Capacity Tier:** 5,000 RPM, available for an additional $200/month fee [SECTION] - **Burst Capacity:** Temporary spikes up to 2x your tier limit for 5-minute intervals, max twice per hour [SECTION] ## Throttling Behavior [SECTION] When limits are exceeded: [SECTION] - First violation: API returns HTTP 429 with Retry-After header (typically 30 seconds) [SECTION] - Repeated violations within 1 hour: 15-minute cool-down period enforced [SECTION] - Chronic violations (3+ incidents/day): Account review and potential downgrade to Standard Tier [SECTION] ## Urgent Limit Increase Process [SECTION] Follow these steps to resolve your rate limit issue: [SECTION] 1. **Submit Request:** Log into the API Dashboard and navigate to Manage Quotas [SECTION] 2. **Select Priority:** Choose Urgent and provide: [SECTION] - Business justification for the increase [SECTION] - Expected call volume during peak hours [SECTION] - Required duration (maximum 72 hours) [SECTION] 3. **Payment:** Submit the $75 expedited processing fee [SECTION] 4. 
**Activation:** Approved increases take effect within 2 hours of submissionSECTION] ## Consequences of Not Following Process [SECTION] - Unapproved workarounds may trigger account suspension [SECTION] - Repeated urgent requests may require SLA upgrade [SECTION] - All limit changes are logged and audited [SECTION] ## Recommended Action [SECTION] For immediate relief while waiting for approval: [SECTION] - Implement client-side retry logic with exponential backoff [SECTION] - Distribute calls across multiple API keys if available [SECTION] - Schedule non-critical requests during off-peak hours [SECTION] ## Support Resources [SECTION] - Emergency Support: api-support@example.com (Subject: URGENT - Rate Limit) [SECTION] - Real-time Monitoring: api.example.com/status [SECTION] - Documentation: docs.example.com/api/rate-limits [SECTION] ## Note on Eligibility [SECTION] Temporary increases are granted based on system capacity and historical usage patterns. Approval is not guaranteed for accounts with frequent violation history.[SECTION] API Rate Limit and Throttling Policy – 2025 Update [SECTION] Document Ref: API-POL-2025-07 / Effective March 2025 [SECTION] Section 1: Standard Rate Limits [SECTION] The following rate limits apply to all API endpoints unless otherwise specified in your service tier agreement: [SECTION] - Free Tier: 100 requests per minute, 1,000 requests per day [SECTION] - Business Tier: 500 requests per minute, 10,000 requests per day [SECTION] - Enterprise Tier: Custom limits negotiated per contract [SECTION] Section 2: Throttling Behavior [SECTION] When limits are exceeded: [SECTION] 1. First violation: API returns HTTP 429 (Too Many Requests) with Retry-After header [SECTION] 2. Subsequent violations within 24 hours: Temporary suspension for 1 hour [SECTION] 3. 
Chronic violations (3+ in 7 days): Account review and potential permanent rate reduction [SECTION] Section 3: Best Practices for Avoiding Throttling [SECTION] To maintain optimal API performance: [SECTION] 1. Implement exponential backoff when receiving 429 responses [SECTION] 2. Cache responses where possible (ETag headers supported on all GET endpoints) [SECTION] 3. Use batch endpoints instead of individual calls for bulk operations [SECTION] 4. Monitor usage via the X-RateLimit-Remaining header [SECTION] Section 4: Consequences of Policy Violations [SECTION] Repeated throttling may result in: [SECTION] 1. Temporary API key revocation [SECTION] 2. Mandatory migration to higher service tier [SECTION] 3. Suspension of account privileges pending review [SECTION] Section 5: Monitoring and Alerts [SECTION] Configure usage alerts through: [SECTION] 1. Dashboard notifications (available in Account Settings) [SECTION] 2. Webhook integrations (documented in API Guide Section 12.4) [SECTION] 3. Email warnings at 75% and 90% of daily limits[SECTION] API Rate Limit Enforcement Policy – California Department of Technology (2025 Revision) [SECTION] Document Ref: CDT-API-2025-09 / Effective March 1, 2025 [SECTION] Section 1: Rate Limit Thresholds [SECTION] All API endpoints enforce the following limits per client IP: [SECTION] - Standard Tier: 100 requests per minute [SECTION] - Elevated Tier: 500 requests per minute (requires Form API-T2 submitted 5 business days in advance) [SECTION] - Emergency Tier: 1,000 requests per minute (requires Form API-EMG with justification; valid for 72 hours) [SECTION] Section 2: Violation Consequences [SECTION] Exceeding rate limits triggers these automated responses: [SECTION] 1. First violation: HTTP 429 response with Retry-After header (60 seconds) [SECTION] 2. Second violation within 24 hours: 15-minute suspension [SECTION] 3. 
Third violation within 7 days: Account review and potential permanent blacklisting [SECTION] Section 3: Immediate Resolution Steps [SECTION] If your API access is suspended: [SECTION] 1. Check your request logs for spikes using CDT API Monitor (Form API-MON required for access) [SECTION] 2. Implement exponential backoff with jitter in your client code [SECTION] 3. Submit Form API-RES with: [SECTION] a. Incident timeline [SECTION] b. Corrective action plan [SECTION] c. Client IP ranges needing whitelisting [SECTION] Section 4: Permanent Blacklist Appeals [SECTION] To contest a blacklisting decision: [SECTION] 1. File Form API-APL within 10 business days [SECTION] 2. Provide technical documentation proving compliance with: [SECTION] a. California Code §11546.45 (API Fair Use) [SECTION] b. CDT Technical Bulletin 2025-7 (Throttling Best Practices) [SECTION] 3. Await review by the API Governance Board (5-10 business days) [SECTION] Note: Emergency service applications may qualify for expedited review if submitting Form API-EMG with wildfire response documentation.[SECTION] API Rate Limit Policy – Enterprise Developer Portal [SECTION] Document Ref: API-POL-2025-03 / Effective June 2025 [SECTION] Section 1: Understanding Your Rate Limit Issue [SECTION] Your application hitting rate limits during peak hours is a common challenge. Let's walk through the steps to resolve this while maintaining API access. [SECTION] Section 2: Immediate Actions [SECTION] 1. Review your API-LOG-78 reports to identify: [SECTION] * Which endpoints are exceeding limits [SECTION] * Time patterns of high traffic [SECTION] 2. Implement request throttling per Technical Bulletin API-TB-104 [SECTION] 3. For temporary relief, submit Form API-RL-22 for a 72-hour limit increase [SECTION] Section 3: Long-Term Solutions [SECTION] To permanently increase your rate limits: [SECTION] 1. 
Submit Form API-HC-45 with: [SECTION] * Business justification for higher limits [SECTION] * 30 days of API-LOG-78 reports [SECTION] 2. Our team will review within 5 business days [SECTION] Section 4: Best Practices [SECTION] * Spread high-volume requests evenly throughout the day [SECTION] * Cache responses where possible [SECTION] * Consider upgrading to Enterprise Tier (5,000 RPM) [SECTION] Section 5: What to Avoid [SECTION] Repeated violations may lead to: [SECTION] * Temporary API key suspension [SECTION] * Mandatory compliance review (Form API-COMP-19) [SECTION] * Service tier downgrade [SECTION] Section 6: Recommended Next Steps [SECTION] We recommend starting with Form API-RL-22 for immediate relief while preparing your materials for a permanent tier upgrade. Our support team is available to review your API-LOG-78 reports if needed.[SECTION] API Service Level Agreement – Version 2.1 [SECTION] Effective Date: March 15, 2025 [SECTION] Section 1: Service Availability Standards [SECTION] The API maintains 99.9% monthly uptime excluding scheduled maintenance. Downtime incidents exceeding 30 consecutive minutes qualify for service credit compensation under Section 7. [SECTION] Section 2: Incident Reporting Protocol [SECTION] To report 503 errors: [SECTION] 1. Document the incident with: [SECTION] - Exact timestamps (UTC) [SECTION] - Affected endpoint URLs [SECTION] - HTTP response headers [SECTION] 2. Complete Form API-INC-2025 (available in Developer Portal > Support) [SECTION] - Attach redacted error logs [SECTION] - Include business impact assessment [SECTION] 3. Submit within 60 minutes of first occurrence for priority handling [SECTION] Section 3: Investigation Timeline [SECTION] Upon submission: [SECTION] 1. Initial response within 15 minutes (email confirmation with Case ID) [SECTION] 2. Severity assessment using Priority Matrix API-PM-2025 within 30 minutes [SECTION] 3. 
Hourly status updates posted to Case ID portal [SECTION] Section 4: Resolution Procedures [SECTION] For confirmed outages: [SECTION] 1. Emergency patch deployment within 4 hours for Severity 1 incidents [SECTION] 2. Full root cause analysis report within 3 business days [SECTION] 3. Post-mortem review available upon request (Form API-PM-2025) [SECTION] Section 5: Compensation Policy [SECTION] Service credits apply as follows: [SECTION] - 5% of monthly fee for 30-59 minutes downtime [SECTION] - 10% for 1-2 hours [SECTION] - 15% for 2-4 hours [SECTION] Note: Credits require submission of Form API-CR-2025 within 7 calendar days. [SECTION] Section 6: Penalties for False Reports [SECTION] Misrepresented incidents may result in: [SECTION] - Suspension of incident reporting privileges for 30 days [SECTION] - Forfeiture of accrued service credits [SECTION] - Administrative fee of $250 per invalid claim [SECTION] Section 7: Emergency Contacts [SECTION] For unresolved Severity 1 incidents after 2 hours: [SECTION] 1. Primary: api-emergency@company.com (monitored 24/7) [SECTION] 2. Secondary: +1-800-555-API1 (follow voice prompts for engineer dispatch) [SECTION] 3. Escalation: File Form API-ESC-2025 with VP of Engineering CC [SECTION] Section 8: Preventive Measures [SECTION] Recommended best practices: [SECTION] - Implement exponential backoff with jitter [SECTION] - Monitor status.company.com for real-time updates [SECTION] - Maintain fallback endpoints per API-FB-2025 guidelines[SECTION] API Usage and Rate Limit Policy – Version 2025.1 [SECTION] We understand hitting rate limits can disrupt your workflow—let's review your options to resolve this. 
[SECTION] SECTION 1: CURRENT LIMITS AND UPGRADE PATHS [SECTION] Your Free Tier currently allows: [SECTION] - 100 requests per minute [SECTION] - 5,000 requests per day [SECTION] To increase these limits, consider: [SECTION] Option 1: Basic Tier ($49/month) [SECTION] - 500 requests per minute [SECTION] - 25,000 requests per day [SECTION] - Immediate activation via Developer Portal > Billing [SECTION] Option 2: Enterprise Tier [SECTION] - Custom limits tailored to your needs [SECTION] - Requires signed agreement (Form API-ENT-2025) [SECTION] - Contact sales@company.com for consultation [SECTION] SECTION 2: TEMPORARY LIMIT INCREASE [SECTION] If you need short-term relief: [SECTION] 1. Submit Form API-LIMIT-REQ through Developer Portal [SECTION] 2. Provide: [SECTION] - Technical justification (e.g., expected traffic spikes) [SECTION] - Duration needed (max 30 days) [SECTION] 3. Typical approval time: 2 business days [SECTION] SECTION 3: AVOIDING FUTURE ISSUES [SECTION] Best practices we recommend: [SECTION] - Implement exponential backoff (sample code in Dev Docs Section 4.3) [SECTION] - Cache responses using ETag headers [SECTION] - Monitor usage via Dashboard > API Analytics [SECTION] SECTION 4: IMPORTANT NOTES [SECTION] - Repeated violations (5+/month) may trigger account review [SECTION] - Emergency after-hours support: Submit Priority Ticket with 'Rate Limit' tag [SECTION] - Full policy details available in Document API-POL-2025.1 [SECTION] SECTION 5: IMMEDIATE HELP [SECTION] For urgent assistance: [SECTION] - Live chat: Developer Portal > Support [SECTION] - Phone: 1-800-555-API1 (Mon-Fri 9AM-5PM ET) [SECTION] Remember: Our team is here to help you scale efficiently while maintaining system stability for all users", llm_fn=my_llm, reprompting_config=config_without_telemetry, - user_instructions=[] + user_instructions=["Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or 
emojis)","Provide a detailed, ordered explanation of a process with at least three sequential steps.","Avoid expressions of uncertainty about company policies and eliminate vague or speculative phrases (e.g., “I think we cover that”)."] ) + print_result("No Telemetry Test", result) assert "telemetry" not in result assert "summary" not in result @pytest.mark.integration def test_no_system_prompt(my_llm, base_config): + """Test behavior when system prompt is excluded.""" result = run_reprompting_pipeline( - user_query="What's the policy?", - context="Context for telemetry disabled run", + user_query="What penalties can credit bureaus face if they don’t fix errors on my report, and how do I make sure they take my dispute seriously?", + context="[SECTION] California Credit Reporting & Dispute Resolution Policy – 2025 Update [SECTION] Document Ref: CA-CRDP-2025-09 / Effective January 1, 2025 [SECTION] Section 1: Credit Bureau Responsibilities [SECTION] Credit reporting agencies must correct errors on your report within 30 days. If they fail to do so, they may face penalties under state and federal law, including fines and required corrections. [SECTION] Section 2: How to File a Dispute [SECTION] Follow these steps to ensure your dispute is processed correctly: [SECTION] 1. Get your credit reports from all three bureaus using AnnualCreditReport.com or Form CR-REQUEST-2025 [SECTION] 2. Complete Form CA-DISPUTE-9, available on the DFPI website, including: [SECTION] • Your personal information in Section 3A [SECTION] • Details about each error in Section 4B [SECTION] • Supporting documents like bank statements [SECTION] 3. Submit your dispute online through the secure portal or by certified mail [SECTION] Section 3: What Happens Next [SECTION] After you file: [SECTION] 1. You'll receive a confirmation letter within 5 business days [SECTION] 2. The bureau will investigate and send results within 30 days [SECTION] 3. 
They must either correct the error, verify it's accurate, or remove the item [SECTION] Section 4: If the Error Isn't Fixed [SECTION] If the bureau doesn't correct a verified error: [SECTION] 1. File a complaint with the DFPI within 60 days using Form DFPI-CR-7 [SECTION] 2. Contact the Federal Trade Commission for assistance [SECTION] 3. You may have the right to take legal action [SECTION] Section 5: Getting Help Quickly [SECTION] For urgent situations like mortgage applications: [SECTION] • Call the DFPI Dispute Hotline at 1-800-555-REPORT (option 2) [SECTION] • Submit Form CR-EXPEDITE with proof of urgency [SECTION] • Visit a DFPI office by appointment [SECTION] Note: There may be a $25 fee if you dispute the same item more than twice. Medical debt disputes require extra documentation.[SECTION] California Credit Reporting Compliance Policy – FCRA Section 605 Enforcement [SECTION] Document Ref: CRCP-2025-09 / Effective Immediately [SECTION] Section 1: FCRA Violation Penalties Under California Law [SECTION] The Fair Credit Reporting Act (FCRA) violations in California are subject to the following penalties: [SECTION] - **Incorrect Reporting (FCRA §605(a)):** $2,500 per violation, with additional civil penalties up to $10,000 for willful non-compliance. [SECTION] - **Failure to Investigate Disputes (FCRA §611):** Mandatory $1,000 penalty per unresolved dispute, plus actual damages if litigation ensues. [SECTION] - **Unauthorized Access (FCRA §604):** Statutory damages of $3,000 per instance, plus potential criminal charges under California Penal Code 502(c). [SECTION] Section 2: Mandatory Corrective Actions [SECTION] Upon identification of an FCRA violation, regulated entities must: [SECTION] 1. Submit Form CR-25 (Credit Reporting Correction Notice) to the California Department of Financial Protection and Innovation (DFPI) within 5 business days. [SECTION] 2. 
Provide corrected information to all affected consumers via certified mail (Form CR-30) within 10 business days. [SECTION] 3. File an attestation of compliance (Form CR-35) with the CFPB within 15 business days, including: [SECTION] - Copies of corrected consumer reports [SECTION] - Proof of consumer notification [SECTION] - Internal audit documentation [SECTION] Section 3: Consumer Eligibility for Remedies [SECTION] Consumers may file claims if they meet these criteria: [SECTION] - Demonstrated financial harm via bank statements showing denied credit applications or increased interest rates. [SECTION] - Timely submission of Form CR-40 (Consumer Dispute Affidavit) within 60 days of violation discovery. [SECTION] - Documentation of at least two unsuccessful dispute attempts with the credit bureau (retain copies of Form CR-45). [SECTION] Section 4: Enforcement Timeline [SECTION] - **Immediate Actions (0-5 days):** Credit bureaus must place fraud alerts or security freezes upon request (Form CR-50). [SECTION] - **Investigation Phase (30 days):** Regulated entities must complete dispute investigations per FCRA §611(a)(1). [SECTION] - **Remediation Deadline (45 days):** All corrections must be reflected in consumer reports by this date. [SECTION] Section 5: Consequences of Non-Compliance [SECTION] Failure to adhere to these requirements results in: [SECTION] - Automatic referral to the California Attorney General's Office for enforcement action. [SECTION] - Suspension of credit reporting privileges for 90 days (extendable to 180 days for repeat violations). [SECTION] - Mandatory participation in quarterly FCRA compliance audits for two years. 
[SECTION] Section 6: Contact Information [SECTION] For compliance questions or dispute submissions: [SECTION] DFPI Credit Reporting Division: 1-800-555-7890 [SECTION] Email: cr.compliance@dfpi.ca.gov [SECTION] Overnight Mail: DFPI – CR Unit, 1500 11th Street, Sacramento, CA 95814[SECTION] California Credit Reporting Policy – 2025 Consumer Rights Update [SECTION] Document Ref: CCRP-2025-09 / Effective March 2025 [SECTION] Section 1: Requesting Your Credit Report [SECTION] California residents may request free credit reports under the following conditions: [SECTION] - **Annual Request**: One free report per year from each major bureau (Equifax, Experian, TransUnion) via AnnualCreditReport.com or Form CR-22. [SECTION] - **Additional Requests**: Free reports are available if: [SECTION] * You’ve been denied credit, employment, or housing within the last 60 days (submit denial letter with Form CR-25) [SECTION] * You’re a victim of identity theft (submit police report or FTC affidavit with Form CR-28) [SECTION] * You’re unemployed and plan to seek employment within 90 days (submit unemployment verification with Form CR-30) [SECTION] Section 2: Disputing Errors [SECTION] To dispute inaccuracies on your credit report: [SECTION] 1. **Document the Error**: Identify the incorrect item and gather supporting evidence (e.g., bank statements, payment confirmations). [SECTION] 2. **Submit Dispute**: File online through the credit bureau’s dispute portal or mail Form CR-40 with: [SECTION] * Copy of your credit report with errors circled [SECTION] * Proof of identity (CA driver’s license or state ID) [SECTION] * Supporting documentation [SECTION] 3. **Await Investigation**: Bureaus must respond within 30 days (45 days if submitting additional evidence later). 
[SECTION] Section 3: HIPAA-Related Medical Debt Reporting [SECTION] Under HIPAA Privacy Rule §164.528, medical debt reporting must comply with: [SECTION] - **Consent Requirement**: Providers must obtain written consent (Form HIPAA-15) before reporting medical debt to credit bureaus. [SECTION] - **Dispute Process**: If medical debt appears without consent, submit Form CR-45 to the bureau with a copy of the unsigned HIPAA-15. [SECTION] - **Removal Timeline**: Unauthorized medical debt must be removed within 5 business days of dispute receipt. [SECTION] Section 4: Contact Information [SECTION] For credit report assistance in California: [SECTION] - **Equifax**: 1-800-685-1111 | PO Box 740241, Atlanta, GA 30374 [SECTION] - **Experian**: 1-888-397-3742 | PO Box 4500, Allen, TX 75013 [SECTION] - **TransUnion**: 1-800-916-8800 | PO Box 2000, Chester, PA 19016 [SECTION] - **CA Attorney General’s Office**: 1-800-952-5225 | credit.reports@doj.ca.gov [SECTION] - **HIPAA Complaints**: Submit Form OCR-200 to the U.S. Department of Health and Human Services [SECTION] Section 5: Penalties for Non-Compliance [SECTION] Violations of credit reporting laws may result in: [SECTION] - **Bureau Penalties**: $2,500 per willful violation under the FCRA [SECTION] - **Provider Penalties**: Up to $50,000 per HIPAA violation for unauthorized medical debt reporting [SECTION] - **Consumer Remedies**: Actual damages plus attorney fees for successful lawsuits [SECTION] Note: Retain copies of all correspondence for at least 3 years. 
Dispute status can be checked online or by calling the bureau’s toll-free number.", llm_fn=my_llm, reprompting_config=base_config, user_instructions=["use the letter e only", "do not use the letter e"] ) + print_result("No System Prompt Test", result) assert "best_response" in result @pytest.mark.integration def test_with_system_prompt(my_llm, base_config): + """Test behavior when a system prompt is explicitly provided.""" result = run_reprompting_pipeline( - user_query="What's the policy?", - context="Context for telemetry disabled run", + user_query="Can I dispute my background check results if they’re wrong?", + context="[SECTION] California Employment Background Check Dispute Process – FCRA Compliance [SECTION] Effective: July 2025 / Policy Ref: BGC-2025-07 [SECTION] This document outlines the formal dispute procedure for inaccurate background check reports under Fair Credit Reporting Act (FCRA) §611 and California Civil Code §1786.16. [SECTION] SECTION 1: ELIGIBILITY REQUIREMENTS [SECTION] You may dispute your background check report if: [SECTION] - The report contains factual errors (e.g., incorrect employment dates, misattributed criminal records, or expired violations) [SECTION] - The disputed information falls within FCRA's 7-year reporting period (10 years for positions with salaries exceeding $125,000 annually) [SECTION] - You have not previously disputed the same item within the past 12 months [SECTION] SECTION 2: DISPUTE SUBMISSION PROCESS [SECTION] Step 1: Documentation Preparation [SECTION] - Complete Form BGC-2025-D (Dispute Request) with: [SECTION] * Notarized statement of inaccuracy [SECTION] * Supporting evidence (pay stubs, court disposition forms, or government-issued ID) [SECTION] * For healthcare positions: current medical license (Form MC-114) and malpractice insurance verification [SECTION] Step 2: Submission Methods [SECTION] - Secure Online Portal: Upload documents at bgcdispute.ca.gov (Case Type 45B) [SECTION] - Certified Mail: Send to 
California Background Check Bureau, PO Box 980, Sacramento, CA 95812 [SECTION] - In-Person: Submit at any California Department of Consumer Affairs office with appointment (Form APT-2025 required) [SECTION] Step 3: Processing Timeline [SECTION] - Acknowledgement issued within 3 business days via Form BGC-2025-R [SECTION] - Investigation completed within 30 calendar days (45 days for complex cases involving multiple jurisdictions) [SECTION] - Corrected reports distributed to you and requesting employer within 5 business days of resolution [SECTION] SECTION 3: CONSEQUENCES OF NON-COMPLIANCE [SECTION] - Incomplete submissions will be rejected and require re-filing (Form BGC-2025-RF) with $25 processing fee [SECTION] - Frivolous disputes (more than 3 in 12 months) may result in 6-month filing suspension [SECTION] - Employment applications may be automatically denied if dispute isn't resolved before employer's decision deadline [SECTION] SECTION 4: EXAMPLE SCENARIOS [SECTION] Example 1: A report lists a dismissed misdemeanor from 2022. Submit Form BGC-2025-D with court dismissal paperwork (Form CR-180) for automatic removal. [SECTION] Example 2: For incorrect drug test results, include lab retest documentation (Form DT-114) and chain-of-custody records. [SECTION] SECTION 5: CONTACT INFORMATION [SECTION] For assistance: [SECTION] - Phone: 1-800-555-1234 (Monday-Friday, 8 AM to 5 PM PST) [SECTION] - Email: bgc.disputes@ca.gov (Response within 2 business days) [SECTION] - In-Person: Schedule appointments using Form APT-2025 at approved locations [SECTION] Note: All disputes are subject to audit under FCRA §609 and may require additional verification. 
Fraudulent submissions may result in legal action under California Penal Code §532.", llm_fn=my_llm, reprompting_config=base_config, user_instructions=["use the letter e only", "do not use the letter e", "use a neutral tone"], system_prompt="this is a system prompt" ) + print_result("With System Prompt Test", result) assert "best_response" in result assert "telemetry" in result assert "summary" in result From e7a4410de11e8bdd45998adcfbd449549587d9f8 Mon Sep 17 00:00:00 2001 From: ashah-aanya Date: Wed, 30 Jul 2025 10:49:23 -0700 Subject: [PATCH 03/10] cleaning up toxicity visibility in telemetry (scores, response_feedback). updating residual error score calculation to have no penalty for adhered instructions (follow probability >= 0.5) as per Alex's recommendation. also updated the description of a test case --- aimon/reprompting_api/pipeline.py | 7 +-- aimon/reprompting_api/telemetry.py | 2 +- .../tests/test_reprompting_cases.py | 6 +-- aimon/reprompting_api/utils.py | 44 +++++++++---------- 4 files changed, 29 insertions(+), 30 deletions(-) diff --git a/aimon/reprompting_api/pipeline.py b/aimon/reprompting_api/pipeline.py index 61de0e8..13cd7f1 100644 --- a/aimon/reprompting_api/pipeline.py +++ b/aimon/reprompting_api/pipeline.py @@ -1,7 +1,7 @@ from aimon.reprompting_api.config import RepromptingConfig, StopReasons from aimon.reprompting_api.telemetry import TelemetryLogger from aimon.reprompting_api.reprompter import Reprompter -from aimon.reprompting_api.utils import toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score +from aimon.reprompting_api.utils import toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score, get_failed_toxicity_instructions from aimon import Detect import time import random @@ -326,9 +326,10 @@ def get_response_feedback(self, result): """ scores = { "groundedness": result.detect_response.groundedness.get("score", 0.0), - "instruction_adherence": 
result.detect_response.instruction_adherence.get("score", 0.0) + "instruction_adherence": result.detect_response.instruction_adherence.get("score", 0.0), + "toxicity": result.detect_response.toxicity.get("score", 0.0) } - feedback = get_failed_instructions(result) + feedback = get_failed_instructions(result) + get_failed_toxicity_instructions(result) return scores, feedback def _build_corrective_prompt(self, payload, result): diff --git a/aimon/reprompting_api/telemetry.py b/aimon/reprompting_api/telemetry.py index a301308..1d408c2 100644 --- a/aimon/reprompting_api/telemetry.py +++ b/aimon/reprompting_api/telemetry.py @@ -51,7 +51,7 @@ def emit( "residual_error": residual_error, "failed_instructions_count": failed_instructions_count, "stop_reason": stop_reason, - "promp_template": prompt, + "prompt_template": prompt, "response_text": response_text, } self.memory_store.append(telemetry) diff --git a/aimon/reprompting_api/tests/test_reprompting_cases.py b/aimon/reprompting_api/tests/test_reprompting_cases.py index 89587b6..5dfa5ea 100644 --- a/aimon/reprompting_api/tests/test_reprompting_cases.py +++ b/aimon/reprompting_api/tests/test_reprompting_cases.py @@ -88,7 +88,7 @@ def config_iteration_limit(): return_telemetry=True, return_aimon_summary=True, application_name="api_test", - max_iterations=-1, + max_iterations=1, ) # --- Helper to print results nicely --- @@ -132,7 +132,7 @@ def test_latency_limit(my_llm, config_high_latency): @pytest.mark.integration def test_iteration_limit(my_llm, config_iteration_limit): - """Test behavior when max_iterations is unlimited (-1).""" + """Test behavior when max_iterations is 1.""" result = run_reprompting_pipeline( user_query="Our systems are showing vulnerability alerts but we can't find the patch file in the vendor portal. 
What should we do?", system_prompt="Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or emojis)", @@ -141,7 +141,7 @@ def test_iteration_limit(my_llm, config_iteration_limit): reprompting_config=config_iteration_limit, user_instructions=["do not use the letter e","only use the letter e"] ) - print_result("Iteration Limit Test (-1 = unlimited)", result) + print_result("Iteration Limit Test (no re-prompting, only 1 iteration allowed)", result) assert "best_response" in result @pytest.mark.integration diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py index 380ef87..e1144a8 100644 --- a/aimon/reprompting_api/utils.py +++ b/aimon/reprompting_api/utils.py @@ -133,33 +133,30 @@ def get_residual_error_score(result): Compute a normalized residual error score (0–1) based on: - Groundedness follow probabilities - Instruction adherence follow probabilities - - Toxicity failures (adds a strong penalty) + - Toxicity (inverted: 1 - follow_probability) Logic: - 1. Compute a base penalty using groundedness & adherence: - - Each instruction's penalty = (1 - p), doubled if p < 0.5. - - Average across all instructions for a base score. - 2. Add a flat toxicity penalty (+0.3) if any toxicity failures exist. - 3. Clamp the final score to [0,1]. - - Args: - result: AIMon detection result with `instruction_adherence`, `groundedness`, and `toxicity` sections. - - Returns: - float: Residual error score (0 = perfect, 1 = worst). The float is rounded to two decimal places. + 1. Collect follow probabilities for groundedness & adherence. + 2. For toxicity, use 1 - follow_probability (since high follow = low error). + 3. Compute a penalized average using the helper. + 4. Clamp the final score to [0,1]. 
""" - combined_probs = [ - item["follow_probability"] - for source in ["groundedness", "instruction_adherence"] - for item in getattr(result.detect_response, source, {}).get("instructions_list", []) - ] - base_penalty = penalized_average(combined_probs) if combined_probs else 0.0 + combined_probs = [] - toxicity_penalty = 0.3 if _count_toxicity_failures(result) > 0 else 0.0 + for source in ["groundedness", "instruction_adherence"]: + combined_probs.extend([ + item["follow_probability"] + for item in getattr(result.detect_response, source, {}).get("instructions_list", []) + ]) - residual_error_score = base_penalty + toxicity_penalty - residual_error_score = min(1.0, max(0.0, residual_error_score)) + # For toxicity, invert the follow probability + combined_probs.extend([ + 1 - item["follow_probability"] + for item in getattr(result.detect_response, "toxicity", {}).get("instructions_list", []) + ]) + residual_error_score = penalized_average(combined_probs) if combined_probs else 0.0 + residual_error_score = min(1.0, max(0.0, residual_error_score)) return round(residual_error_score, 2) @@ -167,7 +164,8 @@ def penalized_average(probs: List[float]) -> float: """ Compute a penalized average of follow probabilities. - Penalizes probabilities <0.5 more heavily by doubling their penalty. + Penalizes probabilities <0.5 more heavily by doubling their penalty. + Probabilities > 0.5 (passed instructions) recieve no penalty Args: probs (List[float]): A list of follow probabilities. @@ -178,7 +176,7 @@ def penalized_average(probs: List[float]) -> float: penalties = [] for p in probs: if p >= 0.5: - penalty = 1 - p + penalty = 0 else: penalty = (1 - p) * 2 # heavier penalty penalties.append(penalty) From dc068fded5c149008a5ea222fd8d7d8f47304a2c Mon Sep 17 00:00:00 2001 From: ashah-aanya Date: Wed, 30 Jul 2025 11:39:04 -0700 Subject: [PATCH 04/10] updating the success test case to include complete context and system prompt. 
--- .../reprompting_api/tests/test_reprompting_success.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/aimon/reprompting_api/tests/test_reprompting_success.py b/aimon/reprompting_api/tests/test_reprompting_success.py index 10cc66f..5a6d2e6 100644 --- a/aimon/reprompting_api/tests/test_reprompting_success.py +++ b/aimon/reprompting_api/tests/test_reprompting_success.py @@ -64,14 +64,13 @@ def test_successful_run(): ) logger.info("[Pipeline] Config prepared.") - user_query = "what are the drug tiers?" - context = "[SECTION] 📘 BlueShield Rx Policy Addendum: 2023–2025 ... (truncated for brevity)" + user_query = "What are the drug tiers?" + context = "[SECTION] 📘 BlueShield Rx Policy Addendum: 2023–2025 Commercial & Employer-Sponsored Plans [SECTION] Confidential – Not for external dissemination without compliance review. [SECTION] 🔹 Section 2.1.7 – Drug Coverage Eligibility Matrix [SECTION] Prescription drug eligibility is governed by a tiered, multi-variant benefit design informed by annual P&T Committee decisions, manufacturer rebates, CMS Part D benchmarking (when applicable), and employer-specific customizations. The following formulary tiers apply unless superseded by a group rider or conditional override: [SECTION] - **Tier 1 (Generic Core):** Includes FDA-approved AB-rated generics; requires no PA or ST, unless the member is flagged under the Risk Management Tier Hold (RMTH) protocol due to prior misuse. [SECTION] - **Tier 2 (Preferred Brand & Enhanced Generics):** Coverage dependent on documented trial/failure of Tier 1 alternatives unless contraindicated. Members in the Legacy Bridge plan must obtain both prescriber attestation and pharmacy alignment verification. [SECTION] - **Tier 3 (Non-Preferred & Specialty Entry):** May require dual-layer review if member has not met chronic condition enrollment criteria (CCE) in the last benefit year. Tier migration possible mid-cycle based on new formulary rules. 
[SECTION] - **Tier 4 (Specialty Injectables, Biologics, and Condition-Limited Agents):** Includes drugs subject to clinical pathway alignment; claims must be adjudicated through the PBM’s split-fulfillment logic unless the prescribing entity is credentialed as Tier 4-A. [SECTION] 🚫 Exception: Certain biosimilars classified under Tier 4 in national formularies may be covered at Tier 2 if dispensed under limited-distribution contracts, provided the prescribing facility participates in the 340B program **and** the member is flagged under Enhanced Affordability Priority (EAP). [SECTION] 🔁 **Prior Authorization (PA) Layering Logic** [SECTION] Drugs requiring PA are subject to a three-stage filter: [SECTION] 1. **Therapeutic Criteria Review (TCR)** – Clinical alignment with diagnosis and formulary path. [SECTION] 2. **Coverage Policy Sync (CPS)** – Matches requested use with plan sponsor coverage schema. [SECTION] 3. **Utilization Watch Flag (UWF)** – If triggered, a third-party medical director review is initiated (adds 2–4 business days). [SECTION] 💡 Exemplar: *Trulicity* (GLP-1 receptor agonist) [SECTION] - **Base Tier:** Tier 3 across most commercial plans [SECTION] - **Override Possibility:** Auto-lifts to Tier 2 under Metabolic Risk Bundling if member is concurrently enrolled in cardiac risk management AND insulin titration modules. [SECTION] - **Caveat:** Auto-injector version may still trigger UWF if prescribed without 90-day adherence documentation to metformin or contraindication to semaglutide. [SECTION] 🗂️ **Adjudication Complexity Notes** [SECTION] - Fill attempts at non-network or out-of-state pharmacies may default to full retail pricing, even if coverage is active. [SECTION] - Certain maintenance tier drugs can only be filled at 90-day intervals after two successful 30-day fills unless dispensed via SmartSync (auto-align refill system). 
[SECTION] - Claims using discount cards (e.g., manufacturer copay assistance) will not count toward deductible or out-of-pocket limits unless the pharmacy submits a Coordinated Adjudication Adjustment Request (CAAR). [SECTION] ⚠️ **Denials & Appeals** [SECTION] - If PA is denied, appeals must cite new clinical rationale. Re-submission of identical documentation will be auto-denied. [SECTION] - Members in Tier Restructuring Delay (TRD) periods due to employer override cannot file external appeals unless the drug is life-sustaining and not replaceable under Tier 1/2. [SECTION] - Denials on non-formulary drugs are not eligible for Tier Transition Program (TTP) unless covered during prior plan year with no lapse in coverage >30 days. [SECTION] 📊 **Plan Differences** [SECTION] - Standard, Enhanced, Platinum, and Concierge tiers each have different deductible-accumulation thresholds and copay structures. [SECTION] - For Platinum+ plans, Tier 3 copay is waived on first-time fills initiated post-discharge from an inpatient episode if coded using post-acute NDCs. [SECTION] 📣 Misc. Clarifications [SECTION] - The “Healthy Living Rewards” program, mentioned in new member packets, does not affect coverage or drug tier placement. It is a wellness initiative only. [SECTION] - Benefit year resets on Jan 1, but tier realignment occurs quarterly and may retroactively affect claims filled in the trailing 45-day buffer period. [SECTION] 🔒 REMINDER: Member Services guidance may reflect outdated tier assignments if formulary refreshes are in progress. Online lookup tools update in real time and take precedence during adjudication disputes." + system_prompt = "You are a knowledgeable but approachable healthcare benefits assistant. Your role is to help users understand BlueShield prescription drug policies by explaining terms and tiers in simple, clear, and user‑friendly language. Always prioritize accuracy and clarity over technical jargon." 
user_instructions = [ "Avoid overly technical or robotic phrasing; keep the tone human and accessible.", "Ensure the response is direct and professional, with minimal informal tone.", - "Translate or simplify technical details from the context into accurate, user-friendly explanations.", - "Don't use the letter e", - "only use the letter e" + "Translate or simplify technical details from the context into accurate, user-friendly explanations." ] logger.info(f"[Pipeline] User query: {user_query}") logger.info(f"[Pipeline] Context: {context[:100]}...") @@ -81,7 +80,7 @@ def test_successful_run(): result = run_reprompting_pipeline( llm_fn=my_llm, user_query=user_query, - system_prompt="here is a system prompt", + system_prompt= system_prompt, context=context, user_instructions=user_instructions, reprompting_config=config From c2bac4bc461492121c9085af32b7ab4e1fe69d78 Mon Sep 17 00:00:00 2001 From: Aanya Shah <66803590+ashah-aanya@users.noreply.github.com> Date: Wed, 30 Jul 2025 13:25:59 -0700 Subject: [PATCH 05/10] Adding a Colab notebook to guide re-prompting pipeline integration and construct an example reprompting flow. 
--- re_prompting_pipeline_demo.ipynb | 450 +++++++++++++++++++++++++++++++ 1 file changed, 450 insertions(+) create mode 100644 re_prompting_pipeline_demo.ipynb diff --git a/re_prompting_pipeline_demo.ipynb b/re_prompting_pipeline_demo.ipynb new file mode 100644 index 0000000..390dda9 --- /dev/null +++ b/re_prompting_pipeline_demo.ipynb @@ -0,0 +1,450 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyOW+bJJBTfcLjz4nzXbAVpZ", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Automated Re-Prompting Pipeline Demo\n", + "This notebook walks through how to use AIMon's re-prompting pipeline to refine LLM outputs.\n", + "\n", + "With this pipeline, you can:\n", + "\n", + "* Unlock **GPT‑4o‑level performance** using lightweight 3–7B parameter models\n", + "* Boost **instruction adherence by ~22%**\n", + "* Minimize hallucinations while keeping responses fast\n", + "\n", + "**How it works:**\n", + "\n", + "![repromptingpipeline.png]()\n", + "\n", + "1. **Generate:** A base LLM produces an initial response.\n", + "2. **Detect:** AIMon’s IFE model evaluates the response for instruction adherence, hallucination, and toxicity.\n", + "3. **Re‑prompt:** If issues are detected, the pipeline generates a targeted corrective prompt.\n", + "4. **Refine:** The LLM revises its answer using this feedback, repeating until criteria are met or specified limits are reached.\n", + "\n", + "\n", + "**Goal:** \n", + "By the end, you'll know how to integrate the re‑prompting pipeline into your own workflows and tune it for different performance/quality needs. In this demo, we will use TogetherAI to call Mistral 7B. 
However, the pipeline is model‑ and framework‑agnostic so it is easy to apply to any black‑box LLM and integrate with your existing retrieval stack." + ], + "metadata": { + "id": "oFQnzqtuGDSV" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Prerequisites" + ], + "metadata": { + "id": "hORx_VHbKmt_" + } + }, + { + "cell_type": "markdown", + "source": [ + "1. Get your API keys\n", + "\n", + " AIMon: Instructions [available here](https://docs.aimon.ai/quickstart#1-api-key)\n", + "\n", + " TogetherAI: Get your TogetherAI API Key [available here](https://www.together.ai/)\n", + "\n", + " Once you have obtained these keys, configure your `AIMON_API_KEY` and `TOGETHER_API_KEY` in Google Colab secrets and give the notebook access to them. We will use TogetherAI to facilitate LLM calls and AIMon to evaluate instruction adherence, groundedness, and toxicity of generated outputs to fuel iterative improvement." + ], + "metadata": { + "id": "Sh8290AOKvry" + } + }, + { + "cell_type": "markdown", + "source": [ + "2. Install the dependencies." + ], + "metadata": { + "id": "1aST8279LbKq" + } + }, + { + "cell_type": "code", + "source": [ + "%%capture\n", + "!pip install together --quiet\n", + "%cd /content\n", + "!rm -rf aimon-python-sdk\n", + "!git clone -b reprompting-api https://github.com/aimonlabs/aimon-python-sdk.git\n", + "%cd aimon-python-sdk\n", + "!pip install -e ." + ], + "metadata": { + "id": "0AYy5Ru2L18p" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "3. Load the API keys into environment variables using the following cell." 
+ ], + "metadata": { + "id": "gOn6QGhJNMOs" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "# Import Colab Secrets userdata module.\n", + "from google.colab import userdata\n", + "\n", + "os.environ['TOGETHER_API_KEY'] = userdata.get('TOGETHER_API_KEY')\n", + "os.environ['AIMON_API_KEY'] = userdata.get('AIMON_API_KEY')" + ], + "metadata": { + "id": "pCgtl98mNL_D" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Step 1: Set up your LLM function\n", + "\n", + "To use the re‑prompting pipeline, you must provide your own Callable LLM function. This function acts as the connector between the pipeline and any black‑box model (e.g., TogetherAI, OpenAI, Anthropic, or a local model). In this case we are using Mistral7B via TogetherAI.\n", + "\n", + "The function should take in the following parameters:\n", + "\n", + "* **`recommended_prompt_template` (string.Template):** the corrective prompt template generated by the pipeline.\n", + "* **`system_prompt` (str):** system-level instructions or guidelines for model behavior.\n", + "* **`context` (str):** the contextual information or reference material relevant to the query. This is typically passed from a retrieval step or knowledge base.\n", + "* **`user_query` (str):** the user's query.\n", + "\n", + "**Return value:**\n", + "Your function must return a single string containing the model's generated response.\n", + "\n", + "How it works:\n", + "\n", + "1. Receive the corrective prompt as a string.Template.\n", + "\n", + "2. Substitute placeholders (system_prompt, context, user_query) into the template. You can alternatively implement your own template or modify the provided one for more control.\n", + "\n", + "3. Send the filled prompt to your chosen model (e.g., TogetherAI, OpenAI, Anthropic, local model).\n", + "\n", + "4. 
Return the model’s response as plain text.\n", + "\n", + "> Try swapping `model` for any Together-hosted model (e.g., 'mistralai/Mistral-7B-Instruct-v0.2'). You can also replace the whole block with any LLM call (OpenAI, Claude, HuggingFace, etc.)\n" + ], + "metadata": { + "id": "MhWcamP7PKZZ" + } + }, + { + "cell_type": "code", + "source": [ + "from string import Template\n", + "from together import Together\n", + "\n", + "TOGETHER_API_KEY = os.environ.get(\"TOGETHER_API_KEY\")\n", + "client = Together(api_key=TOGETHER_API_KEY)\n", + "\n", + "def my_llm(recommended_prompt_template: Template, system_prompt, context, user_query) -> str:\n", + "\n", + " # substitute placeholders in the pipeline-provided template with appropriate values\n", + " filled_prompt = recommended_prompt_template.substitute(\n", + " system_prompt=system_prompt,\n", + " context=context,\n", + " user_query=user_query\n", + " )\n", + "\n", + " # replace this block with any LLM call you want. (OpenAI, Claude, HuggingFace, etc.)\n", + " response = client.chat.completions.create(\n", + " model=\"google/gemma-3n-E4B-it\", # this can be any Together-hosted model (e.g., 'mistralai/Mistral-7B-Instruct-v0.2')\n", + " messages=[{\"role\": \"user\", \"content\": filled_prompt}],\n", + " max_tokens=256, # increase for longer outputs\n", + " temperature=0 # raise for more creative outputs\n", + " )\n", + "\n", + " # extract and return a string output\n", + " output = response.choices[0].message.content\n", + " return output" + ], + "metadata": { + "id": "XHYBsQHCQyCD" + }, + "execution_count": 23, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Step 2: Set up AIMon's RepromptingConfig\n", + "\n", + "The RepromptingConfig object controls how the pipeline behaves. Here are some of the key parameters:\n", + "\n", + "* **`aimon_api_key` (str)** Your AIMon API key, used to call the Instruction‑Following Evaluation (IFE) model that scores and guides the LLM’s outputs. 
If not provided directly, the pipeline looks for the key in the AIMON_API_KEY environment variable.\n", + "\n", + "* **`return_telemetry` (bool)** If True, the pipeline returns a detailed JSON log of every iteration. This includes each draft response, feedback, and corrective prompt. Great for debugging or analyzing model behavior. Default False.\n", + "\n", + "* **`return_aimon_summary` (bool)** If True, the pipeline returns a short summary: \"[2 iterations, 0 failed instruction remaining].\" Default False.\n", + "\n", + "* **`max_iterations` (int)** Sets how many times the pipeline will attempt to refine the model’s output. For example, 3 means “1 initial response + up to 2 corrective re‑prompts.” 2-3 recommended. Default 2.\n", + "\n" + ], + "metadata": { + "id": "GZO6E9Nwf9sd" + } + }, + { + "cell_type": "code", + "source": [ + "from aimon.reprompting_api.config import RepromptingConfig\n", + "\n", + "config = RepromptingConfig(\n", + " aimon_api_key=os.getenv(\"AIMON_API_KEY\"),\n", + " return_telemetry=True,\n", + " return_aimon_summary=True,\n", + " max_iterations=3\n", + " )" + ], + "metadata": { + "id": "ZcsbXRnHixdC" + }, + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Step 3: Run the re-prompting pipeline" + ], + "metadata": { + "id": "SmvfimYSlSDT" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### 1. Define test inputs.\n", + "\n", + "Before running the pipeline, you need to provide:\n", + "\n", + "* **`user_query` (required):** The actual question or task for the model.\n", + "\n", + "* **`system_prompt`:** A high‑level role or behavior definition for the model.\n", + "\n", + "* **`context` (recommended):** Relevant background text for the model to reference (e.g., a document, policy, or knowledge base excerpt).\n", + "\n", + "* **`user_instructions` (recommended):** Specific, deterministic guidelines for how the response should be written. 
These are what AIMon uses to evaluate and iteratively improve the model’s output.\n", + "\n", + "> **Tip:** The more specific and deterministic your instructions, the more effective the re-prompting loop will be." + ], + "metadata": { + "id": "yn9PEGUrlz4Q" + } + }, + { + "cell_type": "code", + "source": [ + "user_query = \"All my loans are for graduate studies, at which date in the future are they forgiven?\"\n", + "system_prompt = \"You are a knowledgeable but approachable student loan advisor. Your role is to help borrowers understand complex federal repayment and forgiveness programs by breaking down policy details into clear, accurate, and accessible explanations. Always provide complete information necessary to answer the question, while maintaining a professional and neutral tone.\"\n", + "context = \"Income-Driven Repayment (IDR) plans help lower federal student loan payments based on income and family size, with four options available: REPAYE, PAYE, IBR, and ICR. REPAYE and PAYE require payments of 10% of discretionary income, with forgiveness after 20 years for undergraduate loans and 25 years for graduate loans. IBR varies between 10% and 15% based on when loans were taken, with forgiveness after 20 or 25 years, while ICR calculates payments as the lesser of 20% of discretionary income or a fixed 12-year repayment amount, with forgiveness after 25 years. Eligibility requires a verified FSA ID, recent income documentation, and annual recertification of income and family size; failure to recertify may increase payments and interest. Joint filers must include their spouse’s income unless separated, and only Direct Loans qualify unless others are consolidated. 
Private loans are ineligible, and borrowers in default must make satisfactory arrangements before enrolling.\"\n", + "user_instructions = [\n", + " \"Avoid overly technical or robotic phrasing; keep the tone human and accessible.\",\n", + " \"Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or emojis)\",\n", + " \"Response should not omit critical details needed to understand or answer the query\"\n", + "]" + ], + "metadata": { + "id": "Z4Qm_ME8nOKM" + }, + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### 2. Call run_reprompting_pipeline()\n", + "\n", + "\n", + "Now that the configuration and test inputs are ready, we can pass it along with our LLM function to the pipeline. This will:\n", + "\n", + "* Send the input to your chosen LLM.\n", + "\n", + "* Use AIMon’s detectors to evaluate instruction adherence, groundedness, and toxicity.\n", + "\n", + "* Automatically generate corrective prompts and iterate up to max_iterations." + ], + "metadata": { + "id": "OKSFSglArHpw" + } + }, + { + "cell_type": "code", + "source": [ + "from aimon.reprompting_api.runner import run_reprompting_pipeline\n", + "\n", + "result = run_reprompting_pipeline(\n", + " llm_fn=my_llm,\n", + " user_query=user_query,\n", + " system_prompt= system_prompt,\n", + " context=context,\n", + " user_instructions=user_instructions,\n", + " reprompting_config=config\n", + " )" + ], + "metadata": { + "id": "p7GtwY9Cra0e" + }, + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Step 4: View your improved generated text and telemetry!" 
+ ], + "metadata": { + "id": "w6xyeUFJrnho" + } + }, + { + "cell_type": "code", + "source": [ + "import json\n", + "\n", + "print(\"\\n=== SUMMARY ===\")\n", + "print(result.get(\"summary\"))\n", + "print(\"\\n=== BEST RESPONSE ===\")\n", + "print(result[\"best_response\"])\n", + "print(\"\\n=== TELEMETRY ===\")\n", + "print(json.dumps(result.get(\"telemetry\"), indent=2))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AVke0epgru1d", + "outputId": "3efbdcc5-bbc0-42e3-b8a1-f4fa5a75e3a3" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "=== SUMMARY ===\n", + "2 iterations, 0 failed instructions remaining\n", + "\n", + "=== BEST RESPONSE ===\n", + "Okay, I can help you understand the potential forgiveness timeline for your graduate student loans under the Income-Driven Repayment (IDR) plans.\n", + "\n", + "Here's a breakdown based on the IDR options available:\n", + "\n", + "* **REPAYE and PAYE:** These plans require you to pay 10% of your discretionary income. Forgiveness is granted after **20 years** for undergraduate loans and **25 years** for graduate loans.\n", + "* **IBR (Income-Based Repayment):** Payments range from 10% to 15% of your discretionary income, depending on when your loans were taken out. Forgiveness is granted after **20 or 25 years**, depending on the loan origination date.\n", + "* **ICR (Income-Contingent Repayment):** Payments are calculated as the lesser of 20% of your discretionary income or a fixed 12-year repayment amount. Forgiveness is granted after **25 years**.\n", + "\n", + "**To determine the specific forgiveness date for your loans, I need a little more information:**\n", + "\n", + "1. 
**What are the origination dates of your graduate loans?** This is crucial because it determines which IBR repayment calculation you fall under.\n", + "\n", + "=== TELEMETRY ===\n", + "[\n", + " {\n", + " \"iteration\": 1,\n", + " \"cumulative_latency_ms\": 49848.28877449036,\n", + " \"scores\": {\n", + " \"groundedness\": 1.0,\n", + " \"instruction_adherence\": 0.6666666666666666,\n", + " \"toxicity\": 0.0\n", + " },\n", + " \"response_feedback\": [\n", + " {\n", + " \"type\": \"instruction_adherence_failure\",\n", + " \"source\": \"instruction_adherence\",\n", + " \"instruction\": \"Response should not omit critical details needed to understand or answer the query\",\n", + " \"score\": 0.4688,\n", + " \"explanation\": \"The response omits key details such as eligibility requirements and the need for annual recertification, leaving important information out.\"\n", + " }\n", + " ],\n", + " \"residual_error\": 0.08,\n", + " \"failed_instructions_count\": 1,\n", + " \"stop_reason\": \"instructions_failed_continue_reprompting\",\n", + " \"prompt_template\": \"System:\\n${system_prompt}\\n\\nContext:\\n${context}\\n\\nUser Query:\\n${user_query}\",\n", + " \"response_text\": \"Okay, I can help you understand the potential forgiveness timeline for your graduate student loans under the Income-Driven Repayment (IDR) plans.\\n\\nHere's a breakdown based on the IDR options available:\\n\\n* **REPAYE and PAYE:** These plans require you to pay 10% of your discretionary income. Forgiveness is granted after **20 years** for undergraduate loans and **25 years** for graduate loans.\\n* **IBR (Income-Based Repayment):** Payments range from 10% to 15% of your discretionary income, depending on when your loans were taken out. Forgiveness is granted after **20 or 25 years**, depending on the loan origination date.\\n* **ICR (Income-Contingent Repayment):** Payments are calculated as the lesser of 20% of your discretionary income or a fixed 12-year repayment amount. 
Forgiveness is granted after **25 years**.\\n\\n**To determine the specific forgiveness date for your loans, I need a little more information:**\\n\\n1. **What are the origination dates of your graduate loans?** This is crucial because it determines which IBR repayment calculation you fall under (\"\n", + " },\n", + " {\n", + " \"iteration\": 2,\n", + " \"cumulative_latency_ms\": 105391.15405082703,\n", + " \"scores\": {\n", + " \"groundedness\": 1.0,\n", + " \"instruction_adherence\": 1.0,\n", + " \"toxicity\": 0.0\n", + " },\n", + " \"response_feedback\": [],\n", + " \"residual_error\": 0.0,\n", + " \"failed_instructions_count\": 0,\n", + " \"stop_reason\": \"all_instructions_adhered\",\n", + " \"prompt_template\": \"Original system prompt:\\n${system_prompt}\\n\\nRevise your previous response to this query:\\n${user_query}\\n\\nContext:\\n${context}\\n\\nPrevious response:\\nOkay, I can help you understand the potential forgiveness timeline for your graduate student loans under the Income-Driven Repayment (IDR) plans.\\n\\nHere's a breakdown based on the IDR options available:\\n\\n* **REPAYE and PAYE:** These plans require you to pay 10% of your discretionary income. Forgiveness is granted after **20 years** for undergraduate loans and **25 years** for graduate loans.\\n* **IBR (Income-Based Repayment):** Payments range from 10% to 15% of your discretionary income, depending on when your loans were taken out. Forgiveness is granted after **20 or 25 years**, depending on the loan origination date.\\n* **ICR (Income-Contingent Repayment):** Payments are calculated as the lesser of 20% of your discretionary income or a fixed 12-year repayment amount. Forgiveness is granted after **25 years**.\\n\\n**To determine the specific forgiveness date for your loans, I need a little more information:**\\n\\n1. **What are the origination dates of your graduate loans?** This is crucial because it determines which IBR repayment calculation you fall under (\\n\\nAlmost there. 
Just a few small fixes needed.\\n\\nFix the following:\\n\\n1. We are 53.12% confident that the following instruction was not followed:\\n\\u2192 Violated Instruction: \\\"Response should not omit critical details needed to understand or answer the query\\\"\\n\\u2192 Explanation: The response omits key details such as eligibility requirements and the need for annual recertification, leaving important information out.\\n\\n\\nPreserve correct content. Return only the revised output with no extra explanation.\\nYou did well on these instructions. It is important that you continue to follow these instructions:\\n- Avoid overly technical or robotic phrasing; keep the tone human and accessible.\\n- Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or emojis)\\n\",\n", + " \"response_text\": \"Okay, I can help you understand the potential forgiveness timeline for your graduate student loans under the Income-Driven Repayment (IDR) plans.\\n\\nHere's a breakdown based on the IDR options available:\\n\\n* **REPAYE and PAYE:** These plans require you to pay 10% of your discretionary income. Forgiveness is granted after **20 years** for undergraduate loans and **25 years** for graduate loans.\\n* **IBR (Income-Based Repayment):** Payments range from 10% to 15% of your discretionary income, depending on when your loans were taken out. Forgiveness is granted after **20 or 25 years**, depending on the loan origination date.\\n* **ICR (Income-Contingent Repayment):** Payments are calculated as the lesser of 20% of your discretionary income or a fixed 12-year repayment amount. Forgiveness is granted after **25 years**.\\n\\n**To determine the specific forgiveness date for your loans, I need a little more information:**\\n\\n1. 
**What are the origination dates of your graduate loans?** This is crucial because it determines which IBR repayment calculation you fall under.\"\n", + " }\n", + "]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Conclusion\n", + "\n", + "In this notebook, we demonstrated how to use AIMon’s Re‑prompting Pipeline to automatically improve LLM responses with targeted feedback. We covered:\n", + "\n", + "* Building a `my_llm` function to fill templated prompts and call any LLM.\n", + "\n", + "* Adjusting `RepromptingConfig` to control pipeline behavior.\n", + "\n", + "* Compiling test inputs, including `system_prompt`, `context`, `user_query`, and `user_instructions`.\n", + "\n", + "* Running the pipeline and reviewing the improved outputs.\n", + "\n", + "With these steps, you can integrate re‑prompting into your own workflows to make model outputs more accurate, consistent, and user‑aligned.\n", + "\n", + "For deeper insights into how re‑prompting improves instruction adherence across models and test cases, check out our blog post [link].\n", + "\n", + "To learn more about available configurations and advanced options, visit our documentation [link].\n", + "\n" + ], + "metadata": { + "id": "g7hpVaYWvkTQ" + } + } + ] +} \ No newline at end of file From 9b9c25a3c1c7be35141b96d0d25026e3890e7fe9 Mon Sep 17 00:00:00 2001 From: Aanya Shah <66803590+ashah-aanya@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:55:10 -0700 Subject: [PATCH 06/10] Delete re_prompting_pipeline_demo.ipynbo Deleting Colab notebook demo as it is already linked elsewhere --- re_prompting_pipeline_demo.ipynb | 450 ------------------------------- 1 file changed, 450 deletions(-) delete mode 100644 re_prompting_pipeline_demo.ipynb diff --git a/re_prompting_pipeline_demo.ipynb b/re_prompting_pipeline_demo.ipynb deleted file mode 100644 index 390dda9..0000000 --- a/re_prompting_pipeline_demo.ipynb +++ /dev/null @@ -1,450 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - 
"metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyOW+bJJBTfcLjz4nzXbAVpZ", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Automated Re-Prompting Pipeline Demo\n", - "This notebook walks through how to use AIMon's re-prompting pipeline to refine LLM outputs.\n", - "\n", - "With this pipeline, you can:\n", - "\n", - "* Unlock **GPT‑4o‑level performance** using lightweight 3–7B parameter models\n", - "* Boost **instruction adherence by ~22%**\n", - "* Minimize hallucinations while keeping responses fast\n", - "\n", - "**How it works:**\n", - "\n", - "![repromptingpipeline.png]()\n", - "\n", - "1. **Generate:** A base LLM produces an initial response.\n", - "2. **Detect:** AIMon’s IFE model evaluates the response for instruction adherence, hallucination, and toxicity.\n", - "3. **Re‑prompt:** If issues are detected, the pipeline generates a targeted corrective prompt.\n", - "4. **Refine:** The LLM revises its answer using this feedback, repeating until criteria are met or specified limits are reached.\n", - "\n", - "\n", - "**Goal:** \n", - "By the end, you'll know how to integrate the re‑prompting pipeline into your own workflows and tune it for different performance/quality needs. In this demo, we will use TogetherAI to call Mistral 7B. However, the pipeline is model‑ and framework‑agnostic so it is easy to apply to any black‑box LLM and integrate with your existing retrieval stack." - ], - "metadata": { - "id": "oFQnzqtuGDSV" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Prerequisites" - ], - "metadata": { - "id": "hORx_VHbKmt_" - } - }, - { - "cell_type": "markdown", - "source": [ - "1. 
Get your API keys\n", - "\n", - " AIMon: Instructions [available here](https://docs.aimon.ai/quickstart#1-api-key)\n", - "\n", - " TogetherAI: Get your TogetherAI API Key [available here](https://www.together.ai/)\n", - "\n", - " Once you obtained this keys, configure your `AIMON_API_KEY` and `TOGETHER_API_KEY` in Google Collab secrets and provide them notebook access. We will TogetherAI to facilitate LLM calls and AIMon to evaluate instruction adherence, groundedness, and toxicity of generated outputs to fuel iterative improvement." - ], - "metadata": { - "id": "Sh8290AOKvry" - } - }, - { - "cell_type": "markdown", - "source": [ - "2. Install the dependencies." - ], - "metadata": { - "id": "1aST8279LbKq" - } - }, - { - "cell_type": "code", - "source": [ - "%%capture\n", - "!pip install together --quiet\n", - "%cd /content\n", - "!rm -rf aimon-python-sdk\n", - "!git clone -b reprompting-api https://github.com/aimonlabs/aimon-python-sdk.git\n", - "%cd aimon-python-sdk\n", - "!pip install -e ." - ], - "metadata": { - "id": "0AYy5Ru2L18p" - }, - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "3. Load the API keys into environment variables using the following cell." - ], - "metadata": { - "id": "gOn6QGhJNMOs" - } - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "\n", - "# Import Colab Secrets userdata module.\n", - "from google.colab import userdata\n", - "\n", - "os.environ['TOGETHER_API_KEY'] = userdata.get('TOGETHER_API_KEY')\n", - "os.environ['AIMON_API_KEY'] = userdata.get('AIMON_API_KEY')" - ], - "metadata": { - "id": "pCgtl98mNL_D" - }, - "execution_count": 7, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Step 1: Set up your LLM function\n", - "\n", - "To use the re‑prompting pipeline, you must provide your own Callable LLM function. This function acts as the connector between the pipeline and any black‑box model (e.g., TogetherAI, OpenAI, Anthropic, or a local model). 
In this case we are using Mistral7B via TogetherAI.\n", - "\n", - "The function should take in the following parameters:\n", - "\n", - "* **`recommended_prompt_template` (string.Template):** the corrective prompt template generated by the pipeline.\n", - "* **`system_prompt` (str):** system-level instructions or guidelines for model behavior.\n", - "* **`context` (str):** the contextual information or reference material relevant to the query. This is typically passed from a retrieval step or knowledge base.\n", - "* **`user_query` (str):** the user's query.\n", - "\n", - "**Return value:**\n", - "Your function must return a single string containing the model's generated response.\n", - "\n", - "How it works:\n", - "\n", - "1. Receive the corrective prompt as a string.Template.\n", - "\n", - "2. Substitute placeholders (system_prompt, context, user_query) into the template. You can alternatively implement your own template or modify the provided one for more control.\n", - "\n", - "3. Send the filled prompt to your chosen model (e.g., TogetherAI, OpenAI, Anthropic, local model).\n", - "\n", - "4. Return the model’s response as plain text.\n", - "\n", - "> Try swapping `model` for any Together-hosted model (e.g., 'mistralai/Mistral-7B-Instruct-v0.2'). 
You can also replace the whole block with any LLM call (OpenAI, Claude, HuggingFace, etc.)\n" - ], - "metadata": { - "id": "MhWcamP7PKZZ" - } - }, - { - "cell_type": "code", - "source": [ - "from string import Template\n", - "from together import Together\n", - "\n", - "TOGETHER_API_KEY = os.environ.get(\"TOGETHER_API_KEY\")\n", - "client = Together(api_key=TOGETHER_API_KEY)\n", - "\n", - "def my_llm(recommended_prompt_template: Template, system_prompt, context, user_query) -> str:\n", - "\n", - " # substitute placeholders in the pipeline-provided template with appropriate values\n", - " filled_prompt = recommended_prompt_template.substitute(\n", - " system_prompt=system_prompt,\n", - " context=context,\n", - " user_query=user_query\n", - " )\n", - "\n", - " # replace this block with any LLM call you want. (OpenAI, Claude, HuggingFace, etc.)\n", - " response = client.chat.completions.create(\n", - " model=\"google/gemma-3n-E4B-it\", # this can be any Together-hosted model (e.g., 'mistralai/Mistral-7B-Instruct-v0.2')\n", - " messages=[{\"role\": \"user\", \"content\": filled_prompt}],\n", - " max_tokens=256, # increase for longer outputs\n", - " temperature=0 # raise for more creative outputs\n", - " )\n", - "\n", - " # extract and return a string output\n", - " output = response.choices[0].message.content\n", - " return output" - ], - "metadata": { - "id": "XHYBsQHCQyCD" - }, - "execution_count": 23, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Step 2: Set up AIMon's RepromptingConfig\n", - "\n", - "The RepromptingConfig object controls how the pipeline behaves. Here are some of the key parameters:\n", - "\n", - "* **`aimon_api_key` (str)** Your AIMon API key, used to call the Instruction‑Following Evaluation (IFE) model that scores and guides the LLM’s outputs. 
If not provided directly, the pipeline looks for the key in the AIMON_API_KEY environment variable.\n", - "\n", - "* **`return_telemetry` (bool)** If True, the pipeline returns a detailed JSON log of every iteration. This includes each draft response, feedback, and corrective prompt. Great for debugging or analyzing model behavior. Default False.\n", - "\n", - "* **`return_aimon_summary` (bool)** If True, the pipeline returns a short summary: \"[2 iterations, 0 failed instruction remaining].\" Default False.\n", - "\n", - "* **`max_iterations` (int)** Sets how many times the pipeline will attempt to refine the model’s output. For example, 3 means “1 initial response + up to 2 corrective re‑prompts.” 2-3 recommended. Default 2.\n", - "\n" - ], - "metadata": { - "id": "GZO6E9Nwf9sd" - } - }, - { - "cell_type": "code", - "source": [ - "from aimon.reprompting_api.config import RepromptingConfig\n", - "\n", - "config = RepromptingConfig(\n", - " aimon_api_key=os.getenv(\"AIMON_API_KEY\"),\n", - " return_telemetry=True,\n", - " return_aimon_summary=True,\n", - " max_iterations=3\n", - " )" - ], - "metadata": { - "id": "ZcsbXRnHixdC" - }, - "execution_count": 24, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Step 3: Run the re-prompting pipeline" - ], - "metadata": { - "id": "SmvfimYSlSDT" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### 1. Define test inputs.\n", - "\n", - "Before running the pipeline, you need to provide:\n", - "\n", - "* **`user_query` (required):** The actual question or task for the model.\n", - "\n", - "* **`system_prompt`:** A high‑level role or behavior definition for the model.\n", - "\n", - "* **`context` (recommended):** Relevant background text for the model to reference (e.g., a document, policy, or knowledge base excerpt).\n", - "\n", - "* **`user_instructions` (recommended):** Specific, deterministic guidelines for how the response should be written. 
These are what AIMon uses to evaluate and iteratively improve the model’s output.\n", - "\n", - "> **Tip:** The more specific and deterministic your instructions, the more effective the re-prompting loop will be." - ], - "metadata": { - "id": "yn9PEGUrlz4Q" - } - }, - { - "cell_type": "code", - "source": [ - "user_query = \"All my loans are for graduate studies, at which date in the future are they forgiven?\"\n", - "system_prompt = \"You are a knowledgeable but approachable student loan advisor. Your role is to help borrowers understand complex federal repayment and forgiveness programs by breaking down policy details into clear, accurate, and accessible explanations. Always provide complete information necessary to answer the question, while maintaining a professional and neutral tone.\"\n", - "context = \"Income-Driven Repayment (IDR) plans help lower federal student loan payments based on income and family size, with four options available: REPAYE, PAYE, IBR, and ICR. REPAYE and PAYE require payments of 10% of discretionary income, with forgiveness after 20 years for undergraduate loans and 25 years for graduate loans. IBR varies between 10% and 15% based on when loans were taken, with forgiveness after 20 or 25 years, while ICR calculates payments as the lesser of 20% of discretionary income or a fixed 12-year repayment amount, with forgiveness after 25 years. Eligibility requires a verified FSA ID, recent income documentation, and annual recertification of income and family size; failure to recertify may increase payments and interest. Joint filers must include their spouse’s income unless separated, and only Direct Loans qualify unless others are consolidated. 
Private loans are ineligible, and borrowers in default must make satisfactory arrangements before enrolling.\"\n", - "user_instructions = [\n", - " \"Avoid overly technical or robotic phrasing; keep the tone human and accessible.\",\n", - " \"Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or emojis)\",\n", - " \"Response should not omit critical details needed to understand or answer the query\"\n", - "]" - ], - "metadata": { - "id": "Z4Qm_ME8nOKM" - }, - "execution_count": 25, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### 2. Call run_reprompting_pipeline()\n", - "\n", - "\n", - "Now that the configuration and test inputs are ready, we can pass it along with our LLM function to the pipeline. This will:\n", - "\n", - "* Send the input to your chosen LLM.\n", - "\n", - "* Use AIMon’s detectors to evaluate instruction adherence, groundedness, and toxicity.\n", - "\n", - "* Automatically generate corrective prompts and iterate up to max_iterations." - ], - "metadata": { - "id": "OKSFSglArHpw" - } - }, - { - "cell_type": "code", - "source": [ - "from aimon.reprompting_api.runner import run_reprompting_pipeline\n", - "\n", - "result = run_reprompting_pipeline(\n", - " llm_fn=my_llm,\n", - " user_query=user_query,\n", - " system_prompt= system_prompt,\n", - " context=context,\n", - " user_instructions=user_instructions,\n", - " reprompting_config=config\n", - " )" - ], - "metadata": { - "id": "p7GtwY9Cra0e" - }, - "execution_count": 26, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Step 4: View your improved generated text and telemetry!" 
- ], - "metadata": { - "id": "w6xyeUFJrnho" - } - }, - { - "cell_type": "code", - "source": [ - "import json\n", - "\n", - "print(\"\\n=== SUMMARY ===\")\n", - "print(result.get(\"summary\"))\n", - "print(\"\\n=== BEST RESPONSE ===\")\n", - "print(result[\"best_response\"])\n", - "print(\"\\n=== TELEMETRY ===\")\n", - "print(json.dumps(result.get(\"telemetry\"), indent=2))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "AVke0epgru1d", - "outputId": "3efbdcc5-bbc0-42e3-b8a1-f4fa5a75e3a3" - }, - "execution_count": 27, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "=== SUMMARY ===\n", - "2 iterations, 0 failed instructions remaining\n", - "\n", - "=== BEST RESPONSE ===\n", - "Okay, I can help you understand the potential forgiveness timeline for your graduate student loans under the Income-Driven Repayment (IDR) plans.\n", - "\n", - "Here's a breakdown based on the IDR options available:\n", - "\n", - "* **REPAYE and PAYE:** These plans require you to pay 10% of your discretionary income. Forgiveness is granted after **20 years** for undergraduate loans and **25 years** for graduate loans.\n", - "* **IBR (Income-Based Repayment):** Payments range from 10% to 15% of your discretionary income, depending on when your loans were taken out. Forgiveness is granted after **20 or 25 years**, depending on the loan origination date.\n", - "* **ICR (Income-Contingent Repayment):** Payments are calculated as the lesser of 20% of your discretionary income or a fixed 12-year repayment amount. Forgiveness is granted after **25 years**.\n", - "\n", - "**To determine the specific forgiveness date for your loans, I need a little more information:**\n", - "\n", - "1. 
**What are the origination dates of your graduate loans?** This is crucial because it determines which IBR repayment calculation you fall under.\n", - "\n", - "=== TELEMETRY ===\n", - "[\n", - " {\n", - " \"iteration\": 1,\n", - " \"cumulative_latency_ms\": 49848.28877449036,\n", - " \"scores\": {\n", - " \"groundedness\": 1.0,\n", - " \"instruction_adherence\": 0.6666666666666666,\n", - " \"toxicity\": 0.0\n", - " },\n", - " \"response_feedback\": [\n", - " {\n", - " \"type\": \"instruction_adherence_failure\",\n", - " \"source\": \"instruction_adherence\",\n", - " \"instruction\": \"Response should not omit critical details needed to understand or answer the query\",\n", - " \"score\": 0.4688,\n", - " \"explanation\": \"The response omits key details such as eligibility requirements and the need for annual recertification, leaving important information out.\"\n", - " }\n", - " ],\n", - " \"residual_error\": 0.08,\n", - " \"failed_instructions_count\": 1,\n", - " \"stop_reason\": \"instructions_failed_continue_reprompting\",\n", - " \"prompt_template\": \"System:\\n${system_prompt}\\n\\nContext:\\n${context}\\n\\nUser Query:\\n${user_query}\",\n", - " \"response_text\": \"Okay, I can help you understand the potential forgiveness timeline for your graduate student loans under the Income-Driven Repayment (IDR) plans.\\n\\nHere's a breakdown based on the IDR options available:\\n\\n* **REPAYE and PAYE:** These plans require you to pay 10% of your discretionary income. Forgiveness is granted after **20 years** for undergraduate loans and **25 years** for graduate loans.\\n* **IBR (Income-Based Repayment):** Payments range from 10% to 15% of your discretionary income, depending on when your loans were taken out. Forgiveness is granted after **20 or 25 years**, depending on the loan origination date.\\n* **ICR (Income-Contingent Repayment):** Payments are calculated as the lesser of 20% of your discretionary income or a fixed 12-year repayment amount. 
Forgiveness is granted after **25 years**.\\n\\n**To determine the specific forgiveness date for your loans, I need a little more information:**\\n\\n1. **What are the origination dates of your graduate loans?** This is crucial because it determines which IBR repayment calculation you fall under (\"\n", - " },\n", - " {\n", - " \"iteration\": 2,\n", - " \"cumulative_latency_ms\": 105391.15405082703,\n", - " \"scores\": {\n", - " \"groundedness\": 1.0,\n", - " \"instruction_adherence\": 1.0,\n", - " \"toxicity\": 0.0\n", - " },\n", - " \"response_feedback\": [],\n", - " \"residual_error\": 0.0,\n", - " \"failed_instructions_count\": 0,\n", - " \"stop_reason\": \"all_instructions_adhered\",\n", - " \"prompt_template\": \"Original system prompt:\\n${system_prompt}\\n\\nRevise your previous response to this query:\\n${user_query}\\n\\nContext:\\n${context}\\n\\nPrevious response:\\nOkay, I can help you understand the potential forgiveness timeline for your graduate student loans under the Income-Driven Repayment (IDR) plans.\\n\\nHere's a breakdown based on the IDR options available:\\n\\n* **REPAYE and PAYE:** These plans require you to pay 10% of your discretionary income. Forgiveness is granted after **20 years** for undergraduate loans and **25 years** for graduate loans.\\n* **IBR (Income-Based Repayment):** Payments range from 10% to 15% of your discretionary income, depending on when your loans were taken out. Forgiveness is granted after **20 or 25 years**, depending on the loan origination date.\\n* **ICR (Income-Contingent Repayment):** Payments are calculated as the lesser of 20% of your discretionary income or a fixed 12-year repayment amount. Forgiveness is granted after **25 years**.\\n\\n**To determine the specific forgiveness date for your loans, I need a little more information:**\\n\\n1. **What are the origination dates of your graduate loans?** This is crucial because it determines which IBR repayment calculation you fall under (\\n\\nAlmost there. 
Just a few small fixes needed.\\n\\nFix the following:\\n\\n1. We are 53.12% confident that the following instruction was not followed:\\n\\u2192 Violated Instruction: \\\"Response should not omit critical details needed to understand or answer the query\\\"\\n\\u2192 Explanation: The response omits key details such as eligibility requirements and the need for annual recertification, leaving important information out.\\n\\n\\nPreserve correct content. Return only the revised output with no extra explanation.\\nYou did well on these instructions. It is important that you continue to follow these instructions:\\n- Avoid overly technical or robotic phrasing; keep the tone human and accessible.\\n- Keep the tone professionally neutral by avoiding emotionally charged words, exclamations, or informal phrases (e.g., awesome, ugh, or emojis)\\n\",\n", - " \"response_text\": \"Okay, I can help you understand the potential forgiveness timeline for your graduate student loans under the Income-Driven Repayment (IDR) plans.\\n\\nHere's a breakdown based on the IDR options available:\\n\\n* **REPAYE and PAYE:** These plans require you to pay 10% of your discretionary income. Forgiveness is granted after **20 years** for undergraduate loans and **25 years** for graduate loans.\\n* **IBR (Income-Based Repayment):** Payments range from 10% to 15% of your discretionary income, depending on when your loans were taken out. Forgiveness is granted after **20 or 25 years**, depending on the loan origination date.\\n* **ICR (Income-Contingent Repayment):** Payments are calculated as the lesser of 20% of your discretionary income or a fixed 12-year repayment amount. Forgiveness is granted after **25 years**.\\n\\n**To determine the specific forgiveness date for your loans, I need a little more information:**\\n\\n1. 
**What are the origination dates of your graduate loans?** This is crucial because it determines which IBR repayment calculation you fall under.\"\n", - " }\n", - "]\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Conclusion\n", - "\n", - "In this notebook, we demonstrated how to use AIMon’s Re‑prompting Pipeline to automatically improve LLM responses with targeted feedback. We covered:\n", - "\n", - "* Building a `my_llm` function to fill templated prompts and call any LLM.\n", - "\n", - "* Adjusting `RepromptingConfig` to control pipeline behavior.\n", - "\n", - "* Compiling test inputs, including `system_prompt`, `context`, `user_query`, and `user_instructions`.\n", - "\n", - "* Running the pipeline and reviewing the improved outputs.\n", - "\n", - "With these steps, you can integrate re‑prompting into your own workflows to make model outputs more accurate, consistent, and user‑aligned.\n", - "\n", - "For deeper insights into how re‑prompting improves instruction adherence across models and test cases, check out our blog post [link].\n", - "\n", - "To learn more about available configurations and advanced options, visit our documentation [link].\n", - "\n" - ], - "metadata": { - "id": "g7hpVaYWvkTQ" - } - } - ] -} \ No newline at end of file From 2f3d9f174231956e28fccb12ef7c751c06069b92 Mon Sep 17 00:00:00 2001 From: ashah-aanya Date: Thu, 31 Jul 2025 12:21:21 -0700 Subject: [PATCH 07/10] Resolving some of Preetam's comments by improving documentation / resolving typos --- aimon/reprompting_api/pipeline.py | 16 ++++++++++------ aimon/reprompting_api/runner.py | 20 +++++++------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/aimon/reprompting_api/pipeline.py b/aimon/reprompting_api/pipeline.py index 13cd7f1..2635001 100644 --- a/aimon/reprompting_api/pipeline.py +++ b/aimon/reprompting_api/pipeline.py @@ -92,15 +92,18 @@ def run(self, system_prompt: str, context: str, user_query: str, user_instructio Process: 1. 
Build an initial prompt with query, context, and instructions. 2. Call the LLM to generate a response. - 3. Evaluate the response with AIMon detectors. + 3. Evaluate the response with AIMon detectors for instruction adherence, toxicity, and groundedness. + Toxicity and groundedness are always evaluated. If user_instructions is empty or not provided, the + instruction adherence detector is not used. 4. If violations are found, iteratively generate corrective prompts and re-prompt the LLM. - 5. Stop when all instructions are followed or iteration limits are reached. + 5. Stop when all instructions are followed and the response has no hallucination or toxicity, or when the iteration or latency limit is reached. 6. Return the best response (lowest residual error) along with telemetry and a summary if configured. Args: - user_query (str): The user's query or instruction. - context (str): Contextual information to include in the prompt. - user_instructions (list[str]): Instructions the model must follow. + user_query (str): Must be a non-empty string. The user's query or instruction. + context (str): Contextual information to include in the prompt. May be an empty string, but providing it is recommended. + user_instructions (list[str]): Instructions the model must follow. May be an empty list, but providing them is highly recommended. + system_prompt (str): A high‑level role or behavior definition for the model. May be an empty string.
Returns: dict: @@ -130,7 +133,7 @@ def run(self, system_prompt: str, context: str, user_query: str, user_instructio curr_result = self._detect_aimon_response(curr_payload, self.config.feedback_model_max_retries) logger.debug(f"AIMon evaluation result: {curr_result}") - # Get scores and detailed feedback on failured instructions + # Get scores and detailed feedback on failed instructions scores, feedback = self.get_response_feedback(curr_result) self._record_iteration_output(iteration_outputs, iteration_num, curr_generated_text, curr_result) @@ -158,6 +161,7 @@ def run(self, system_prompt: str, context: str, user_query: str, user_instructio curr_prompt = self._build_corrective_prompt(curr_payload, curr_result) # Retry LLM call with corrective prompt + curr_generated_text = self._call_llm(curr_prompt, self.config.user_model_max_retries) curr_generated_text = self._call_llm(curr_prompt,self.config.user_model_max_retries, system_prompt, context, user_query) # Re-evaluate the new response curr_payload = self._build_aimon_payload(context, user_query, user_instructions, curr_generated_text, system_prompt) diff --git a/aimon/reprompting_api/runner.py b/aimon/reprompting_api/runner.py index 0116cca..4bb072f 100644 --- a/aimon/reprompting_api/runner.py +++ b/aimon/reprompting_api/runner.py @@ -31,19 +31,13 @@ def run_reprompting_pipeline( and `"[no context provided]"`) to ensure template consistency. Args: - llm_fn (Callable[[Template, str, str, str], str]): - A function to call the LLM. Must accept a prompt template (recommended_prompt_template), - `system_prompt`, `context`, and `user_query`. - user_query (str): - The user’s query. Must be a non-empty string. - system_prompt (str, optional): - A system-level instruction string. Defaults to `"[no system prompt provided]"` if None or empty. - context (str, optional): - Supplemental context for the LLM. Defaults to `"[no context provided]"` if None or empty. 
- user_instructions (List[str], optional): - A list of instructions for the model to follow. Defaults to an empty list. - reprompting_config (RepromptingConfig, optional): - Configuration object for controlling pipeline behavior. + llm_fn (Callable[[Template, str, str, str], str]): A function to call the LLM. Must accept a prompt template (recommended_prompt_template), + `system_prompt`, `context`, and `user_query`. + user_query (str): The user’s query. Must be a non-empty string. + system_prompt (str, optional): A system-level instruction string. Defaults to `"[no system prompt provided]"` if None or empty. + context (str, optional): Supplemental context for the LLM. Defaults to `"[no context provided]"` if None or empty. + user_instructions (List[str], optional): A list of instructions for the model to follow. Defaults to an empty list. + reprompting_config (RepromptingConfig, optional): Configuration object for controlling pipeline behavior. Returns: dict: A structured dictionary containing: From a7f56e86f85fbe62370620adc0956ab265b3696d Mon Sep 17 00:00:00 2001 From: ashah-aanya Date: Thu, 31 Jul 2025 12:22:54 -0700 Subject: [PATCH 08/10] moving the tests into https://github.com/aimonlabs/aimon-python-sdk/tree/main/tests --- aimon/reprompting_api/tests/_init_.py | 0 {aimon/reprompting_api/tests => tests}/test_reprompting_cases.py | 0 .../reprompting_api/tests => tests}/test_reprompting_failures.py | 0 .../reprompting_api/tests => tests}/test_reprompting_success.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 aimon/reprompting_api/tests/_init_.py rename {aimon/reprompting_api/tests => tests}/test_reprompting_cases.py (100%) rename {aimon/reprompting_api/tests => tests}/test_reprompting_failures.py (100%) rename {aimon/reprompting_api/tests => tests}/test_reprompting_success.py (100%) diff --git a/aimon/reprompting_api/tests/_init_.py b/aimon/reprompting_api/tests/_init_.py deleted file mode 100644 index e69de29..0000000 diff --git 
a/aimon/reprompting_api/tests/test_reprompting_cases.py b/tests/test_reprompting_cases.py similarity index 100% rename from aimon/reprompting_api/tests/test_reprompting_cases.py rename to tests/test_reprompting_cases.py diff --git a/aimon/reprompting_api/tests/test_reprompting_failures.py b/tests/test_reprompting_failures.py similarity index 100% rename from aimon/reprompting_api/tests/test_reprompting_failures.py rename to tests/test_reprompting_failures.py diff --git a/aimon/reprompting_api/tests/test_reprompting_success.py b/tests/test_reprompting_success.py similarity index 100% rename from aimon/reprompting_api/tests/test_reprompting_success.py rename to tests/test_reprompting_success.py From de5345c27a27bb9186ab455a5f00225868f0beff Mon Sep 17 00:00:00 2001 From: ashah-aanya Date: Thu, 31 Jul 2025 18:15:14 -0700 Subject: [PATCH 09/10] adding exponential backoff retry decorator to utils using the decorator to safely call the user-provided LLM function and the AIMon Detect function. Re-raises last encountered exception upon failure. 
Also handled the division-by-zero error in penalized_average by returning -1 when the list of follow probabilities is empty, and added this info to the documentation. --- aimon/reprompting_api/pipeline.py | 75 ++++++++++++++---------------- aimon/reprompting_api/utils.py | 64 +++++++++++++++++++++++-- tests/test_reprompting_failures.py | 9 ++-- 3 files changed, 101 insertions(+), 47 deletions(-) diff --git a/aimon/reprompting_api/pipeline.py b/aimon/reprompting_api/pipeline.py index 2635001..4e962f2 100644 --- a/aimon/reprompting_api/pipeline.py +++ b/aimon/reprompting_api/pipeline.py @@ -1,7 +1,7 @@ from aimon.reprompting_api.config import RepromptingConfig, StopReasons from aimon.reprompting_api.telemetry import TelemetryLogger from aimon.reprompting_api.reprompter import Reprompter -from aimon.reprompting_api.utils import toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score, get_failed_toxicity_instructions +from aimon.reprompting_api.utils import retry, toxicity_check, get_failed_instructions_count, get_failed_instructions, get_residual_error_score, get_failed_toxicity_instructions from aimon import Detect import time import random @@ -243,57 +243,50 @@ def _build_aimon_payload(self, context, user_query, user_instructions, generated } return payload - def _call_llm(self, prompt_template: Template, max_attempts, system_prompt=None, context=None, user_query=None, base_delay=1): + def _call_llm(self, prompt_template: Template, max_attempts, system_prompt=None, context=None, user_query=None): """ Calls the LLM with exponential backoff. Retries if the LLM call fails - OR returns a non-string value. Raises an exception if all retries fail. + OR returns a non-string value. If all retries fail, the last encountered + exception from the LLM call is re-raised. Args: prompt_template (Template): Prompt template for the LLM. max_attempts (int): Max retry attempts. - base_delay (float): Initial delay in seconds before backoff.
- + Returns: str: LLM response text. Raises: - RuntimeError: If the LLM call fails or returns an invalid type after all retries. + RuntimeError: If the LLM call repeatedly fails, re-raises the last encountered error. + TypeError: If the LLM call fails to return a string. """ - last_exception = None - for attempt in range(max_attempts): - try: - logger.debug(f"LLM call attempt {attempt+1} with prompt template.") - result = self.llm_fn(prompt_template, system_prompt, context, user_query) - # Validate type - if not isinstance(result, str): - raise TypeError(f"LLM returned invalid type {type(result).__name__}, expected str.") - return result - except Exception as e: - last_exception = e - logger.warning(f"LLM call failed on attempt {attempt+1}: {e}") - wait_time = base_delay * (2 ** attempt) + random.uniform(0, 0.1) - time.sleep(wait_time) - raise RuntimeError(f"LLM call failed or returned invalid type after maximum retries. Last error: {last_exception}") + @retry(exception_to_check=Exception, tries=max_attempts, delay=1, backoff=2, logger=logger) + def backoff_call(): + result = self.llm_fn(prompt_template, system_prompt, context, user_query) + if not isinstance(result, str): + raise TypeError(f"LLM returned invalid type {type(result).__name__}, expected str.") + return result + return backoff_call() - def _detect_aimon_response(self, payload, max_attempts, base_delay=1): + def _detect_aimon_response(self, payload, max_attempts): """ Calls AIMon Detect with exponential backoff and returns the detection result. This method wraps the AIMon evaluation call, retrying if it fails due to transient errors (e.g., network issues or temporary service unavailability). It retries up to - `max_attempts` times with exponential backoff before raising a RuntimeError. + `max_attempts` times with exponential backoff before raising the last encountered + exception from the AIMon Detect call. 
Args: payload (dict): A dictionary containing 'context', 'user_query', 'instructions', and 'generated_text' for evaluation. max_attempts (int): Maximum number of retry attempts. - base_delay (float): Initial delay in seconds before exponential backoff. Returns: object: The AIMon detection result containing evaluation scores and feedback. Raises: - RuntimeError: If AIMon Detect fails after all retry attempts. + RuntimeError: If AIMon Detect fails after all retry attempts, re-raises the last encountered error. """ aimon_context = f"{payload['context']}\n\nUser Query:\n{payload['user_query']}" aimon_query = f"{payload['user_query']}\n\nInstructions:\n{payload['instructions']}" @@ -302,21 +295,23 @@ def _detect_aimon_response(self, payload, max_attempts, base_delay=1): def run_detection(query, instructions, generated_text, context): return query, instructions, generated_text, context - for attempt in range(max_attempts): - try: - logger.debug(f"AIMon detect attempt {attempt+1} with payload: {payload}") - _, _, _, _, result = run_detection( - aimon_query, - payload['instructions'], - payload['generated_text'], - aimon_context - ) - return result - except Exception as e: - logger.debug(f"AIMon detect failed on attempt {attempt+1}: {e}") - wait_time = base_delay * (2 ** attempt) + random.uniform(0, 0.1) - time.sleep(wait_time) - raise RuntimeError("AIMon detect call failed after maximum retries.") + @retry( + exception_to_check=Exception, + tries=max_attempts, + delay=1, + backoff=2, + logger=logger + ) + def inner_detection(): + logger.debug(f"AIMon detect call with payload: {payload}") + _, _, _, _, result = run_detection( + aimon_query, + payload['instructions'], + payload['generated_text'], + aimon_context + ) + return result + return inner_detection() def get_response_feedback(self, result): """ diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py index e1144a8..20740fc 100644 --- a/aimon/reprompting_api/utils.py +++ 
b/aimon/reprompting_api/utils.py @@ -14,7 +14,62 @@ - Toxicity failures are flagged when follow_probability > TOXICITY_THRESHOLD (default 0.25). - Residual error scoring penalizes low follow probabilities more heavily and adds a flat penalty for any toxicity failures. """ -from typing import List +from typing import Callable, Type, Union, Tuple, Optional, List +from functools import wraps +import logging +import random +import time + +def retry( + exception_to_check: Union[Type[BaseException], Tuple[Type[BaseException], ...]], + tries: int = 5, + delay: int = 3, + backoff: int = 2, + logger: Optional[logging.Logger] = None, + log_level: int = logging.WARNING, + re_raise: bool = True, + jitter: float = 0.1 +) -> Callable: + """ + Retry calling the decorated function using an exponential backoff. + :param exception_to_check: Exception or a tuple of exceptions to check. + :param tries: Number of times to try (not retry) before giving up. + :param delay: Initial delay between retries in seconds. + :param backoff: Backoff multiplier e.g., a value of 2 will double the delay each retry. + :param logger: Logger to use. If None, print. + :param log_level: Logging level. + :param re_raise: Whether to re-raise the exception after the last retry. + :param jitter: The maximum jitter to apply to the delay as a fraction of the delay. + """ + + def deco_retry(func: Callable) -> Callable: + @wraps(func) + def f_retry(*args, **kwargs): + remaining_tries, current_delay = tries, delay + while remaining_tries > 1: + try: + return func(*args, **kwargs) + except exception_to_check as e: + msg = f"{e}, Retrying in {current_delay} seconds..." + if logger: + logger.log(log_level, msg) + else: + print(msg) + time.sleep(current_delay * (1 + jitter * (2 * random.random() - 1))) + remaining_tries -= 1 + current_delay *= backoff + try: + return func(*args, **kwargs) + except exception_to_check as e: + msg = f"Failed after {tries} tries. 
{e}" + if logger: + logger.log(log_level, msg) + else: + print(msg) + if re_raise: + raise + return f_retry + return deco_retry # toxicity threshold for AIMon detection; Follow probabilities above this are considered failures TOXICITY_THRESHOLD = 0.25 @@ -168,11 +223,14 @@ def penalized_average(probs: List[float]) -> float: Probabilities > 0.5 (passed instructions) recieve no penalty Args: - probs (List[float]): A list of follow probabilities. + probs (List[float]): A list of follow probabilities. Must be non-empty. Returns: - float: Penalized average. + float: Penalized average. Return -1 if probs is empty. """ + if not probs: # handle division by zero for empty list + return -1 + penalties = [] for p in probs: if p >= 0.5: diff --git a/tests/test_reprompting_failures.py b/tests/test_reprompting_failures.py index aacc764..acf9511 100644 --- a/tests/test_reprompting_failures.py +++ b/tests/test_reprompting_failures.py @@ -2,6 +2,7 @@ import pytest from string import Template from together import Together +import aimon from aimon.reprompting_api.config import RepromptingConfig from aimon.reprompting_api.runner import run_reprompting_pipeline @@ -61,7 +62,7 @@ def get_config_with_invalid_aimon_api_key(): def test_llm_failure(): """Should raise RuntimeError when the LLM function always fails.""" config = get_config() - with pytest.raises(RuntimeError, match="LLM call failed or returned invalid type after maximum retries."): + with pytest.raises(RuntimeError, match="LLM call failed intentionally for testing"): run_reprompting_pipeline( user_query="Test LLM failure handling", context="Context for failure test", @@ -85,9 +86,9 @@ def test_invalid_llm_fn(): @pytest.mark.integration def test_invalid_return_value(): - """Should raise RuntimeError when the LLM returns a non-string value.""" + """Should raise TypeError when the LLM returns a non-string value.""" config = get_config() - with pytest.raises(RuntimeError, match="LLM call failed or returned invalid type"): + with 
pytest.raises(TypeError, match="LLM returned invalid type int, expected str."): run_reprompting_pipeline( user_query="Test invalid return type", context="Context for type error", @@ -113,7 +114,7 @@ def test_empty_query(): def test_invalid_api_key(): """Should fail due to invalid AIMon API key.""" config = get_config_with_invalid_aimon_api_key() - with pytest.raises(RuntimeError): + with pytest.raises(aimon.AuthenticationError): run_reprompting_pipeline( user_query="Testing with invalid AIMon API key", context="Context for invalid key test", From cec997535d40f5863eb913644f13dbabf757384b Mon Sep 17 00:00:00 2001 From: ashah-aanya Date: Thu, 31 Jul 2025 18:45:51 -0700 Subject: [PATCH 10/10] removing TogetherAI integration in tests and using a dummy LLM response for pytest --- tests/test_reprompting_cases.py | 24 +------ tests/test_reprompting_failures.py | 20 +----- tests/test_reprompting_success.py | 105 ----------------------------- 3 files changed, 5 insertions(+), 144 deletions(-) delete mode 100644 tests/test_reprompting_success.py diff --git a/tests/test_reprompting_cases.py b/tests/test_reprompting_cases.py index 5dfa5ea..b913bf6 100644 --- a/tests/test_reprompting_cases.py +++ b/tests/test_reprompting_cases.py @@ -1,37 +1,23 @@ import os import pytest from string import Template -from together import Together from aimon.reprompting_api.config import RepromptingConfig from aimon.reprompting_api.runner import run_reprompting_pipeline -TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY") AIMON_API_KEY = os.environ.get("AIMON_API_KEY") -client = Together(api_key=TOGETHER_API_KEY) - # --- Fixtures --- @pytest.fixture def my_llm(): """Mock LLM function for integration tests. 
Prints prompts and responses.""" def _my_llm(recommended_prompt_template: Template, system_prompt, context, user_query) -> str: - filled_prompt = recommended_prompt_template.substitute( + filled_prompt = recommended_prompt_template.safe_substitute( system_prompt=system_prompt or "", context=context or "", user_query=user_query or "" ) - print("\n==== LLM PROMPT SENT ====", flush=True) - print(filled_prompt, flush=True) - response = client.chat.completions.create( - model="mistralai/Mistral-7B-Instruct-v0.2", - messages=[{"role": "user", "content": filled_prompt}], - max_tokens=256, - temperature=0 - ) - print("\n==== LLM RAW RESPONSE ====", flush=True) - print(response.choices[0].message.content, flush=True) - return response.choices[0].message.content + return filled_prompt return _my_llm @pytest.fixture @@ -104,7 +90,6 @@ def print_result(test_name, result): # --- Tests --- -@pytest.mark.integration def test_low_latency_limit(my_llm, config_low_latency): """Test stopping behavior when latency limit is very low (100ms).""" result = run_reprompting_pipeline( @@ -117,7 +102,6 @@ def test_low_latency_limit(my_llm, config_low_latency): print_result("Low Latency Limit Test (100ms)", result) assert "best_response" in result -@pytest.mark.integration def test_latency_limit(my_llm, config_high_latency): """Test behavior with a high latency limit and contradictory instructions.""" result = run_reprompting_pipeline( @@ -130,7 +114,6 @@ def test_latency_limit(my_llm, config_high_latency): print_result("High Latency Limit Test (5000ms)", result) assert "best_response" in result -@pytest.mark.integration def test_iteration_limit(my_llm, config_iteration_limit): """Test behavior when max_iterations is 1.""" result = run_reprompting_pipeline( @@ -144,7 +127,6 @@ def test_iteration_limit(my_llm, config_iteration_limit): print_result("Iteration Limit Test (no re-prompting, only 1 iteration allowed)", result) assert "best_response" in result -@pytest.mark.integration def 
test_empty_context_and_instructions(my_llm, base_config): """Ensure pipeline works with no context, instructions, or system prompt.""" result = run_reprompting_pipeline( @@ -157,7 +139,6 @@ def test_empty_context_and_instructions(my_llm, base_config): print_result("Empty Context & Instructions Test", result) assert "best_response" in result -@pytest.mark.integration def test_no_telemetry(my_llm, config_without_telemetry): """Confirm telemetry and summary are excluded when disabled in config.""" result = run_reprompting_pipeline( @@ -171,7 +152,6 @@ def test_no_telemetry(my_llm, config_without_telemetry): assert "telemetry" not in result assert "summary" not in result -@pytest.mark.integration def test_no_system_prompt(my_llm, base_config): """Test behavior when system prompt is excluded.""" result = run_reprompting_pipeline( diff --git a/tests/test_reprompting_failures.py b/tests/test_reprompting_failures.py index acf9511..df5ca9b 100644 --- a/tests/test_reprompting_failures.py +++ b/tests/test_reprompting_failures.py @@ -1,30 +1,21 @@ import os import pytest from string import Template -from together import Together import aimon from aimon.reprompting_api.config import RepromptingConfig from aimon.reprompting_api.runner import run_reprompting_pipeline -TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY") AIMON_API_KEY = os.environ.get("AIMON_API_KEY") -client = Together(api_key=TOGETHER_API_KEY) # --- MOCKED LLM FUNCTIONS --- def my_llm(prompt_template: Template, system_prompt=None, context=None, user_query=None) -> str: - """Simulates a normal working LLM that returns a string response.""" - filled_prompt = prompt_template.substitute( + """Simulates a normal working LLM that returns a string response. 
Just returns filled_prompt for test""" + filled_prompt = prompt_template.safe_substitute( system_prompt=system_prompt or "", context=context or "", user_query=user_query or "" ) - response = client.chat.completions.create( - model="mistralai/Mistral-7B-Instruct-v0.2", - messages=[{"role": "user", "content": filled_prompt}], - max_tokens=256, - temperature=0 - ) - return response.choices[0].message.content + return filled_prompt def llm_fn_failure(prompt_template: Template, system_prompt=None, context=None, user_query=None) -> str: """Simulates an LLM call that fails every time.""" @@ -58,7 +49,6 @@ def get_config_with_invalid_aimon_api_key(): ) # --- TESTS EXPECTING FAILURES --- -@pytest.mark.integration def test_llm_failure(): """Should raise RuntimeError when the LLM function always fails.""" config = get_config() @@ -71,7 +61,6 @@ def test_llm_failure(): user_instructions=[] ) -@pytest.mark.integration def test_invalid_llm_fn(): """Should raise TypeError when LLM function is None.""" config = get_config() @@ -84,7 +73,6 @@ def test_invalid_llm_fn(): user_instructions=[] ) -@pytest.mark.integration def test_invalid_return_value(): """Should raise TypeError when the LLM returns a non-string value.""" config = get_config() @@ -97,7 +85,6 @@ def test_invalid_return_value(): user_instructions=[] ) -@pytest.mark.integration def test_empty_query(): """Empty query should raise a ValueError.""" config = get_config() @@ -110,7 +97,6 @@ def test_empty_query(): user_instructions=[] ) -@pytest.mark.integration def test_invalid_api_key(): """Should fail due to invalid AIMon API key.""" config = get_config_with_invalid_aimon_api_key() diff --git a/tests/test_reprompting_success.py b/tests/test_reprompting_success.py deleted file mode 100644 index 5a6d2e6..0000000 --- a/tests/test_reprompting_success.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import logging -from string import Template -from together import Together -from aimon.reprompting_api.config import 
RepromptingConfig -from aimon.reprompting_api.runner import run_reprompting_pipeline - -# --- Configure logging --- -logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") -logger = logging.getLogger(__name__) - -# --- Load API keys --- -TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY") -AIMON_API_KEY = os.environ.get("AIMON_API_KEY") -if not TOGETHER_API_KEY or not AIMON_API_KEY: - logger.warning("API keys are missing. Make sure TOGETHER_API_KEY and AIMON_API_KEY are set.") - -# --- Initialize Together client --- -client = Together(api_key=TOGETHER_API_KEY) - -# --- LLM Function --- -def my_llm(recommended_prompt_template: Template, system_prompt=None, context=None, user_query=None) -> str: - """ - Example LLM function that: - 1. Receives a corrective prompt template (string.Template). - 2. Substitutes placeholders (system_prompt, context, user_query). - 3. Sends to a Together-hosted LLM and returns the response. - """ - filled_prompt = recommended_prompt_template.substitute( - system_prompt=system_prompt or "", - context=context or "", - user_query=user_query or "" - ) - logger.info(f"[LLM] Sending prompt to model: {filled_prompt[:200]}...") # Log preview - - response = client.chat.completions.create( - model="mistralai/Mistral-7B-Instruct-v0.2", - messages=[{"role": "user", "content": filled_prompt}], - max_tokens=256, - temperature=0 - ) - output = response.choices[0].message.content - logger.info(f"[LLM] Received response: {output[:200]}...") - return output - -# --- Test Case: Successful Run --- -def test_successful_run(): - """ - Simulates a realistic pipeline run with: - - Complex context - - Query for simplification - - Multiple style/tone instructions - - Telemetry & summary enabled - """ - logger.info("[Pipeline] Starting test run...") - - config = RepromptingConfig( - aimon_api_key=AIMON_API_KEY, - publish=True, - return_telemetry=True, - return_aimon_summary=True, - application_name="api_test", - max_iterations=3 
- ) - logger.info("[Pipeline] Config prepared.") - - user_query = "What are the drug tiers?" - context = "[SECTION] 📘 BlueShield Rx Policy Addendum: 2023–2025 Commercial & Employer-Sponsored Plans [SECTION] Confidential – Not for external dissemination without compliance review. [SECTION] 🔹 Section 2.1.7 – Drug Coverage Eligibility Matrix [SECTION] Prescription drug eligibility is governed by a tiered, multi-variant benefit design informed by annual P&T Committee decisions, manufacturer rebates, CMS Part D benchmarking (when applicable), and employer-specific customizations. The following formulary tiers apply unless superseded by a group rider or conditional override: [SECTION] - **Tier 1 (Generic Core):** Includes FDA-approved AB-rated generics; requires no PA or ST, unless the member is flagged under the Risk Management Tier Hold (RMTH) protocol due to prior misuse. [SECTION] - **Tier 2 (Preferred Brand & Enhanced Generics):** Coverage dependent on documented trial/failure of Tier 1 alternatives unless contraindicated. Members in the Legacy Bridge plan must obtain both prescriber attestation and pharmacy alignment verification. [SECTION] - **Tier 3 (Non-Preferred & Specialty Entry):** May require dual-layer review if member has not met chronic condition enrollment criteria (CCE) in the last benefit year. Tier migration possible mid-cycle based on new formulary rules. [SECTION] - **Tier 4 (Specialty Injectables, Biologics, and Condition-Limited Agents):** Includes drugs subject to clinical pathway alignment; claims must be adjudicated through the PBM’s split-fulfillment logic unless the prescribing entity is credentialed as Tier 4-A. [SECTION] 🚫 Exception: Certain biosimilars classified under Tier 4 in national formularies may be covered at Tier 2 if dispensed under limited-distribution contracts, provided the prescribing facility participates in the 340B program **and** the member is flagged under Enhanced Affordability Priority (EAP). 
[SECTION] 🔁 **Prior Authorization (PA) Layering Logic** [SECTION] Drugs requiring PA are subject to a three-stage filter: [SECTION] 1. **Therapeutic Criteria Review (TCR)** – Clinical alignment with diagnosis and formulary path. [SECTION] 2. **Coverage Policy Sync (CPS)** – Matches requested use with plan sponsor coverage schema. [SECTION] 3. **Utilization Watch Flag (UWF)** – If triggered, a third-party medical director review is initiated (adds 2–4 business days). [SECTION] 💡 Exemplar: *Trulicity* (GLP-1 receptor agonist) [SECTION] - **Base Tier:** Tier 3 across most commercial plans [SECTION] - **Override Possibility:** Auto-lifts to Tier 2 under Metabolic Risk Bundling if member is concurrently enrolled in cardiac risk management AND insulin titration modules. [SECTION] - **Caveat:** Auto-injector version may still trigger UWF if prescribed without 90-day adherence documentation to metformin or contraindication to semaglutide. [SECTION] 🗂️ **Adjudication Complexity Notes** [SECTION] - Fill attempts at non-network or out-of-state pharmacies may default to full retail pricing, even if coverage is active. [SECTION] - Certain maintenance tier drugs can only be filled at 90-day intervals after two successful 30-day fills unless dispensed via SmartSync (auto-align refill system). [SECTION] - Claims using discount cards (e.g., manufacturer copay assistance) will not count toward deductible or out-of-pocket limits unless the pharmacy submits a Coordinated Adjudication Adjustment Request (CAAR). [SECTION] ⚠️ **Denials & Appeals** [SECTION] - If PA is denied, appeals must cite new clinical rationale. Re-submission of identical documentation will be auto-denied. [SECTION] - Members in Tier Restructuring Delay (TRD) periods due to employer override cannot file external appeals unless the drug is life-sustaining and not replaceable under Tier 1/2. 
[SECTION] - Denials on non-formulary drugs are not eligible for Tier Transition Program (TTP) unless covered during prior plan year with no lapse in coverage >30 days. [SECTION] 📊 **Plan Differences** [SECTION] - Standard, Enhanced, Platinum, and Concierge tiers each have different deductible-accumulation thresholds and copay structures. [SECTION] - For Platinum+ plans, Tier 3 copay is waived on first-time fills initiated post-discharge from an inpatient episode if coded using post-acute NDCs. [SECTION] 📣 Misc. Clarifications [SECTION] - The “Healthy Living Rewards” program, mentioned in new member packets, does not affect coverage or drug tier placement. It is a wellness initiative only. [SECTION] - Benefit year resets on Jan 1, but tier realignment occurs quarterly and may retroactively affect claims filled in the trailing 45-day buffer period. [SECTION] 🔒 REMINDER: Member Services guidance may reflect outdated tier assignments if formulary refreshes are in progress. Online lookup tools update in real time and take precedence during adjudication disputes." - system_prompt = "You are a knowledgeable but approachable healthcare benefits assistant. Your role is to help users understand BlueShield prescription drug policies by explaining terms and tiers in simple, clear, and user‑friendly language. Always prioritize accuracy and clarity over technical jargon." - user_instructions = [ - "Avoid overly technical or robotic phrasing; keep the tone human and accessible.", - "Ensure the response is direct and professional, with minimal informal tone.", - "Translate or simplify technical details from the context into accurate, user-friendly explanations." 
- ] - logger.info(f"[Pipeline] User query: {user_query}") - logger.info(f"[Pipeline] Context: {context[:100]}...") - logger.info(f"[Pipeline] Instructions: {user_instructions}") - - # Run pipeline - result = run_reprompting_pipeline( - llm_fn=my_llm, - user_query=user_query, - system_prompt= system_prompt, - context=context, - user_instructions=user_instructions, - reprompting_config=config - ) - - # Log each part of the result - logger.info("[Pipeline] Run complete.") - logger.info(f"[Pipeline] Best Response: {result['best_response']}") - logger.info(f"[Pipeline] Telemetry: {result.get('telemetry')}") - logger.info(f"[Pipeline] Summary: {result.get('summary')}") - - # Print outputs for inspection - print("\n=== BEST RESPONSE ===") - print(result["best_response"]) - print("\n=== TELEMETRY ===") - print(result.get("telemetry")) - print("\n=== SUMMARY ===") - print(result.get("summary")) - -# --- Entry Point --- -if __name__ == "__main__": - test_successful_run()