2 changes: 1 addition & 1 deletion aimon/_version.py
@@ -1,4 +1,4 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
 __title__ = "aimon"
-__version__ = "0.12.1"
+__version__ = "0.12.2"
6 changes: 5 additions & 1 deletion aimon/reprompting_api/reprompter.py
@@ -84,6 +84,9 @@ def get_toxicity_reprompt(self, result) -> str:
 
 Returns:
 str: Toxicity-specific feedback, or None if no toxicity detected.
+
+Note:
+For toxicity, lower scores indicate higher toxicity. We invert the score to show confidence.
 """
 try:
 failed_instructions = get_failed_toxicity_instructions(result)
@@ -92,7 +95,8 @@
 logger.info(f"Toxicity violations detected: {len(failed_instructions)}")
 lines = ["Your reply contained toxic content. Remove any harmful, abusive, or unsafe language."]
 for i, failed_instruction in enumerate(failed_instructions, start=1):
-confidence = failed_instruction.get("score", 0.0) * 100
+# For toxicity, lower score = more toxic, so invert to show confidence
+confidence = (1.0 - failed_instruction.get("score", 0.0)) * 100
 confidence_str = f"{confidence:.2f}%"
 lines.append(
 f"{i}. We are {confidence_str} confident that your response had the following issue:\n"
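The reprompt message now reports confidence as the inverse of the raw toxicity score. A minimal sketch of that arithmetic, using a hypothetical failed_instruction dict whose instruction text and score are illustrative only, not taken from a real AIMon response:

# Hypothetical failed-toxicity instruction; instruction text and score are illustrative only.
failed_instruction = {"instruction": "Do not use abusive language.", "score": 0.08}

# Lower score = more toxic, so the displayed confidence is the inverse of the score.
confidence = (1.0 - failed_instruction.get("score", 0.0)) * 100
print(f"We are {confidence:.2f}% confident that your response had the following issue.")
# Prints: We are 92.00% confident that your response had the following issue.
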
34 changes: 20 additions & 14 deletions aimon/reprompting_api/utils.py
@@ -11,7 +11,7 @@
 - Guide corrective re-prompting logic.
 
 Key conventions:
-- Toxicity failures are flagged when follow_probability > TOXICITY_THRESHOLD (default 0.25).
+- Toxicity failures are flagged when follow_probability < TOXICITY_THRESHOLD (default 0.5). Lower scores indicate higher toxicity.
 - Residual error scoring penalizes low follow probabilities more heavily and adds a flat penalty for any toxicity failures.
 """
 from typing import Callable, Type, Union, Tuple, Optional, List
@@ -71,12 +71,13 @@ def f_retry(*args, **kwargs):
 return f_retry
 return deco_retry
 
-# toxicity threshold for AIMon detection; Follow probabilities above this are considered failures
-TOXICITY_THRESHOLD = 0.25
+# toxicity threshold for AIMon detection; Follow probabilities below this are considered failures (lower score = more toxic)
+TOXICITY_THRESHOLD = 0.5
 
 def _count_toxicity_failures(result) -> int:
 """
-Count the number of toxicity instructions whose follow probability exceeds the threshold.
+Count the number of toxicity instructions whose follow probability is below the threshold.
+Lower scores indicate higher toxicity.
 
 Args:
 result: AIMon detection result containing a `toxicity` section.
@@ -87,25 +88,27 @@ def _count_toxicity_failures(result) -> int:
 return sum(
 1
 for inst in result.detect_response.toxicity.get("instructions_list", [])
-if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD
+if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD
 )
 
 def toxicity_check(result) -> bool:
 """
-Check whether any toxicity instructions exceed the threshold.
+Check whether any toxicity instructions fall below the threshold.
+Lower scores indicate higher toxicity.
 
 Args:
 result: AIMon detection result containing a `toxicity` section.
 
 Returns:
-bool: True if at least one toxicity instruction exceeds the threshold, False otherwise.
+bool: True if at least one toxicity instruction is below the threshold, False otherwise.
 """
 return _count_toxicity_failures(result) > 0
 
 
 def get_failed_toxicity_instructions(result) -> List[dict]:
 """
-Extract failed toxicity instructions exceeding the threshold.
+Extract failed toxicity instructions below the threshold.
+Lower scores indicate higher toxicity.
 
 Args:
 result: AIMon detection result containing a `toxicity` section.
@@ -120,7 +123,7 @@ def get_failed_toxicity_instructions(result) -> List[dict]:
 """
 failed = []
 for inst in result.detect_response.toxicity.get("instructions_list", []):
-if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD:
+if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD:
 failed.append({
 "type": "toxicity_failure",
 "source": "toxicity",
@@ -188,13 +191,16 @@ def get_residual_error_score(result):
 Compute a normalized residual error score (0–1) based on:
 - Groundedness follow probabilities
 - Instruction adherence follow probabilities
-- Toxicity (inverted: 1 - follow_probability)
+- Toxicity follow probabilities (lower scores indicate higher toxicity)
 
 Logic:
-1. Collect follow probabilities for groundedness & adherence.
-2. For toxicity, use 1 - follow_probability (since high follow = low error).
+1. Collect follow probabilities for groundedness, adherence, and toxicity.
+2. For toxicity, use follow_probability directly (since lower scores = higher toxicity = higher error).
 3. Compute a penalized average using the helper.
 4. Clamp the final score to [0,1].
+
+Note: Unlike groundedness/adherence where high scores are good, toxicity scores are already
+in the "error" direction (low score = toxic = bad), so no inversion is needed.
 """
 combined_probs = []
 
@@ -204,9 +210,9 @@
 for item in getattr(result.detect_response, source, {}).get("instructions_list", [])
 ])
 
-# For toxicity, invert the follow probability
+# For toxicity, use the follow probability directly (lower = more toxic = higher error)
 combined_probs.extend([
-1 - item["follow_probability"]
+item["follow_probability"]
 for item in getattr(result.detect_response, "toxicity", {}).get("instructions_list", [])
 ])
 
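Taken together, the utils.py changes flip the failure test from > to < and feed toxicity follow probabilities into the residual error score without inversion. A minimal sketch of the new semantics, using a stand-in detect_response built from SimpleNamespace with illustrative probabilities (the real AIMon response object is assumed, not reproduced here):

from types import SimpleNamespace

TOXICITY_THRESHOLD = 0.5  # follow probabilities below this now count as failures

# Stand-in for result.detect_response; instruction texts and probabilities are illustrative only.
detect_response = SimpleNamespace(
    toxicity={"instructions_list": [
        {"instruction": "Avoid insults", "follow_probability": 0.2},  # below threshold -> flagged as toxic
        {"instruction": "Avoid threats", "follow_probability": 0.9},  # above threshold -> passes
    ]}
)

failed = [
    inst for inst in detect_response.toxicity.get("instructions_list", [])
    if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD
]
print(len(failed))  # 1, only the 0.2 instruction fails

# For the residual error score, toxicity probabilities are used as-is:
# a low value already means "high error", so no 1 - p inversion is applied.
combined_probs = [
    inst["follow_probability"]
    for inst in detect_response.toxicity.get("instructions_list", [])
]
print(combined_probs)  # [0.2, 0.9]
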
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
 name='aimon',
 python_requires='>3.8.0',
 packages=find_packages(),
-version="0.12.1",
+version="0.12.2",
 install_requires=[
 "annotated-types~=0.6.0",
 "anyio~=4.9.0",