
Commit b76340c

pjoshi30 (Preetam Joshi) authored
Corrected toxicity interpretation based on the new toxicity model behavior (#72)

* Corrected toxicity interpretation based on the new toxicity model behavior
* Bumping version of the package
* Corrected docstring for toxicity
* Added unit tests for reprompting utils
* Updated tests and fixed a few bugs
* Fixing tests

Co-authored-by: Preetam Joshi <info@aimon.ai>
1 parent e0e856f commit b76340c

File tree

6 files changed: +526 -62 lines changed


aimon/_version.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
 __title__ = "aimon"
-__version__ = "0.12.1"
+__version__ = "0.12.2"

aimon/reprompting_api/reprompter.py

Lines changed: 5 additions & 1 deletion
@@ -84,6 +84,9 @@ def get_toxicity_reprompt(self, result) -> str:
 
         Returns:
             str: Toxicity-specific feedback, or None if no toxicity detected.
+
+        Note:
+            For toxicity, lower scores indicate higher toxicity. We invert the score to show confidence.
         """
         try:
             failed_instructions = get_failed_toxicity_instructions(result)
@@ -92,7 +95,8 @@ def get_toxicity_reprompt(self, result) -> str:
             logger.info(f"Toxicity violations detected: {len(failed_instructions)}")
             lines = ["Your reply contained toxic content. Remove any harmful, abusive, or unsafe language."]
             for i, failed_instruction in enumerate(failed_instructions, start=1):
-                confidence = failed_instruction.get("score", 0.0) * 100
+                # For toxicity, lower score = more toxic, so invert to show confidence
+                confidence = (1.0 - failed_instruction.get("score", 0.0)) * 100
                 confidence_str = f"{confidence:.2f}%"
                 lines.append(
                     f"{i}. We are {confidence_str} confident that your response had the following issue:\n"

aimon/reprompting_api/utils.py

Lines changed: 20 additions & 14 deletions
@@ -11,7 +11,7 @@
 - Guide corrective re-prompting logic.
 
 Key conventions:
-- Toxicity failures are flagged when follow_probability > TOXICITY_THRESHOLD (default 0.25).
+- Toxicity failures are flagged when follow_probability < TOXICITY_THRESHOLD (default 0.5). Lower scores indicate higher toxicity.
 - Residual error scoring penalizes low follow probabilities more heavily and adds a flat penalty for any toxicity failures.
 """
 from typing import Callable, Type, Union, Tuple, Optional, List
@@ -71,12 +71,13 @@ def f_retry(*args, **kwargs):
         return f_retry
     return deco_retry
 
-# toxicity threshold for AIMon detection; Follow probabilities above this are considered failures
-TOXICITY_THRESHOLD = 0.25
+# toxicity threshold for AIMon detection; Follow probabilities below this are considered failures (lower score = more toxic)
+TOXICITY_THRESHOLD = 0.5
 
 def _count_toxicity_failures(result) -> int:
     """
-    Count the number of toxicity instructions whose follow probability exceeds the threshold.
+    Count the number of toxicity instructions whose follow probability is below the threshold.
+    Lower scores indicate higher toxicity.
 
     Args:
         result: AIMon detection result containing a `toxicity` section.
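To make the flipped convention concrete, here is a minimal sketch of the failure check with the updated threshold; the instructions_list payload below is a hypothetical stand-in, not real detector output:

TOXICITY_THRESHOLD = 0.5  # follow probabilities below this count as failures

# Hypothetical payload shaped like the detector's instructions_list;
# lower follow_probability = more toxic.
instructions_list = [
    {"instruction": "Avoid abusive language", "follow_probability": 0.92},  # passes
    {"instruction": "Avoid threats", "follow_probability": 0.18},           # fails
]

failures = sum(
    1
    for inst in instructions_list
    if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD
)
print(failures)  # 1 -> only 0.18 fails; under the old "> 0.25" rule, 0.92 would have been flagged instead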
@@ -87,25 +88,27 @@ def _count_toxicity_failures(result) -> int:
     return sum(
         1
         for inst in result.detect_response.toxicity.get("instructions_list", [])
-        if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD
+        if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD
     )
 
 def toxicity_check(result) -> bool:
     """
-    Check whether any toxicity instructions exceed the threshold.
+    Check whether any toxicity instructions fall below the threshold.
+    Lower scores indicate higher toxicity.
 
     Args:
         result: AIMon detection result containing a `toxicity` section.
 
     Returns:
-        bool: True if at least one toxicity instruction exceeds the threshold, False otherwise.
+        bool: True if at least one toxicity instruction is below the threshold, False otherwise.
     """
     return _count_toxicity_failures(result) > 0
 
 
 def get_failed_toxicity_instructions(result) -> List[dict]:
     """
-    Extract failed toxicity instructions exceeding the threshold.
+    Extract failed toxicity instructions below the threshold.
+    Lower scores indicate higher toxicity.
 
     Args:
         result: AIMon detection result containing a `toxicity` section.
@@ -120,7 +123,7 @@ def get_failed_toxicity_instructions(result) -> List[dict]:
     """
     failed = []
     for inst in result.detect_response.toxicity.get("instructions_list", []):
-        if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD:
+        if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD:
             failed.append({
                 "type": "toxicity_failure",
                 "source": "toxicity",
@@ -188,13 +191,16 @@ def get_residual_error_score(result):
     Compute a normalized residual error score (0–1) based on:
     - Groundedness follow probabilities
     - Instruction adherence follow probabilities
-    - Toxicity (inverted: 1 - follow_probability)
+    - Toxicity follow probabilities (lower scores indicate higher toxicity)
 
     Logic:
-    1. Collect follow probabilities for groundedness & adherence.
-    2. For toxicity, use 1 - follow_probability (since high follow = low error).
+    1. Collect follow probabilities for groundedness, adherence, and toxicity.
+    2. For toxicity, use follow_probability directly (since lower scores = higher toxicity = higher error).
     3. Compute a penalized average using the helper.
     4. Clamp the final score to [0,1].
+
+    Note: Unlike groundedness/adherence where high scores are good, toxicity scores are already
+    in the "error" direction (low score = toxic = bad), so no inversion is needed.
     """
     combined_probs = []
 
@@ -204,9 +210,9 @@ def get_residual_error_score(result):
         for item in getattr(result.detect_response, source, {}).get("instructions_list", [])
     ])
 
-    # For toxicity, invert the follow probability
+    # For toxicity, use the follow probability directly (lower = more toxic = higher error)
     combined_probs.extend([
-        1 - item["follow_probability"]
+        item["follow_probability"]
        for item in getattr(result.detect_response, "toxicity", {}).get("instructions_list", [])
     ])
 
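The direction change matters for the residual error score: groundedness and adherence probabilities are "higher is better", while toxicity scores are already in the error direction, so they enter the pool unchanged. A rough standalone sketch of the collection step (the detector payload here is a simplified, hypothetical stand-in):

# Hypothetical detector outputs; every value is a follow probability in [0, 1].
detect_response = {
    "groundedness": {"instructions_list": [{"follow_probability": 0.9}]},
    "instruction_adherence": {"instructions_list": [{"follow_probability": 0.8}]},
    "toxicity": {"instructions_list": [{"follow_probability": 0.3}]},  # low = toxic
}

combined_probs = []
for source in ("groundedness", "instruction_adherence"):
    combined_probs.extend(
        item["follow_probability"]
        for item in detect_response.get(source, {}).get("instructions_list", [])
    )

# Toxicity enters the pool unchanged: a low probability already signals
# high error, so no "1 - p" inversion is applied anymore.
combined_probs.extend(
    item["follow_probability"]
    for item in detect_response.get("toxicity", {}).get("instructions_list", [])
)

print(combined_probs)  # [0.9, 0.8, 0.3] -> a penalized average then maps this to an error in [0, 1]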

setup.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
     name='aimon',
     python_requires='>3.8.0',
     packages=find_packages(),
-    version="0.12.1",
+    version="0.12.2",
     install_requires=[
         "annotated-types~=0.6.0",
         "anyio~=4.9.0",

tests/test_detect.py

Lines changed: 41 additions & 45 deletions
@@ -39,8 +39,8 @@ def log_info(self, title, data):
 
     def test_basic_detect_functionality(self, caplog):
         """Test that the Detect decorator works with basic functionality without raising exceptions."""
-        # Create the decorator
-        config = {'hallucination': {'detector_name': 'default'}}
+        # Create the decorator (using groundedness instead of deprecated hallucination)
+        config = {'groundedness': {'detector_name': 'default'}}
         values_returned = ["context", "generated_text", "user_query"]
 
         self.log_info("TEST", "Basic detect functionality")
@@ -71,11 +71,10 @@ def generate_summary(context, query):
         self.log_info("OUTPUT_GENERATED_TEXT", generated_text)
         self.log_info("OUTPUT_STATUS", result.status)
 
-        if hasattr(result.detect_response, 'hallucination'):
-            self.log_info("OUTPUT_HALLUCINATION", {
-                "is_hallucinated": result.detect_response.hallucination.get("is_hallucinated", ""),
-                "score": result.detect_response.hallucination.get("score", ""),
-                "sentences_count": len(result.detect_response.hallucination.get("sentences", []))
+        if hasattr(result.detect_response, 'groundedness'):
+            self.log_info("OUTPUT_GROUNDEDNESS", {
+                "score": result.detect_response.groundedness.get("score", ""),
+                "instructions_list": result.detect_response.groundedness.get("instructions_list", [])
             })
 
         # Verify return values
@@ -86,16 +85,14 @@ def generate_summary(context, query):
         # Verify response structure
         assert isinstance(result, DetectResult)
         assert result.status == 200
-        assert hasattr(result.detect_response, 'hallucination')
-        assert "is_hallucinated" in result.detect_response.hallucination
-        assert "score" in result.detect_response.hallucination
-        assert "sentences" in result.detect_response.hallucination
+        assert hasattr(result.detect_response, 'groundedness')
+        assert "score" in result.detect_response.groundedness
 
     def test_detect_with_multiple_detectors(self):
         """Test the Detect decorator with multiple detectors without raising exceptions."""
-        # Create the decorator with multiple detectors
+        # Create the decorator with multiple detectors (using groundedness instead of deprecated hallucination)
         config = {
-            'hallucination': {'detector_name': 'default'},
+            'groundedness': {'detector_name': 'default'},
             'instruction_adherence': {'detector_name': 'default'},
             'toxicity': {'detector_name': 'default'}
         }
@@ -131,25 +128,25 @@ def generate_response(context, query, instructions):
         self.log_info("Output - Generated Text", generated_text)
         self.log_info("Output - Status", result.status)
 
-        for detector in ['hallucination', 'instruction_adherence', 'toxicity']:
+        for detector in ['groundedness', 'instruction_adherence', 'toxicity']:
             if hasattr(result.detect_response, detector):
                 self.log_info(f"Output - {detector.capitalize()} Response",
                               getattr(result.detect_response, detector))
 
         # Verify response structure
-        assert hasattr(result.detect_response, 'hallucination')
+        assert hasattr(result.detect_response, 'groundedness')
         assert hasattr(result.detect_response, 'instruction_adherence')
         assert hasattr(result.detect_response, 'toxicity')
 
         # Check key fields without verifying values
-        assert "score" in result.detect_response.hallucination
+        assert "score" in result.detect_response.groundedness
         assert "instructions_list" in result.detect_response.instruction_adherence
         assert "score" in result.detect_response.toxicity
 
     def test_detect_with_different_iterables(self):
         """Test the Detect decorator with different iterable types for values_returned."""
         # Create the decorator with a tuple for values_returned
-        config = {'hallucination': {'detector_name': 'default'}}
+        config = {'groundedness': {'detector_name': 'default'}}
         values_returned = ("context", "generated_text")
 
         self.log_info("Test", "Detect with different iterables (tuple)")
@@ -176,16 +173,16 @@ def simple_function():
         self.log_info("Output - Generated Text", generated_text)
         self.log_info("Output - Status", result.status)
 
-        if hasattr(result.detect_response, 'hallucination'):
-            self.log_info("Output - Hallucination Response",
-                          result.detect_response.hallucination)
+        if hasattr(result.detect_response, 'groundedness'):
+            self.log_info("Output - Groundedness Response",
+                          result.detect_response.groundedness)
 
         # Verify return values and structure
         assert "Python" in context
         assert "data science" in generated_text
         assert isinstance(result, DetectResult)
-        assert hasattr(result.detect_response, 'hallucination')
-        assert "score" in result.detect_response.hallucination
+        assert hasattr(result.detect_response, 'groundedness')
+        assert "score" in result.detect_response.groundedness
 
     def test_detect_with_non_tuple_return(self):
         """Test the Detect decorator when the wrapped function returns a single value."""
@@ -235,7 +232,7 @@ def test_validate_iterable_values_returned(self):
         detect_with_list = Detect(
             values_returned=list_values,
             api_key=self.api_key,
-            config={'hallucination': {'detector_name': 'default'}}
+            config={'groundedness': {'detector_name': 'default'}}
         )
 
         # Test with a tuple
@@ -245,7 +242,7 @@ def test_validate_iterable_values_returned(self):
         detect_with_tuple = Detect(
             values_returned=tuple_values,
             api_key=self.api_key,
-            config={'hallucination': {'detector_name': 'default'}}
+            config={'groundedness': {'detector_name': 'default'}}
         )
 
         # Test with a custom iterable
@@ -266,7 +263,7 @@ def __len__(self):
         detect_with_custom = Detect(
             values_returned=custom_iterable,
             api_key=self.api_key,
-            config={'hallucination': {'detector_name': 'default'}}
+            config={'groundedness': {'detector_name': 'default'}}
         )
 
         # If we got here without exceptions, the test passes
@@ -380,7 +377,7 @@ def test_missing_required_fields(self):
             values_returned=["context", "generated_text"],
             api_key=self.api_key,
             publish=True,  # publish requires application_name and model_name
-            config={'hallucination': {'detector_name': 'default'}}
+            config={'groundedness': {'detector_name': 'default'}}
         )
         self.log_info("Error message (publish)", str(exc_info1.value))
 
@@ -391,7 +388,7 @@ def test_missing_required_fields(self):
             values_returned=["context", "generated_text"],
             api_key=self.api_key,
             async_mode=True,  # async_mode requires application_name and model_name
-            config={'hallucination': {'detector_name': 'default'}}
+            config={'groundedness': {'detector_name': 'default'}}
         )
         self.log_info("Error message (async_mode)", str(exc_info2.value))
 
@@ -434,15 +431,15 @@ def generate_text():
         assert hasattr(result.detect_response, 'toxicity')
         assert "score" in result.detect_response.toxicity
 
-    def test_hallucination_context_relevance_combination(self):
-        """Test the Detect decorator with a combination of hallucination and retrieval relevance detectors."""
+    def test_groundedness_context_relevance_combination(self):
+        """Test the Detect decorator with a combination of groundedness and retrieval relevance detectors."""
         config = {
-            'hallucination': {'detector_name': 'default'},
+            'groundedness': {'detector_name': 'default'},
             'retrieval_relevance': {'detector_name': 'default'}
         }
         values_returned = ["context", "generated_text", "user_query", "task_definition"]
 
-        self.log_info("Test", "Hallucination and Retrieval Relevance combination")
+        self.log_info("Test", "Groundedness and Retrieval Relevance combination")
         self.log_info("Configuration", config)
         self.log_info("Values returned", values_returned)
 
@@ -469,15 +466,15 @@ def generate_summary(context, query):
         self.log_info("Output - Generated Text", generated_text)
         self.log_info("Output - Status", result.status)
 
-        for detector in ['hallucination', 'retrieval_relevance']:
+        for detector in ['groundedness', 'retrieval_relevance']:
             if hasattr(result.detect_response, detector):
                 self.log_info(f"Output - {detector.capitalize()} Response",
                               getattr(result.detect_response, detector))
 
         # Verify response structure
         assert isinstance(result, DetectResult)
         assert result.status == 200
-        assert hasattr(result.detect_response, 'hallucination')
+        assert hasattr(result.detect_response, 'groundedness')
         assert hasattr(result.detect_response, 'retrieval_relevance')
 
     def test_instruction_adherence_v1(self):
@@ -593,7 +590,7 @@ def generate_with_instructions(context, instructions, query):
     def test_all_detectors_combination(self):
         """Test the Detect decorator with all available detectors."""
         config = {
-            'hallucination': {'detector_name': 'default'},
+            'groundedness': {'detector_name': 'default'},
             'toxicity': {'detector_name': 'default'},
             'instruction_adherence': {'detector_name': 'default'},
             'retrieval_relevance': {'detector_name': 'default'},
@@ -637,7 +634,7 @@ def comprehensive_response(context, query, instructions):
         self.log_info("Output - Status", result.status)
 
         # Log all detector responses
-        for detector in ['hallucination', 'toxicity', 'instruction_adherence',
+        for detector in ['groundedness', 'toxicity', 'instruction_adherence',
                          'retrieval_relevance', 'conciseness', 'completeness']:
             if hasattr(result.detect_response, detector):
                 self.log_info(f"Output - {detector.capitalize()} Response",
@@ -648,7 +645,7 @@ def comprehensive_response(context, query, instructions):
         assert result.status == 200
 
         # Verify all detectors are present in the response
-        assert hasattr(result.detect_response, 'hallucination')
+        assert hasattr(result.detect_response, 'groundedness')
         assert hasattr(result.detect_response, 'toxicity')
         assert hasattr(result.detect_response, 'instruction_adherence')
         assert hasattr(result.detect_response, 'retrieval_relevance')
@@ -772,7 +769,7 @@ def test_evaluate_with_new_model(self):
 
         # Configure evaluation
         eval_config = {
-            'hallucination': {'detector_name': 'default'},
+            'groundedness': {'detector_name': 'default'},
            'toxicity': {'detector_name': 'default'}
         }
 
@@ -829,9 +826,9 @@ def test_must_compute_validation(self):
         """Test that the must_compute parameter is properly validated."""
         print("\n=== Testing must_compute validation ===")
 
-        # Test config with both hallucination and completeness
+        # Test config with both groundedness and completeness
         test_config = {
-            "hallucination": {
+            "groundedness": {
                 "detector_name": "default"
             },
             "completeness": {
@@ -903,9 +900,9 @@ def test_must_compute_with_actual_service(self):
         """Test must_compute functionality with actual service calls."""
         print("\n=== Testing must_compute with actual service ===")
 
-        # Test config with both hallucination and completeness
+        # Test config with both groundedness and completeness
         test_config = {
-            "hallucination": {
+            "groundedness": {
                 "detector_name": "default"
             },
             "completeness": {
@@ -947,10 +944,9 @@ def generate_summary(context, query):
         print(f"Generated Text: {generated_text}")
 
         # Display response details
-        if hasattr(result.detect_response, 'hallucination'):
-            hallucination = result.detect_response.hallucination
-            print(f"Hallucination Score: {hallucination.get('score', 'N/A')}")
-            print(f"Is Hallucinated: {hallucination.get('is_hallucinated', 'N/A')}")
+        if hasattr(result.detect_response, 'groundedness'):
+            groundedness = result.detect_response.groundedness
+            print(f"Groundedness Score: {groundedness.get('score', 'N/A')}")
 
         if hasattr(result.detect_response, 'completeness'):
             completeness = result.detect_response.completeness
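The test migration above amounts to a single config change: request 'groundedness' where 'hallucination' used to be. A minimal usage sketch of the decorator under the new config, modeled on the tests above (the summary function is a stand-in for a real LLM call, and a valid AIMON_API_KEY environment variable is assumed):

import os

from aimon import Detect

# Hypothetical wiring that mirrors the test setup.
detect = Detect(
    values_returned=["context", "generated_text"],
    api_key=os.environ["AIMON_API_KEY"],
    config={"groundedness": {"detector_name": "default"}},
)

@detect
def generate_summary(context, query):
    # Stand-in for a real LLM call.
    generated_text = f"Summary: {context}"
    return context, generated_text

context, generated_text, result = generate_summary(
    "Python is widely used in data science.", "Summarize the context."
)
print(result.status)                                     # 200 on success
print(result.detect_response.groundedness.get("score"))  # groundedness score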
