@@ -96,7 +96,7 @@ def test_detect_with_multiple_detectors(self):
         # Create the decorator with multiple detectors
         config = {
             'hallucination': {'detector_name': 'default'},
-            'instruction_adherence': {'detector_name': 'v1'},
+            'instruction_adherence': {'detector_name': 'default'},
             'toxicity': {'detector_name': 'default'}
         }
         values_returned = ["context", "generated_text", "user_query", "instructions"]
@@ -120,7 +120,7 @@ def generate_response(context, query, instructions):
         # Call the decorated function
         context = "AI systems should be developed responsibly with proper oversight."
         query = "What does the text say about AI?"
-        instructions = "Provide a concise response with at most two sentences."
+        instructions = ["Provide a concise response with at most two sentences."]

         self.log_info("Input - Context", context)
         self.log_info("Input - Query", query)
@@ -143,7 +143,7 @@ def generate_response(context, query, instructions):

         # Check key fields without verifying values
         assert "score" in result.detect_response.hallucination
-        assert "results" in result.detect_response.instruction_adherence
+        assert "instructions_list" in result.detect_response.instruction_adherence
         assert "score" in result.detect_response.toxicity

     def test_detect_with_different_iterables(self):
@@ -482,7 +482,7 @@ def generate_summary(context, query):

     def test_instruction_adherence_v1(self):
         """Test the Detect decorator with instruction adherence detector using v1."""
-        config = {'instruction_adherence': {'detector_name': 'v1'}}
+        config = {'instruction_adherence': {'detector_name': 'default'}}
         values_returned = ["context", "generated_text", "instructions"]

         self.log_info("Test", "Instruction Adherence with detector_name=v1")
@@ -501,7 +501,7 @@ def generate_with_instructions(context, instructions):
             return context, generated_text, instructions

         context = "Climate change and its effects on our planet."
-        instructions = "Provide a short response in one sentence."
+        instructions = ["Provide a short response in one sentence."]

         self.log_info("Input - Context", context)
         self.log_info("Input - Instructions", instructions)
@@ -519,7 +519,6 @@ def generate_with_instructions(context, instructions):
         assert isinstance(result, DetectResult)
         assert result.status == 200
         assert hasattr(result.detect_response, 'instruction_adherence')
-        assert "results" in result.detect_response.instruction_adherence

     def test_instruction_adherence_default(self):
         """Test the Detect decorator with instruction adherence detector using default."""
@@ -596,7 +595,7 @@ def test_all_detectors_combination(self):
         config = {
             'hallucination': {'detector_name': 'default'},
             'toxicity': {'detector_name': 'default'},
-            'instruction_adherence': {'detector_name': 'v1'},  # Using v1 format which expects a string
+            'instruction_adherence': {'detector_name': 'default'},
             'retrieval_relevance': {'detector_name': 'default'},
             'conciseness': {'detector_name': 'default'},
             'completeness': {'detector_name': 'default'}
@@ -626,7 +625,7 @@ def comprehensive_response(context, query, instructions):

         context = "Renewable energy sources like solar and wind are becoming increasingly cost-effective alternatives to fossil fuels."
         query = "What are the trends in renewable energy?"
-        instructions = "Provide a factual response based only on the given context."
+        instructions = ["Provide a factual response based only on the given context."]

         self.log_info("Input - Context", context)
         self.log_info("Input - Query", query)
@@ -722,3 +721,107 @@ def generate_with_multiple_instructions(context, instructions, query):
             self.log_info("Error occurred during test", str(e))
             # Log the error but don't fail the test
             pytest.skip(f"Test skipped due to error: {str(e)}")
+
+    def test_evaluate_with_new_model(self):
+        """Test the evaluate function with a new model name that should be auto-created."""
+        import uuid
+        from aimon import evaluate, Client
+
+        # Generate a unique model name to ensure it doesn't exist
+        unique_model_name = f"test_model_{uuid.uuid4().hex[:8]}"
+        application_name = "test_application"
+        evaluation_name = f"test_eval_{uuid.uuid4().hex[:8]}"
+
+        self.log_info("Test", "Evaluate with new model auto-creation")
+        self.log_info("Model Name", unique_model_name)
+        self.log_info("Application Name", application_name)
+
+        # Create client
+        aimon_client = Client(auth_header=f"Bearer {self.api_key}")
+
+        # Create a test dataset CSV in a temporary file
+        import tempfile
+        import csv
+
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp:
+            writer = csv.writer(tmp)
+            writer.writerow(["context_docs", "user_query", "output"])
+            writer.writerow([
+                "AI systems should be developed responsibly with proper oversight.",
+                "What does the text say about AI?",
+                "The text states that AI systems should be developed responsibly with proper oversight."
+            ])
+            dataset_path = tmp.name
+
+        try:
+            # Upload the dataset
+            import json  # Add import at top of function if not already there
+            dataset_args = json.dumps({"name": "test_dataset.csv", "description": "Test dataset for evaluation"})
+            with open(dataset_path, 'rb') as file:
+                dataset = aimon_client.datasets.create(
+                    file=file,
+                    json_data=dataset_args
+                )
+
+            # Create dataset collection
+            collection_name = f"test_collection_{uuid.uuid4().hex[:8]}"
+            collection = aimon_client.datasets.collection.create(
+                name=collection_name,
+                dataset_ids=[dataset.sha],
+                description="Test collection for evaluation"
+            )
+
+            # Configure evaluation
+            eval_config = {
+                'hallucination': {'detector_name': 'default'},
+                'toxicity': {'detector_name': 'default'}
+            }
+
+            # Run evaluation
+            results = evaluate(
+                dataset_collection_name=collection_name,
+                headers=["context_docs", "user_query", "output"],
+                application_name=application_name,
+                model_name=unique_model_name,
+                evaluation_name=evaluation_name,
+                api_key=self.api_key,
+                aimon_client=aimon_client,
+                config=eval_config
+            )
+
+            self.log_info("Evaluation Results", results)
+
+            # Based on EvaluateResponse structure in aimon/decorators/evaluate.py
+            assert results is not None
+
+            # EvaluateResponse likely contains 'evaluation_id' or other identifying information.
+            # Just verify it's not empty and log its structure for debugging.
+            self.log_info("Results type", type(results))
+
+            # Log attributes if we can
+            try:
+                if hasattr(results, "__dict__"):
+                    self.log_info("Results attributes", results.__dict__)
+                else:
+                    self.log_info("Results dir", dir(results))
+            except Exception:
+                self.log_info("Could not log results attributes", "")
+
+            # Check for common attributes in evaluation responses
+            if hasattr(results, "evaluation_id"):
+                self.log_info("Evaluation ID", results.evaluation_id)
+
+            if hasattr(results, "task_id"):
+                self.log_info("Task ID", results.task_id)
+
+            self.log_info("Result", f"Successfully created and evaluated with new model: {unique_model_name}")
+
+        except Exception as e:
+            self.log_info("Error occurred during test", str(e))
+            raise
+
+        finally:
+            # Cleanup
+            import os
+            if os.path.exists(dataset_path):
+                os.remove(dataset_path)
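
For context, here is a minimal sketch of how the Detect decorator exercised by these tests is presumably wired, based only on the config dictionaries, values_returned lists, and assertions visible in this diff. The api_key argument and the exact order of the returned tuple are assumptions for illustration, not confirmed by this commit.

    from aimon import Detect

    # Assumed constructor arguments; only values_returned and config appear in the diff.
    detect = Detect(
        values_returned=["context", "generated_text", "user_query", "instructions"],
        config={
            'hallucination': {'detector_name': 'default'},
            'instruction_adherence': {'detector_name': 'default'},
            'toxicity': {'detector_name': 'default'}
        },
        api_key="YOUR_AIMON_API_KEY"  # assumed parameter
    )

    @detect
    def generate_response(context, query, instructions):
        generated_text = "The text says AI systems should be developed responsibly with proper oversight."
        return context, generated_text, query, instructions

    context = "AI systems should be developed responsibly with proper oversight."
    query = "What does the text say about AI?"
    instructions = ["Provide a concise response with at most two sentences."]  # a list, per this commit

    # Assumption: the decorator appends a DetectResult to the wrapped function's return tuple.
    context, generated_text, query, instructions, result = generate_response(context, query, instructions)
    assert result.status == 200
    assert "score" in result.detect_response.hallucination
    assert "instructions_list" in result.detect_response.instruction_adherence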