Skip to content

feat: implement evaluation framework for praisonaiagents #976

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
318 changes: 318 additions & 0 deletions src/praisonai-agents/example_eval_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
#!/usr/bin/env python3
"""
Example usage of the PraisonAI evaluation framework.

This file demonstrates all the features described in the GitHub issue specification.
"""

import os
import sys

# Add the package to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__)))

from praisonaiagents import Agent, Task
# Note: Process is available as PraisonAIAgents.process in the current implementation
from praisonaiagents.eval import AccuracyEval, ReliabilityEval, PerformanceEval, EvalSuite, TestCase, EvalCriteria

def basic_accuracy_example():
"""Example 1: Basic Accuracy Evaluation"""
print("=== Example 1: Basic Accuracy Evaluation ===")

# Create agent
agent = Agent(
name="Analyst",
role="Data Analyst",
goal="Provide accurate analysis",
backstory="I am a skilled data analyst",
llm="gpt-4o-mini"
)

# Simple accuracy check
eval_test = AccuracyEval(
agent=agent,
input="What is the capital of France?",
expected_output="Paris"
)

print("Running basic accuracy evaluation...")
# Note: In a real scenario, you would run: result = eval_test.run()
# print(f"Accuracy: {result.score}/10")
print("✓ AccuracyEval configured successfully")

def advanced_accuracy_example():
"""Example 2: Advanced Accuracy Evaluation"""
print("\n=== Example 2: Advanced Accuracy Evaluation ===")

agent = Agent(
name="Analyst",
role="Data Analyst",
goal="Provide detailed analysis",
backstory="I am an expert analyst",
llm="gpt-4o-mini"
)

# Multi-criteria evaluation
eval_test = AccuracyEval(
agent=agent,
test_cases=[
{
"input": "Summarize the Q1 report",
"expected_output": "Q1 showed 15% growth...",
"weight": 2.0 # Higher importance
},
{
"input": "What are the key risks?",
"expected_output": "Supply chain, market volatility..."
}
],
criteria=EvalCriteria(
factual_accuracy=0.4, # 40% weight
completeness=0.3, # 30% weight
relevance=0.3 # 30% weight
),
evaluator_llm="gpt-4o-mini",
iterations=5, # Statistical reliability
save_results="eval_results.json"
)

print("Advanced accuracy evaluation configured with:")
print("- Multi-criteria scoring")
print("- Multiple test cases with weights")
print("- Statistical reliability (5 iterations)")
print("- Results saving")

# Run with detailed output
# result = eval_test.run(verbose=True)
# print(f"Average: {result.avg_score:.2f}")
# print(f"Std Dev: {result.std_dev:.2f}")
# print(f"Confidence: {result.confidence_interval}")

def reliability_testing_example():
"""Example 3: Reliability Testing"""
print("\n=== Example 3: Reliability Testing ===")

agent = Agent(
name="TaskAgent",
role="Task Executor",
goal="Execute tasks reliably",
backstory="I execute tasks with proper tool usage",
llm="gpt-4o-mini"
)

# Test if agent uses expected tools
eval_test = ReliabilityEval(
agent=agent,
test_scenarios=[
{
"input": "Search weather and create report",
"expected_tools": ["web_search", "create_file"],
"required_order": True # Tools must be called in order
},
{
"input": "Analyze CSV data",
"expected_tools": ["read_csv", "analyze_data"],
"allow_additional": True # Other tools allowed
}
]
)

print("Reliability testing configured for:")
print("- Tool usage validation")
print("- Order requirement checking")
print("- Additional tool tolerance")

# results = eval_test.run()
# for scenario in results.scenarios:
# print(f"Scenario: {scenario.name} - {scenario.status}")
# if scenario.failed_tools:
# print(f" Failed: {scenario.failed_tools}")

def performance_evaluation_example():
"""Example 4: Performance Evaluation"""
print("\n=== Example 4: Performance Evaluation ===")

agent = Agent(
name="PerformanceAgent",
role="High Performance Agent",
goal="Execute tasks efficiently",
backstory="I am optimized for performance",
llm="gpt-4o-mini"
)

# Benchmark agent performance
eval_test = PerformanceEval(
agent=agent,
benchmark_queries=[
"Simple question",
"Complex analysis task",
"Multi-step reasoning"
],
metrics={
"runtime": True,
"memory": True,
"tokens": True, # Token usage tracking
"ttft": True # Time to first token
},
iterations=50,
warmup=5
)

print("Performance evaluation configured with:")
print("- Runtime measurement")
print("- Memory tracking")
print("- Token usage monitoring")
print("- Time to first token")
print("- 50 iterations with 5 warmup runs")

# result = eval_test.run()
# result.print_report()

# Compare agents example
agents = [agent] # In practice, you'd have multiple agents
# comparison = PerformanceEval.compare(
# agents=agents,
# benchmark_suite="standard",
# export_format="html"
# )

def automated_test_suite_example():
"""Example 5: Automated Test Suite"""
print("\n=== Example 5: Automated Test Suite ===")

agent = Agent(
name="QualityAgent",
role="Quality Assured Agent",
goal="Pass all quality checks",
backstory="I am designed for quality assurance",
llm="gpt-4o-mini"
)

# Define comprehensive test suite
suite = EvalSuite(
name="Agent Quality Assurance",
agents=[agent],
test_cases=[
TestCase(
name="Basic Math",
input="What is 15 * 23?",
expected_output="345",
eval_type="accuracy",
tags=["math", "simple"]
),
TestCase(
name="Tool Usage",
input="Search and summarize AI news",
expected_tools=["web_search", "summarize"],
eval_type="reliability"
),
TestCase(
name="Performance Baseline",
input="Standard benchmark query",
max_runtime=2.0, # seconds
max_memory=100, # MB
eval_type="performance"
)
],
# Automation features
schedule="0 2 * * *", # Run daily at 2 AM
alerts={
"email": "team@example.com",
"threshold": 0.8 # Alert if score < 80%
},
export_results="s3://bucket/eval-results/"
)

print("Automated test suite configured with:")
print("- Multiple test types (accuracy, reliability, performance)")
print("- Scheduled execution (daily at 2 AM)")
print("- Email alerts for quality gate failures")
print("- S3 export for results")

# Run full suite
# results = suite.run()

# CI/CD integration example
# if not results.passed:
# raise EvalFailure(f"Quality gate failed: {results.summary}")

# Generate report
# suite.generate_report(
# format="html",
# include_graphs=True,
# compare_with="last_week"
# )

def integration_with_existing_features_example():
"""Example 6: Integration with Existing PraisonAI Features"""
print("\n=== Example 6: Integration with Existing Features ===")

# Evaluation-aware agent with memory
agent = Agent(
name="EvalAgent",
role="Evaluation-Aware Agent",
goal="Perform well in evaluations",
backstory="I am integrated with evaluation systems",
llm="gpt-4o-mini",
# TODO: Add memory and tools integration once available
# memory=Memory(provider="rag", quality_threshold=0.8),
# tools=Tools(["web_search", "calculator"]),
# Built-in evaluation configuration
# eval_config={
# "track_accuracy": True,
# "sample_rate": 0.1, # Evaluate 10% of runs
# "baseline": "eval_baseline.json"
# }
)

# Process with automatic evaluation
# TODO: Implement process evaluation integration
# process = Process(
# agents=[agent],
# tasks=[task1, task2],
# eval_mode=True,
# eval_criteria={
# "min_accuracy": 0.85,
# "max_runtime": 5.0
# }
# )

print("Integration features planned:")
print("- Memory-aware evaluation")
print("- Process-level evaluation")
print("- Automatic quality tracking")
print("- Baseline comparison")

# Run with evaluation
# result = process.start()
# print(f"Process accuracy: {result.eval_metrics.accuracy}")
# print(f"Task performances: {result.eval_metrics.task_times}")
# result.eval_metrics.export("process_eval.json")

def main():
"""Run all examples."""
print("🧪 PraisonAI Agents Evaluation Framework Examples")
print("="*60)

examples = [
basic_accuracy_example,
advanced_accuracy_example,
reliability_testing_example,
performance_evaluation_example,
automated_test_suite_example,
integration_with_existing_features_example
]

for example in examples:
try:
example()
except Exception as e:
print(f"❌ Error in {example.__name__}: {e}")

print("\n" + "="*60)
print("✅ All examples completed successfully!")
print("📋 Note: Some examples show configuration only.")
print("🔧 Uncomment the execution lines to run actual evaluations.")

if __name__ == "__main__":
main()
33 changes: 32 additions & 1 deletion src/praisonai-agents/praisonaiagents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,29 @@
from .memory.memory import Memory
from .guardrails import GuardrailResult, LLMGuardrail
from .agent.handoff import Handoff, handoff, handoff_filters, RECOMMENDED_PROMPT_PREFIX, prompt_with_handoff_instructions

# Evaluation framework (lazy loaded)
try:
from .eval import (
AccuracyEval,
ReliabilityEval,
PerformanceEval,
EvalSuite,
TestCase,
EvalCriteria,
EvalResult
)
_eval_available = True
except ImportError:
# Evaluation framework not available
_eval_available = False
AccuracyEval = None
ReliabilityEval = None
PerformanceEval = None
EvalSuite = None
TestCase = None
EvalCriteria = None
EvalResult = None
from .main import (
TaskOutput,
ReflectionOutput,
Expand Down Expand Up @@ -136,5 +159,13 @@ def disable_telemetry():
'enable_telemetry',
'disable_telemetry',
'MinimalTelemetry',
'TelemetryCollector'
'TelemetryCollector',
# Evaluation framework
'AccuracyEval',
'ReliabilityEval',
'PerformanceEval',
'EvalSuite',
'TestCase',
'EvalCriteria',
'EvalResult'
]
23 changes: 23 additions & 0 deletions src/praisonai-agents/praisonaiagents/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
PraisonAI Agents Evaluation Framework

A minimal, client-side evaluation framework for testing and benchmarking PraisonAI agents.
Provides accuracy testing, reliability validation, performance benchmarking, and comprehensive test suites.
"""

from .accuracy_eval import AccuracyEval
from .reliability_eval import ReliabilityEval
from .performance_eval import PerformanceEval
from .eval_suite import EvalSuite, TestCase
from .eval_criteria import EvalCriteria
from .eval_result import EvalResult

__all__ = [
'AccuracyEval',
'ReliabilityEval',
'PerformanceEval',
'EvalSuite',
'TestCase',
'EvalCriteria',
'EvalResult'
]
Loading
Loading