diff --git a/docs/AI_PROVIDERS.md b/docs/AI_PROVIDERS.md
new file mode 100644
index 0000000..01a4feb
--- /dev/null
+++ b/docs/AI_PROVIDERS.md
@@ -0,0 +1,171 @@
+# AI Provider Configuration
+
+This document explains how to configure different AI providers for the GUM system.
+
+## Overview
+
+The GUM system uses a unified AI client that supports multiple providers for different tasks:
+
+- **Text Completion**: Azure OpenAI (default) or OpenAI
+- **Vision Completion**: OpenRouter (default)
+
+## Provider Configuration
+
+### Text Providers
+
+#### Azure OpenAI (Default)
+```bash
+# Required environment variables
+export AZURE_OPENAI_API_KEY="your-azure-api-key"
+export AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com/"
+export AZURE_OPENAI_API_VERSION="2024-02-15-preview"
+export AZURE_OPENAI_DEPLOYMENT="gpt-4o"  # Optional, defaults to gpt-4o
+
+# Optional: Explicitly set text provider (defaults to azure)
+export TEXT_PROVIDER="azure"
+```
+
+#### OpenAI
+```bash
+# Required environment variables
+export OPENAI_API_KEY="your-openai-api-key"
+
+# Optional environment variables
+export OPENAI_MODEL="gpt-4o"  # Optional, defaults to gpt-4o
+export OPENAI_API_BASE="https://api.openai.com/v1"  # Optional, uses default
+export OPENAI_ORGANIZATION="your-org-id"  # Optional
+
+# Set text provider to OpenAI
+export TEXT_PROVIDER="openai"
+```
+
+### Vision Providers
+
+#### OpenRouter (Default)
+```bash
+# Required environment variables
+export OPENROUTER_API_KEY="your-openrouter-api-key"
+
+# Optional environment variables
+export OPENROUTER_MODEL="qwen/qwen-2.5-vl-72b-instruct:free"  # Optional, uses default
+
+# Optional: Explicitly set vision provider (defaults to openrouter)
+export VISION_PROVIDER="openrouter"
+```
+
+## Usage Examples
+
+### Using Azure OpenAI for Text (Default)
+```python
+import asyncio
+from gum import gum
+from gum.observers import Observer
+
+async def main():
+    # No special configuration needed - Azure is the default
+    async with gum("username", "model") as g:
+        # Your GUM code here
+        pass
+
+asyncio.run(main())
+```
+
+### Using OpenAI for Text
+```python
+import asyncio
+import os
+from gum import gum
+from gum.observers import Observer
+
+async def main():
+    # Set OpenAI as text provider
+    os.environ["TEXT_PROVIDER"] = "openai"
+
+    async with gum("username", "model") as g:
+        # Your GUM code here
+        pass
+
+asyncio.run(main())
+```
+
+### Testing Different Providers
+
+#### Test OpenAI Client
+```bash
+python test_openai_client.py
+```
+
+#### Test Unified Client with Different Providers
+```bash
+# Test with Azure OpenAI (default)
+python -c "import asyncio; from gum.unified_ai_client import test_unified_client; asyncio.run(test_unified_client())"
+
+# Test with OpenAI
+TEXT_PROVIDER=openai python -c "import asyncio; from gum.unified_ai_client import test_unified_client; asyncio.run(test_unified_client())"
+```
+
+## Provider Features
+
+| Provider | Text Completion | Vision Completion | Notes |
+|----------|-----------------|-------------------|-------|
+| Azure OpenAI | ✅ | ❌ | Enterprise-grade, requires Azure subscription |
+| OpenAI | ✅ | ❌ | Direct OpenAI API, requires OpenAI account |
+| OpenRouter | ❌ | ✅ | Multiple vision models, cost-effective |
+
+## Error Handling
+
+The unified client includes automatic retry logic with exponential backoff for transient errors.
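+
+Each retry waits roughly `base_delay * backoff_factor ** attempt` seconds, capped at `max_delay`, with a small random jitter added on top. A minimal sketch of the resulting schedule, using the client defaults (the configurable parameters are shown in the constructor below):
+
+```python
+base_delay, backoff_factor, max_delay = 1.0, 2.0, 60.0  # client defaults
+for attempt in range(4):
+    print(min(base_delay * backoff_factor ** attempt, max_delay))
+# -> 1.0, 2.0, 4.0, 8.0
+```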
+You can configure retry behavior:
+
+```python
+from gum.unified_ai_client import UnifiedAIClient
+
+client = UnifiedAIClient(
+    max_retries=5,      # Maximum retry attempts
+    base_delay=2.0,     # Base delay in seconds
+    max_delay=120.0,    # Maximum delay between retries
+    backoff_factor=2.0, # Exponential backoff multiplier
+    jitter_factor=0.1   # Random jitter to prevent thundering herd
+)
+```
+
+## Environment Variables Reference
+
+### Azure OpenAI
+- `AZURE_OPENAI_API_KEY` (required)
+- `AZURE_OPENAI_ENDPOINT` (required)
+- `AZURE_OPENAI_API_VERSION` (required)
+- `AZURE_OPENAI_DEPLOYMENT` (optional, defaults to "gpt-4o")
+
+### OpenAI
+- `OPENAI_API_KEY` (required)
+- `OPENAI_MODEL` (optional, defaults to "gpt-4o")
+- `OPENAI_API_BASE` (optional, defaults to "https://api.openai.com/v1")
+- `OPENAI_ORGANIZATION` (optional)
+
+### OpenRouter
+- `OPENROUTER_API_KEY` (required)
+- `OPENROUTER_MODEL` (optional, defaults to "qwen/qwen-2.5-vl-72b-instruct:free")
+
+### Provider Selection
+- `TEXT_PROVIDER` (optional, "azure" or "openai", defaults to "azure")
+- `VISION_PROVIDER` (optional, "openrouter", defaults to "openrouter")
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Missing API Keys**: Ensure all required environment variables are set
+2. **Network Issues**: Check firewall/proxy settings
+3. **Rate Limits**: The client includes automatic retry with backoff
+4. **Model Availability**: Verify the model name is correct for your provider
+
+### Debug Logging
+
+Enable debug logging to troubleshoot issues:
+
+```python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+```
+
+This will show detailed HTTP requests and responses for debugging.
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 9dec04d..194f2b9 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -3,4 +3,41 @@ mkdocs-material>=9.0.0
 mkdocstrings>=0.24.0
 mkdocstrings-python>=1.7.0
 mistune==3.0.2
-# pymdown-extensions>=10.0.0
\ No newline at end of file
+# pymdown-extensions>=10.0.0
+
+# Core dependencies for GUM (General User Models)
+# Image processing and screen capture
+pillow
+mss
+pynput
+shapely
+
+# macOS window management (conditionally installed)
+pyobjc-framework-Quartz; sys_platform == "darwin"
+
+# AI and OpenAI clients
+openai>=1.0.0
+
+# Database and ORM
+SQLAlchemy>=2.0.0
+aiosqlite
+greenlet
+
+# Data validation and serialization
+pydantic>=2.0.0
+
+# Environment and configuration
+python-dotenv>=1.0.0
+
+# Machine learning and data processing
+scikit-learn
+numpy
+
+# Date/time utilities
+python-dateutil
+
+# Development and building tools (optional)
+setuptools>=42
+wheel
+build
+twine
diff --git a/gum/azure_text_client.py b/gum/azure_text_client.py
new file mode 100644
index 0000000..2baaf9e
--- /dev/null
+++ b/gum/azure_text_client.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""
+Azure OpenAI Text Completion Utility
+
+This utility handles text completions using the official Azure OpenAI Python SDK
+with proper error handling and logging.
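+
+Example (a sketch; assumes the AZURE_OPENAI_* environment variables are set):
+
+    import asyncio
+    from gum.azure_text_client import azure_text_completion
+
+    async def main():
+        reply = await azure_text_completion(
+            messages=[{"role": "user", "content": "Hello"}]
+        )
+        print(reply)
+
+    asyncio.run(main())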
+""" + +import asyncio +import os +import logging +from typing import List, Dict, Any +from dotenv import load_dotenv +from openai import AsyncAzureOpenAI + +# Load environment variables at module level, override existing ones +load_dotenv(override=True) + +# Set up logging with debug level for httpx +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Enable httpx debug logging to see exact HTTP requests +httpx_logger = logging.getLogger("httpx") +httpx_logger.setLevel(logging.DEBUG) +httpx_handler = logging.StreamHandler() +httpx_handler.setFormatter(logging.Formatter("HTTPX: %(message)s")) +httpx_logger.addHandler(httpx_handler) + + +class AzureOpenAITextClient: + """Azure OpenAI client for text completions using the official Azure OpenAI SDK.""" + + def __init__(self): + self.api_key = os.getenv("AZURE_OPENAI_API_KEY") + self.endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + self.api_version = os.getenv("AZURE_OPENAI_API_VERSION") + self.deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o") + + logger.info("Azure OpenAI Environment Debug:") + logger.info(f" API Key: {self.api_key[:10] + '...' + self.api_key[-4:] if self.api_key else 'None'}") + logger.info(f" Endpoint: {self.endpoint}") + logger.info(f" API Version: {self.api_version}") + logger.info(f" Deployment: {self.deployment}") + + if not all([self.api_key, self.endpoint, self.api_version]): + raise ValueError("Azure OpenAI configuration incomplete. Check environment variables.") + + # Initialize the Azure OpenAI client + self.client = AsyncAzureOpenAI( + api_key=self.api_key, + azure_endpoint=self.endpoint, # type: ignore + api_version=self.api_version + ) + + logger.info("Azure OpenAI Text Client initialized") + logger.info(f" Endpoint: {self.endpoint}") + logger.info(f" Deployment: {self.deployment}") + logger.info(f" API Version: {self.api_version}") + + async def chat_completion( + self, + messages: List[Dict[str, Any]], + max_tokens: int = 1000, + temperature: float = 0.1 + ) -> str: + """ + Send a chat completion request to Azure OpenAI. 
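+
+        The request targets the configured Azure *deployment* (passed as the
+        ``model`` argument) rather than a raw model name.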
+ + Args: + messages: List of message dictionaries + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + + logger.info("Azure OpenAI text completion request") + logger.info(f" Deployment: {self.deployment}") + logger.info(f" Messages: {len(messages)} message(s)") + logger.info(f" Max tokens: {max_tokens}") + + try: + response = await self.client.chat.completions.create( + model=self.deployment, # Use deployment name as model + messages=messages, # type: ignore + max_tokens=max_tokens, + temperature=temperature + ) + + content = response.choices[0].message.content + + if content: + logger.info("Azure OpenAI success") + logger.info(f" Response length: {len(content)} characters") + return content + else: + error_msg = "Azure OpenAI returned empty response" + logger.error(f"Error: {error_msg}") + raise ValueError(error_msg) + + except Exception as e: + error_msg = f"Azure OpenAI request failed: {str(e)}" + logger.error(f"Error: {error_msg}") + raise + + +# Global client instance +_azure_client = None + +async def get_azure_text_client() -> AzureOpenAITextClient: + """Get the global Azure OpenAI text client instance.""" + global _azure_client + if _azure_client is None: + _azure_client = AzureOpenAITextClient() + return _azure_client + + +async def azure_text_completion( + messages: List[Dict[str, Any]], + max_tokens: int = 1000, + temperature: float = 0.1 +) -> str: + """ + Convenience function for Azure OpenAI text completion. + + Args: + messages: List of message dictionaries + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + client = await get_azure_text_client() + return await client.chat_completion(messages, max_tokens, temperature) + + +async def test_azure_text_client(): + """Test the Azure OpenAI text client.""" + + print("Testing Azure OpenAI Text Client...") + + test_messages = [ + {"role": "user", "content": "Hello! Please respond with exactly 'Azure OpenAI text working correctly'."} + ] + + try: + response = await azure_text_completion( + messages=test_messages, + max_tokens=20, + temperature=0.0 + ) + print(f"Azure OpenAI Text Success: {response}") + return True + except Exception as e: + print(f"Azure OpenAI Text Failed: {e}") + return False + + +if __name__ == "__main__": + success = asyncio.run(test_azure_text_client()) + if success: + print("Azure OpenAI text client is working!") + else: + print("Azure OpenAI text client has issues.") \ No newline at end of file diff --git a/gum/gum.py b/gum/gum.py index b4ef53a..2d8b69c 100644 --- a/gum/gum.py +++ b/gum/gum.py @@ -13,7 +13,6 @@ from .models import observation_proposition import traceback -from openai import AsyncOpenAI from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy import insert @@ -28,7 +27,6 @@ PropositionSchema, RelationSchema, Update, - get_schema, AuditSchema ) from gum.prompts.gum import AUDIT_PROMPT, PROPOSE_PROMPT, REVISE_PROMPT, SIMILAR_PROMPT @@ -41,6 +39,10 @@ class gum: propositions about user behavior, and maintaining relationships between observations and propositions. + The system uses a unified AI client that supports multiple providers: + - Text completion: Azure OpenAI (default) or OpenAI (set TEXT_PROVIDER=openai) + - Vision completion: OpenRouter (default) + Args: user_name (str): The name of the user being modeled. *observers (Observer): Variable number of observer instances to track user behavior. 
@@ -53,6 +55,8 @@ class gum: verbosity (int, optional): Logging verbosity level. Defaults to logging.INFO. audit_enabled (bool, optional): Whether to enable auditing. Defaults to False. + api_base (str, optional): Deprecated, use environment variables instead. + api_key (str, optional): Deprecated, use environment variables instead. """ def __init__( @@ -95,16 +99,14 @@ def __init__( h.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) self.logger.addHandler(h) - # prompts + # prompts - use default prompts from gum.py, or load from files if custom methods are added self.propose_prompt = propose_prompt or PROPOSE_PROMPT self.similar_prompt = similar_prompt or SIMILAR_PROMPT self.revise_prompt = revise_prompt or REVISE_PROMPT self.audit_prompt = audit_prompt or AUDIT_PROMPT - self.client = AsyncOpenAI( - base_url=api_base or os.getenv("GUM_LM_API_BASE"), - api_key=api_key or os.getenv("GUM_LM_API_KEY") or os.getenv("OPENAI_API_KEY") or "None" - ) + # Initialize unified AI client (supports Azure OpenAI, OpenAI, and OpenRouter) + self.ai_client = None # Will be initialized lazily self.engine = None self.Session = None @@ -123,6 +125,146 @@ def __init__( self._batch_processing_lock = asyncio.Lock() self.update_handlers: list[Callable[[Observer, Update], None]] = [] + async def _get_ai_client(self): + """Get the unified AI client, initializing it if needed.""" + if self.ai_client is None: + # Import here to avoid circular imports + from gum.unified_ai_client import get_unified_client + self.ai_client = await get_unified_client() + self.logger.info("Unified AI client initialized for GUM") + return self.ai_client + + def _parse_ai_json_response(self, response_content: str, expected_key: str = None): + """Parse JSON from AI response content with fallback handling.""" + + # Add detailed logging for debugging + self.logger.info(f"JSON Parsing Debug - Input Analysis:") + self.logger.info(f" Response length: {len(response_content)} characters") + self.logger.info(f" Expected key: {expected_key}") + self.logger.info(f" Raw response (first 300 chars): {response_content[:300]}...") + self.logger.info(f" Raw response (last 100 chars): ...{response_content[-100:]}") + + try: + # Try to parse the response as JSON directly + self.logger.info("Attempting direct JSON parsing...") + parsed_response = json.loads(response_content) + self.logger.info(f"Direct JSON parsing successful!") + self.logger.info(f" Parsed keys: {list(parsed_response.keys()) if isinstance(parsed_response, dict) else 'Not a dict'}") + + if expected_key and expected_key in parsed_response: + self.logger.info(f"Found expected key '{expected_key}' in response") + return parsed_response[expected_key] + return parsed_response + + except json.JSONDecodeError as e: + self.logger.warning(f"Direct JSON parsing failed: {e}") + + # Try to extract JSON from markdown code blocks first + import re + + self.logger.info("Attempting markdown JSON extraction...") + # Look for JSON in markdown code blocks - handle both complete and truncated blocks + markdown_match = re.search(r'```(?:json)?\s*(.*?)(?:```|\Z)', response_content, re.DOTALL) + if markdown_match: + json_content = markdown_match.group(1).strip() + self.logger.info(f"Extracted JSON from markdown (length: {len(json_content)}):") + self.logger.info(f" Extracted content: {json_content}") + + # Try to parse the extracted content as-is first + try: + parsed_response = json.loads(json_content) + self.logger.info(f"Markdown JSON parsing successful!") + self.logger.info(f" Parsed 
keys: {list(parsed_response.keys()) if isinstance(parsed_response, dict) else 'Not a dict'}") + + if expected_key and expected_key in parsed_response: + self.logger.info(f"Found expected key '{expected_key}' in markdown JSON") + return parsed_response[expected_key] + return parsed_response + + except json.JSONDecodeError as e: + self.logger.warning(f"Failed to parse extracted JSON as-is: {e}") + + # Try to repair truncated JSON by finding the last complete object + self.logger.info("Attempting to repair truncated JSON...") + + # Find the last complete JSON object/array + brace_count = 0 + bracket_count = 0 + last_complete_pos = -1 + + for i, char in enumerate(json_content): + if char == '{': + brace_count += 1 + elif char == '}': + brace_count -= 1 + elif char == '[': + bracket_count += 1 + elif char == ']': + bracket_count -= 1 + + # Check if we have a complete structure + if brace_count == 0 and bracket_count == 0 and i > 0: + last_complete_pos = i + 1 + + if last_complete_pos > 0: + repaired_json = json_content[:last_complete_pos] + self.logger.info(f"Attempting repair with content up to position {last_complete_pos}") + self.logger.info(f" Repaired JSON: {repaired_json}") + + try: + parsed_response = json.loads(repaired_json) + self.logger.info(f"Repaired JSON parsing successful!") + self.logger.info(f" Parsed keys: {list(parsed_response.keys()) if isinstance(parsed_response, dict) else 'Not a dict'}") + + if expected_key and expected_key in parsed_response: + self.logger.info(f"Found expected key '{expected_key}' in repaired JSON") + return parsed_response[expected_key] + return parsed_response + + except json.JSONDecodeError as e2: + self.logger.error(f"Repair attempt also failed: {e2}") + else: + self.logger.warning("Could not find a complete JSON structure to repair") + + else: + self.logger.warning("No markdown code blocks found") + + # Fallback: look for any JSON-like structure + self.logger.info("Attempting fallback JSON pattern matching...") + json_match = re.search(r'\{.*\}', response_content, re.DOTALL) + if json_match: + json_candidate = json_match.group(0) + self.logger.info(f"Found JSON-like pattern (length: {len(json_candidate)}):") + self.logger.info(f" JSON candidate: {json_candidate}") + + try: + parsed_response = json.loads(json_candidate) + self.logger.info(f"Fallback JSON parsing successful!") + self.logger.info(f" Parsed keys: {list(parsed_response.keys()) if isinstance(parsed_response, dict) else 'Not a dict'}") + + if expected_key and expected_key in parsed_response: + self.logger.info(f"Found expected key '{expected_key}' in fallback JSON") + return parsed_response[expected_key] + return parsed_response + + except json.JSONDecodeError as e: + self.logger.error(f"Fallback JSON parsing also failed: {e}") + self.logger.error(f" Problematic content: {json_candidate}") + else: + self.logger.warning("No JSON-like patterns found") + + # If JSON parsing fails, log the response and return appropriate default + self.logger.error(f"ALL JSON parsing methods failed!") + self.logger.error(f" Full response content:") + self.logger.error(f" {response_content}") + + if expected_key == "propositions": + self.logger.info("Returning empty list for propositions") + return [] + else: + self.logger.info("Returning empty dict") + return {} + def start_update_loop(self): """Start the asynchronous update loop for processing observer updates.""" if self._loop_task is None: @@ -295,14 +437,18 @@ async def _construct_propositions(self, update: Update) -> list[PropositionItem] 
.replace("{inputs}", update.content) ) - schema = PropositionSchema.model_json_schema() - rsp = await self.client.chat.completions.create( - model=self.model, + # Get the unified AI client + client = await self._get_ai_client() + + # Make the API call using the unified client + response_content = await client.text_completion( messages=[{"role": "user", "content": prompt}], - response_format=get_schema(schema), + max_tokens=2000, + temperature=0.1 ) - return json.loads(rsp.choices[0].message.content)["propositions"] + # Parse the JSON response + return self._parse_ai_json_response(response_content, "propositions") async def _build_relation_prompt(self, all_props) -> str: """Build a prompt for analyzing relationships between propositions. @@ -341,13 +487,24 @@ async def _filter_propositions( ] prompt_text = await self._build_relation_prompt(payload) - rsp = await self.client.chat.completions.create( - model=self.model, + # Get the unified AI client + client = await self._get_ai_client() + + # Make the API call using the unified client + response_content = await client.text_completion( messages=[{"role": "user", "content": prompt_text}], - response_format=get_schema(RelationSchema.model_json_schema()), + max_tokens=2000, + temperature=0.1 ) - data = RelationSchema.model_validate_json(rsp.choices[0].message.content) + # Parse the JSON response and validate + try: + relations_data = self._parse_ai_json_response(response_content, "relations") + data = RelationSchema.model_validate({"relations": relations_data}) + except Exception as e: + self.logger.error(f"Failed to parse relation data: {e}") + # Return empty groups if parsing fails + return [], [], [] id_to_prop = {p.id: p for p in rel_props} ident, sim, unrel = set(), set(), set() @@ -410,13 +567,20 @@ async def _revise_propositions( list[dict]: List of revised propositions. 
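+
+            Falls back to an empty list when the model response cannot be
+            parsed as JSON (see ``_parse_ai_json_response``).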
""" body = await self._build_revision_body(similar_cluster, related_obs) - prompt = self.revise_prompt.replace("{body}", body) - rsp = await self.client.chat.completions.create( - model=self.model, + prompt = self.revise_prompt.replace("{body}", body).replace("{user_name}", self.user_name) + + # Get the unified AI client + client = await self._get_ai_client() + + # Make the API call using the unified client + response_content = await client.text_completion( messages=[{"role": "user", "content": prompt}], - response_format=get_schema(PropositionSchema.model_json_schema()), + max_tokens=2000, + temperature=0.1 ) - return json.loads(rsp.choices[0].message.content)["propositions"] + + # Parse the JSON response + return self._parse_ai_json_response(response_content, "propositions") async def _generate_and_search( self, session: AsyncSession, update: Update @@ -550,19 +714,29 @@ async def _handle_audit(self, obs: Observation) -> bool: .replace("{user_name}", self.user_name) ) - rsp = await self.client.chat.completions.create( - model=self.model, + # Get the unified AI client + client = await self._get_ai_client() + + # Make the API call using the unified client + response_content = await client.text_completion( messages=[{"role": "user", "content": prompt}], - response_format=get_schema(AuditSchema.model_json_schema()), - temperature=0.0, + max_tokens=1000, + temperature=0.0 ) - decision = json.loads(rsp.choices[0].message.content) - if not decision["transmit_data"]: + # Parse the JSON response + decision = self._parse_ai_json_response(response_content) + + # Safely handle the decision with fallbacks + transmit_data = decision.get("transmit_data", True) if isinstance(decision, dict) else True + data_type = decision.get("data_type", "Unknown") if isinstance(decision, dict) else "Unknown" + subject = decision.get("subject", "Unknown") if isinstance(decision, dict) else "Unknown" + + if not transmit_data: self.logger.warning( "Audit blocked transmission (data_type=%s, subject=%s)", - decision["data_type"], - decision["subject"], + data_type, + subject, ) return True diff --git a/gum/observers/screen.py b/gum/observers/screen.py index 726a449..be2b64d 100644 --- a/gum/observers/screen.py +++ b/gum/observers/screen.py @@ -7,6 +7,7 @@ import base64 import logging import os +import sys import time from collections import deque from typing import Any, Dict, Iterable, List, Optional @@ -15,7 +16,15 @@ # — Third-party — import mss -import Quartz +# Conditional import for macOS-specific Quartz module +try: + if sys.platform == "darwin": + import Quartz + else: + Quartz = None +except ImportError: + Quartz = None + from PIL import Image from pynput import mouse # still synchronous from shapely.geometry import box @@ -25,9 +34,6 @@ from .observer import Observer from ..schemas import Update -# — OpenAI async client — -from openai import AsyncOpenAI - # — Local — from gum.prompts.screen import TRANSCRIPTION_PROMPT, SUMMARY_PROMPT @@ -43,6 +49,19 @@ def _get_global_bounds() -> tuple[float, float, float, float]: ------- (min_x, min_y, max_x, max_y) tuple in Quartz global coordinates. 
""" + if Quartz is None: + # Fallback for non-macOS systems - use mss to get monitor bounds + with mss.mss() as sct: + monitors = sct.monitors[1:] # Skip the "all monitors" entry + if not monitors: + return 0, 0, 1920, 1080 # Default fallback + + min_x = min(mon["left"] for mon in monitors) + min_y = min(mon["top"] for mon in monitors) + max_x = max(mon["left"] + mon["width"] for mon in monitors) + max_y = max(mon["top"] + mon["height"] for mon in monitors) + return min_x, min_y, max_x, max_y + err, ids, cnt = Quartz.CGGetActiveDisplayList(16, None, None) if err != Quartz.kCGErrorSuccess: # pragma: no cover (defensive) raise OSError(f"CGGetActiveDisplayList failed: {err}") @@ -65,6 +84,11 @@ def _get_visible_windows() -> List[tuple[dict, float]]: is in ``[0.0, 1.0]``. Internal system windows (Dock, WindowServer, …) are ignored. """ + if Quartz is None: + # Fallback for non-macOS systems - return empty list + # Window management functionality is not available + return [] + _, _, _, gmax_y = _get_global_bounds() opts = ( @@ -107,6 +131,10 @@ def _get_visible_windows() -> List[tuple[dict, float]]: def _is_app_visible(names: Iterable[str]) -> bool: """Return *True* if **any** window from *names* is at least partially visible.""" + if Quartz is None: + # Fallback for non-macOS systems - assume app is visible + return True + targets = set(names) return any( info.get("kCGWindowOwnerName", "") in targets and ratio > 0 @@ -191,17 +219,21 @@ def __init__( self._history: deque[str] = deque(maxlen=max(0, history_k)) self._pending_event: Optional[dict] = None self._debounce_handle: Optional[asyncio.TimerHandle] = None - self.client = AsyncOpenAI( - # try the class, then the env for screen, then the env for gum - base_url=api_base or os.getenv("SCREEN_LM_API_BASE") or os.getenv("GUM_LM_API_BASE"), - - # try the class, then the env for screen, then the env for GUM, then none - api_key=api_key or os.getenv("SCREEN_LM_API_KEY") or os.getenv("GUM_LM_API_KEY") or os.getenv("OPENAI_API_KEY") or "None" - ) + + # Initialize unified AI client (will be set up lazily) + self.ai_client = None # call parent super().__init__() + async def _get_ai_client(self): + """Get the unified AI client, initializing it if needed.""" + if self.ai_client is None: + # Import here to avoid circular imports + from gum.unified_ai_client import get_unified_client + self.ai_client = await get_unified_client() + return self.ai_client + # ─────────────────────────────── tiny sync helpers @staticmethod def _mon_for(x: float, y: float, mons: list[dict]) -> Optional[int]: @@ -233,9 +265,9 @@ def _encode_image(img_path: str) -> str: with open(img_path, "rb") as fh: return base64.b64encode(fh.read()).decode() - # ─────────────────────────────── OpenAI Vision (async) + # ─────────────────────────────── Vision Analysis (async) async def _call_gpt_vision(self, prompt: str, img_paths: list[str]) -> str: - """Call GPT Vision API to analyze images. + """Call GPT Vision API to analyze images using unified AI client. Args: prompt (str): Prompt to guide the analysis. @@ -244,23 +276,29 @@ async def _call_gpt_vision(self, prompt: str, img_paths: list[str]) -> str: Returns: str: GPT's analysis of the images. 
""" - content = [ - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}, - } - for encoded in (await asyncio.gather( - *[asyncio.to_thread(self._encode_image, p) for p in img_paths] - )) - ] - content.append({"type": "text", "text": prompt}) - - rsp = await self.client.chat.completions.create( - model=self.model_name, - messages=[{"role": "user", "content": content}], - response_format={"type": "text"}, + # Encode images to base64 + encoded_images = await asyncio.gather( + *[asyncio.to_thread(self._encode_image, p) for p in img_paths] + ) + + # Use the first image for the unified client (it expects single image) + # For multiple images, we'll concatenate them or use the most recent + if len(encoded_images) == 1: + base64_image = encoded_images[0] + else: + # For multiple images, use the last one (most recent) + # In the future, we could enhance the unified client to handle multiple images + base64_image = encoded_images[-1] + + # Get the unified AI client and use vision completion + client = await self._get_ai_client() + + return await client.vision_completion( + text_prompt=prompt, + base64_image=base64_image, + max_tokens=2000, + temperature=0.1 ) - return rsp.choices[0].message.content # ─────────────────────────────── I/O helpers async def _save_frame(self, frame, tag: str) -> str: diff --git a/gum/openai_text_client.py b/gum/openai_text_client.py new file mode 100644 index 0000000..3f17dda --- /dev/null +++ b/gum/openai_text_client.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +""" +OpenAI Text Completion Utility + +This utility handles text completions using the official OpenAI Python SDK +with proper error handling and logging. +""" + +import asyncio +import os +import logging +from typing import List, Dict, Any +from dotenv import load_dotenv +from openai import AsyncOpenAI + +# Load environment variables at module level, override existing ones +load_dotenv(override=True) + +# Set up logging with debug level for httpx +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Enable httpx debug logging to see exact HTTP requests +httpx_logger = logging.getLogger("httpx") +httpx_logger.setLevel(logging.DEBUG) +httpx_handler = logging.StreamHandler() +httpx_handler.setFormatter(logging.Formatter("HTTPX: %(message)s")) +httpx_logger.addHandler(httpx_handler) + + +class OpenAITextClient: + """OpenAI client for text completions using the official OpenAI SDK.""" + + def __init__(self): + self.api_key = os.getenv("OPENAI_API_KEY") + self.api_base = os.getenv("OPENAI_API_BASE") + self.model = os.getenv("OPENAI_MODEL", "gpt-4o") + self.organization = os.getenv("OPENAI_ORGANIZATION") + + logger.info("OpenAI Environment Debug:") + logger.info(f" API Key: {self.api_key[:10] + '...' + self.api_key[-4:] if self.api_key else 'None'}") + logger.info(f" API Base: {self.api_base or 'Default (https://api.openai.com/v1)'}") + logger.info(f" Model: {self.model}") + logger.info(f" Organization: {self.organization or 'None'}") + + if not self.api_key: + raise ValueError("OpenAI API key not found. 
Check OPENAI_API_KEY environment variable.") + + # Initialize the OpenAI client with optional parameters + client_kwargs = { + "api_key": self.api_key, + } + + if self.api_base: + client_kwargs["base_url"] = self.api_base + + if self.organization: + client_kwargs["organization"] = self.organization + + self.client = AsyncOpenAI(**client_kwargs) + + logger.info("OpenAI Text Client initialized") + logger.info(f" API Base: {self.api_base or 'Default'}") + logger.info(f" Model: {self.model}") + logger.info(f" Organization: {self.organization or 'Default'}") + + async def chat_completion( + self, + messages: List[Dict[str, Any]], + max_tokens: int = 1000, + temperature: float = 0.1 + ) -> str: + """ + Send a chat completion request to OpenAI. + + Args: + messages: List of message dictionaries + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + + logger.info("OpenAI text completion request") + logger.info(f" Model: {self.model}") + logger.info(f" Messages: {len(messages)} message(s)") + logger.info(f" Max tokens: {max_tokens}") + + try: + response = await self.client.chat.completions.create( + model=self.model, + messages=messages, # type: ignore + max_tokens=max_tokens, + temperature=temperature + ) + + content = response.choices[0].message.content + + if content: + logger.info("OpenAI success") + logger.info(f" Response length: {len(content)} characters") + return content + else: + error_msg = "OpenAI returned empty response" + logger.error(f"Error: {error_msg}") + raise ValueError(error_msg) + + except Exception as e: + error_msg = f"OpenAI request failed: {str(e)}" + logger.error(f"Error: {error_msg}") + raise + + +# Global client instance +_openai_client = None + +async def get_openai_text_client() -> OpenAITextClient: + """Get the global OpenAI text client instance.""" + global _openai_client + if _openai_client is None: + _openai_client = OpenAITextClient() + return _openai_client + + +async def openai_text_completion( + messages: List[Dict[str, Any]], + max_tokens: int = 1000, + temperature: float = 0.1 +) -> str: + """ + Convenience function for OpenAI text completion. + + Args: + messages: List of message dictionaries + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + client = await get_openai_text_client() + return await client.chat_completion(messages, max_tokens, temperature) + + +async def test_openai_text_client(): + """Test the OpenAI text client.""" + + print("Testing OpenAI Text Client...") + + test_messages = [ + {"role": "user", "content": "Hello! Please respond with exactly 'OpenAI text working correctly'."} + ] + + try: + response = await openai_text_completion( + messages=test_messages, + max_tokens=20, + temperature=0.0 + ) + print(f"OpenAI Text Success: {response}") + return True + except Exception as e: + print(f"OpenAI Text Failed: {e}") + return False + + +if __name__ == "__main__": + success = asyncio.run(test_openai_text_client()) + if success: + print("OpenAI text client is working!") + else: + print("OpenAI text client has issues.") diff --git a/gum/openrouter_vision_client.py b/gum/openrouter_vision_client.py new file mode 100644 index 0000000..babc8c9 --- /dev/null +++ b/gum/openrouter_vision_client.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +""" +OpenRouter Vision Completion Utility + +This utility handles vision completions using OpenRouter with proper error handling +and logging. 
Uses aiohttp for direct HTTP calls. +""" + +import aiohttp +import asyncio +import json +import os +import logging +from typing import List, Dict, Any +from dotenv import load_dotenv +import ssl + +# Load environment variables at module level, override existing ones +load_dotenv(override=True) + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class OpenRouterVisionClient: + """OpenRouter client for vision completions using aiohttp.""" + + def __init__(self): + self.api_key = os.getenv("OPENROUTER_API_KEY") + self.model = os.getenv("OPENROUTER_MODEL", "qwen/qwen-2.5-vl-72b-instruct:free") + self.api_url = "https://openrouter.ai/api/v1/chat/completions" + + if not self.api_key: + raise ValueError("OpenRouter API key not found. Check OPENROUTER_API_KEY environment variable.") + + logger.info("OpenRouter Vision Client initialized") + logger.info(f" Model: {self.model}") + logger.info(f" API Key: {self.api_key[:10] + '...' + self.api_key[-4:] if self.api_key else 'None'}") + + async def vision_completion( + self, + text_prompt: str, + base64_image: str, + max_tokens: int = 1000, + temperature: float = 0.1, + timeout: int = 60 + ) -> str: + """ + Send a vision completion request to OpenRouter. + + Args: + text_prompt: Text prompt for the image analysis + base64_image: Base64 encoded image data + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + timeout: Request timeout in seconds + + Returns: + The AI response content as a string + """ + + # Prepare the messages with vision content + messages = [{ + "role": "user", + "content": [ + {"type": "text", "text": text_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + } + ] + }] + + # Headers with required OpenRouter fields + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + "HTTP-Referer": "http://localhost:8000", # Required by OpenRouter + "X-Title": "GUM AI Vision Analysis" # Recommended by OpenRouter + } + + # Request payload + payload = { + "model": self.model, + "messages": messages, + "max_tokens": max_tokens, + "temperature": temperature + } + + logger.info("OpenRouter vision completion request") + logger.info(f" Model: {self.model}") + logger.info(f" Text prompt length: {len(text_prompt)} characters") + logger.info(f" Image size: {len(base64_image)} base64 characters") + logger.info(f" Max tokens: {max_tokens}") + ssl_ctx = ssl.create_default_context() + ssl_ctx.check_hostname = False + ssl_ctx.verify_mode = ssl.CERT_NONE + + connector = aiohttp.TCPConnector(ssl=ssl_ctx) + async with aiohttp.ClientSession(connector=connector) as session: + try: + async with session.post( + self.api_url, + headers=headers, + json=payload, + timeout=aiohttp.ClientTimeout(total=timeout) + ) as response: + + response_text = await response.text() + + if response.status == 200: + result = json.loads(response_text) + content = result['choices'][0]['message']['content'] + + if content: + logger.info("OpenRouter vision success") + logger.info(f" Response length: {len(content)} characters") + return content + else: + error_msg = "OpenRouter returned empty response" + logger.error(f"Error: {error_msg}") + raise ValueError(error_msg) + else: + error_msg = f"OpenRouter API error {response.status}: {response_text}" + logger.error(f"Error: {error_msg}") + raise RuntimeError(error_msg) + + except asyncio.TimeoutError: + error_msg = f"OpenRouter request timeout after {timeout}s" + 
logger.error(f"Error: {error_msg}") + raise TimeoutError(error_msg) + except Exception as e: + error_msg = f"OpenRouter request failed: {str(e)}" + logger.error(f"Error: {error_msg}") + raise + + +# Global client instance +_openrouter_client = None + +async def get_openrouter_vision_client() -> OpenRouterVisionClient: + """Get the global OpenRouter vision client instance.""" + global _openrouter_client + if _openrouter_client is None: + _openrouter_client = OpenRouterVisionClient() + return _openrouter_client + + +async def openrouter_vision_completion( + text_prompt: str, + base64_image: str, + max_tokens: int = 1000, + temperature: float = 0.1 +) -> str: + """ + Convenience function for OpenRouter vision completion. + + Args: + text_prompt: Text prompt for the image analysis + base64_image: Base64 encoded image data + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + client = await get_openrouter_vision_client() + return await client.vision_completion(text_prompt, base64_image, max_tokens, temperature) + + +async def test_openrouter_vision_client(): + """Test the OpenRouter vision client.""" + + print("Testing OpenRouter Vision Client...") + + # Create a simple test image (1x1 red pixel) + import base64 + from io import BytesIO + from PIL import Image + + # Create test image + img = Image.new('RGB', (100, 100), color='red') + buffer = BytesIO() + img.save(buffer, format='JPEG', quality=85) + test_base64_image = base64.b64encode(buffer.getvalue()).decode('utf-8') + + test_prompt = "What color is this image? Please respond with just the color name." + + try: + response = await openrouter_vision_completion( + text_prompt=test_prompt, + base64_image=test_base64_image, + max_tokens=10, + temperature=0.0 + ) + print(f"OpenRouter Vision Success: {response}") + return True + except Exception as e: + print(f"OpenRouter Vision Failed: {e}") + return False + + +if __name__ == "__main__": + success = asyncio.run(test_openrouter_vision_client()) + if success: + print("OpenRouter vision client is working!") + else: + print("OpenRouter vision client has issues.") \ No newline at end of file diff --git a/gum/unified_ai_client.py b/gum/unified_ai_client.py new file mode 100644 index 0000000..a6380e9 --- /dev/null +++ b/gum/unified_ai_client.py @@ -0,0 +1,467 @@ +#!/usr/bin/env python3 +""" +Unified AI Client Interface + +This utility provides a single interface for both text and vision AI completions, +automatically routing to the appropriate provider (Azure OpenAI for text, OpenRouter for vision). +Returns simple strings for easy integration. 
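+
+Example (a sketch; assumes the relevant provider credentials are set in the
+environment):
+
+    import asyncio
+    from gum.unified_ai_client import ai_text_completion
+
+    async def main():
+        reply = await ai_text_completion(
+            messages=[{"role": "user", "content": "Hello"}]
+        )
+        print(reply)
+
+    asyncio.run(main())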
+""" + +import asyncio +import logging +import random +import time +from typing import List, Dict, Any, Optional +from dotenv import load_dotenv + +# Import aiohttp for error handling +try: + import aiohttp +except ImportError: + aiohttp = None + +# Import our specialized clients +from gum.azure_text_client import azure_text_completion +from gum.openai_text_client import openai_text_completion +from gum.openrouter_vision_client import openrouter_vision_completion +import os + +# Load environment variables at module level +load_dotenv(override=True) + +# Set up logging +logger = logging.getLogger(__name__) + + +class UnifiedAIClient: + """Unified AI client that routes requests to appropriate providers based on modality.""" + + def __init__(self, + max_retries: int = 3, + base_delay: float = 1.0, + max_delay: float = 60.0, + backoff_factor: float = 2.0, + jitter_factor: float = 0.1): + """ + Initialize the unified AI client with retry configuration. + + Args: + max_retries: Maximum number of retry attempts + base_delay: Base delay in seconds for exponential backoff + max_delay: Maximum delay in seconds between retries + backoff_factor: Exponential backoff multiplier + jitter_factor: Random jitter factor to avoid thundering herd + """ + self.max_retries = max_retries + self.base_delay = base_delay + self.max_delay = max_delay + self.backoff_factor = backoff_factor + self.jitter_factor = jitter_factor + + # Get text provider from environment (default to azure) + self.text_provider = os.getenv("TEXT_PROVIDER", "azure").lower() + + # Get vision provider from environment (default to openrouter) + self.vision_provider = os.getenv("VISION_PROVIDER", "openrouter").lower() + + logger.info("Unified AI Client initialized") + + if self.text_provider == "azure": + logger.info(" Text: Azure OpenAI") + elif self.text_provider == "openai": + logger.info(" Text: OpenAI") + else: + logger.warning(f" Unknown text provider: {self.text_provider}, defaulting to Azure OpenAI") + self.text_provider = "azure" + logger.info(" Text: Azure OpenAI") + + if self.vision_provider == "openrouter": + logger.info(" Vision: OpenRouter") + else: + logger.warning(f" Unknown vision provider: {self.vision_provider}, defaulting to OpenRouter") + self.vision_provider = "openrouter" + logger.info(" Vision: OpenRouter") + + logger.info(f" Retry config: max_retries={max_retries}, base_delay={base_delay}s, backoff_factor={backoff_factor}") + + def _calculate_delay(self, attempt: int) -> float: + """ + Calculate exponential backoff delay with jitter. + + Args: + attempt: Current attempt number (0-based) + + Returns: + Delay in seconds + """ + delay = self.base_delay * (self.backoff_factor ** attempt) + delay = min(delay, self.max_delay) + + # Add random jitter to prevent thundering herd + if self.jitter_factor > 0: + jitter = delay * self.jitter_factor * random.random() + delay += jitter + + return delay + + def _is_retryable_error(self, error: Exception) -> bool: + """ + Determine if an error should be retried. 
+ + Args: + error: The exception that occurred + + Returns: + True if the error should be retried + """ + # Retry on specific error types + if isinstance(error, (TimeoutError, asyncio.TimeoutError)): + return True + + if isinstance(error, ValueError): + # Retry on empty response errors + error_msg = str(error).lower() + if "empty response" in error_msg or "no content" in error_msg: + return True + + if isinstance(error, RuntimeError): + # Retry on server errors (5xx) but not client errors (4xx) + error_msg = str(error).lower() + if "500" in error_msg or "502" in error_msg or "503" in error_msg or "504" in error_msg: + return True + + # Retry on connection errors + if isinstance(error, ConnectionError): + return True + + # Retry on aiohttp client errors if aiohttp is available + if aiohttp: + # Check for ClientResponseError specifically + if hasattr(aiohttp, 'ClientResponseError') and isinstance(error, aiohttp.ClientResponseError): + status_code = error.status + # Retry on 5xx server errors, timeouts, and specific connection errors + # Don't retry on 4xx client errors (400, 401, 403, etc.) + if status_code >= 500: # 5xx server errors + return True + elif status_code in [408, 429]: # Request timeout and rate limit + return True + else: + return False # Don't retry client errors (4xx) + + # Generic aiohttp ClientError (connection issues, etc.) + elif isinstance(error, aiohttp.ClientError): + return True + + # Don't retry on authentication errors, invalid requests, etc. + return False + + async def text_completion( + self, + messages: List[Dict[str, Any]], + max_tokens: int = 1000, + temperature: float = 0.1 + ) -> str: + """ + Handle text-only completion using the configured text provider. + + Args: + messages: List of message dictionaries (standard OpenAI format) + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + if self.text_provider == "openai": + logger.info("Routing to OpenAI for text completion") + return await openai_text_completion( + messages=messages, + max_tokens=max_tokens, + temperature=temperature + ) + else: # Default to Azure OpenAI + logger.info("Routing to Azure OpenAI for text completion") + return await azure_text_completion( + messages=messages, + max_tokens=max_tokens, + temperature=temperature + ) + + async def vision_completion( + self, + text_prompt: str, + base64_image: str, + max_tokens: int = 1000, + temperature: float = 0.1 + ) -> str: + """ + Handle vision completion using the configured provider with retry logic. 
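+
+        Empty responses are treated as failures and retried; after
+        ``max_retries`` unsuccessful attempts the last error is raised.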
+ + Args: + text_prompt: Text prompt for the image analysis + base64_image: Base64 encoded image data + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + logger.info("Routing to OpenRouter for vision completion") + vision_func = openrouter_vision_completion + + last_error = None + + for attempt in range(self.max_retries + 1): # +1 for initial attempt + try: + if attempt > 0: + delay = self._calculate_delay(attempt - 1) + logger.info(f"Retry attempt {attempt}/{self.max_retries} after {delay:.2f}s delay") + await asyncio.sleep(delay) + + result = await vision_func( + text_prompt=text_prompt, + base64_image=base64_image, + max_tokens=max_tokens, + temperature=temperature + ) + + # Check if result is empty (treat as failure) + if not result or result.strip() == "": + if attempt == self.max_retries: + # Final attempt failed + logger.error(f"Vision completion failed after {self.max_retries} retries: Empty response") + raise ValueError("Empty response from vision completion after all retries") + + logger.warning(f"Vision completion returned empty response on attempt {attempt + 1}") + continue # Try again + + # Success - return the result + if attempt > 0: + logger.info(f"Vision completion succeeded on retry attempt {attempt}") + + return result + + except Exception as error: + last_error = error + + if attempt == self.max_retries: + # Final attempt failed + logger.error(f"Vision completion failed after {self.max_retries} retries: {error}") + raise error + + if not self._is_retryable_error(error): + # Error is not retryable + logger.error(f"Vision completion failed with non-retryable error: {error}") + raise error + + logger.warning(f"Vision completion failed on attempt {attempt + 1}: {error}") + + # This should never be reached, but just in case + if last_error: + raise last_error + else: + raise RuntimeError("Vision completion failed with unknown error") + + async def auto_completion( + self, + messages: Optional[List[Dict[str, Any]]] = None, + text_prompt: Optional[str] = None, + base64_image: Optional[str] = None, + max_tokens: int = 1000, + temperature: float = 0.1 + ) -> str: + """ + Automatically route to text or vision completion based on provided parameters. 
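+
+        Vision completion is selected when both ``text_prompt`` and
+        ``base64_image`` are provided; otherwise ``messages`` selects text
+        completion.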
+ + Args: + messages: List of message dictionaries for text completion + text_prompt: Text prompt for vision completion + base64_image: Base64 encoded image data for vision completion + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + if base64_image and text_prompt: + # Vision completion + logger.info("Auto-routing: Vision completion detected") + return await self.vision_completion( + text_prompt=text_prompt, + base64_image=base64_image, + max_tokens=max_tokens, + temperature=temperature + ) + elif messages: + # Text completion + logger.info("Auto-routing: Text completion detected") + return await self.text_completion( + messages=messages, + max_tokens=max_tokens, + temperature=temperature + ) + else: + raise ValueError("Must provide either (text_prompt + base64_image) for vision or (messages) for text") + + +# Global client instance +_unified_client = None + +async def get_unified_client() -> UnifiedAIClient: + """Get the global unified AI client instance.""" + global _unified_client + if _unified_client is None: + _unified_client = UnifiedAIClient() + return _unified_client + + +# Convenience functions +async def ai_text_completion( + messages: List[Dict[str, Any]], + max_tokens: int = 1000, + temperature: float = 0.1 +) -> str: + """ + Convenience function for text completion. + + Args: + messages: List of message dictionaries + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + client = await get_unified_client() + return await client.text_completion(messages, max_tokens, temperature) + + +async def ai_vision_completion( + text_prompt: str, + base64_image: str, + max_tokens: int = 1000, + temperature: float = 0.1 +) -> str: + """ + Convenience function for vision completion. + + Args: + text_prompt: Text prompt for the image analysis + base64_image: Base64 encoded image data + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + client = await get_unified_client() + return await client.vision_completion(text_prompt, base64_image, max_tokens, temperature) + + +async def ai_auto_completion( + messages: Optional[List[Dict[str, Any]]] = None, + text_prompt: Optional[str] = None, + base64_image: Optional[str] = None, + max_tokens: int = 1000, + temperature: float = 0.1 +) -> str: + """ + Convenience function for auto-routed completion. 
+ + Args: + messages: List of message dictionaries for text completion + text_prompt: Text prompt for vision completion + base64_image: Base64 encoded image data for vision completion + max_tokens: Maximum tokens to generate + temperature: Temperature for generation + + Returns: + The AI response content as a string + """ + client = await get_unified_client() + return await client.auto_completion(messages, text_prompt, base64_image, max_tokens, temperature) + + +async def test_unified_client(): + """Test the unified AI client with both text and vision.""" + + print("Testing Unified AI Client...") + + # Show current configuration + text_provider = os.getenv("TEXT_PROVIDER", "azure").lower() + vision_provider = os.getenv("VISION_PROVIDER", "openrouter").lower() + print(f"Current text provider: {text_provider}") + print(f"Current vision provider: {vision_provider}") + + # Test text completion + try: + print(f"\nTesting text completion with {text_provider}...") + response = await ai_text_completion( + messages=[{"role": "user", "content": "Hello! Please respond with 'Text completion working'."}], + max_tokens=20, + temperature=0.0 + ) + print(f"Text Success: {response}") + except Exception as e: + print(f"Text Failed: {e}") + + # Test vision completion + try: + print(f"\nTesting vision completion with {vision_provider}...") + + # Create a simple test image + import base64 + from io import BytesIO + from PIL import Image + + img = Image.new('RGB', (50, 50), color='blue') + buffer = BytesIO() + img.save(buffer, format='JPEG', quality=85) + test_base64_image = base64.b64encode(buffer.getvalue()).decode('utf-8') + + response = await ai_vision_completion( + text_prompt="What color is this image? Just say the color.", + base64_image=test_base64_image, + max_tokens=10, + temperature=0.0 + ) + print(f"Vision Success: {response}") + except Exception as e: + print(f"Vision Failed: {e}") + test_base64_image = None # Set to None if image creation failed + + # Test auto-routing + try: + print("\nTesting auto-routing (text)...") + response = await ai_auto_completion( + messages=[{"role": "user", "content": "Say 'Auto-routing text works'."}], + max_tokens=10, + temperature=0.0 + ) + print(f"Auto-routing Text Success: {response}") + except Exception as e: + print(f"Auto-routing Text Failed: {e}") + + # Only test vision auto-routing if we have a test image + if test_base64_image: + try: + print(f"\nTesting auto-routing (vision with {vision_provider})...") + response = await ai_auto_completion( + text_prompt="What color? 
Just the color name.", + base64_image=test_base64_image, + max_tokens=5, + temperature=0.0 + ) + print(f"Auto-routing Vision Success: {response}") + except Exception as e: + print(f"Auto-routing Vision Failed: {e}") + else: + print("\nSkipping auto-routing vision test (no test image)") + + +if __name__ == "__main__": + # Set up logging for testing + logging.basicConfig(level=logging.INFO) + + asyncio.run(test_unified_client()) + print("\nUnified AI Client testing completed!") \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 32de5e1..cb1e8a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "mss", "pynput", "shapely", - "pyobjc-framework-Quartz", + "pyobjc-framework-Quartz; sys_platform == 'darwin'", "openai>=1.0.0", "SQLAlchemy>=2.0.0", "pydantic>=2.0.0", @@ -22,6 +22,12 @@ dependencies = [ "scikit-learn", "aiosqlite", "greenlet", + "python-dateutil", + "numpy", + "setuptools>=42", + "wheel", + "build", + "twine", "persist-queue", "mkdocs>=1.5.0", "mkdocs-material>=9.0.0", diff --git a/setup.py b/setup.py index d9dfb94..2da2e0e 100644 --- a/setup.py +++ b/setup.py @@ -10,15 +10,24 @@ "mss", # For screen capture "pynput", # For mouse/keyboard monitoring "shapely", # For geometry operations - "pyobjc-framework-Quartz", # For macOS window management + "pyobjc-framework-Quartz; sys_platform == 'darwin'", # For macOS window management "openai>=1.0.0", "SQLAlchemy>=2.0.0", "pydantic>=2.0.0", - "sqlalchemy-utils>=0.41.0", "python-dotenv>=1.0.0", "scikit-learn", "aiosqlite", - "greenlet" + "greenlet", + + # Web framework and API dependencies + "fastapi", + "uvicorn", + "python-multipart", + "aiohttp", + "python-dateutil", + + # Additional dependencies for data processing + "numpy", ], entry_points={ 'console_scripts': [