From ba400812847166c9d40768db6c30255a2861cc2d Mon Sep 17 00:00:00 2001 From: MarBeanAI Date: Sun, 27 Oct 2024 21:46:21 -0500 Subject: [PATCH] update dependencies, add activation script, and enhance system prompt --- .vscode/settings.json | 10 ++ activate.sh | 14 ++ loop.py | 262 +++++++++++++++++++++++++++------- requirements.txt | 13 +- streamlit.py | 323 +++++++++++++++++++++++++++++++----------- tools/computer.py | 158 +++++++++++++-------- 6 files changed, 583 insertions(+), 197 deletions(-) create mode 100644 .vscode/settings.json create mode 100755 activate.sh diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..e085682f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "inlineChat.finishOnType": true, + "inlineChat.experimental.enableZoneToolbar": true, + "accessibility.voice.keywordActivation": "chatInContext", + "github.copilot.chat.experimental.inlineChatCompletionTrigger.enabled": true, + "github.copilot.chat.experimental.inlineChatHint.enabled": true, + "gitlens.ai.experimental.model": "anthropic:claude-3-5-sonnet-20240620", + "gitlens.ai.experimental.openai.url": "", + "diffEditor.codeLens": true +} diff --git a/activate.sh b/activate.sh new file mode 100755 index 00000000..377e75aa --- /dev/null +++ b/activate.sh @@ -0,0 +1,14 @@ +#!/bin/bash +source venv/bin/activate +export PYTHONPATH=$PYTHONPATH:$(pwd) + +echo "Virtual environment activated!" +echo "" +echo "To start the application:" +echo "1. Set your API key:" +echo " export ANTHROPIC_API_KEY=your_api_key_here" +echo "2. Set display dimensions (recommended):" +echo " export WIDTH=1280" +echo " export HEIGHT=800" +echo "3. Run the Streamlit app:" +echo " streamlit run streamlit.py" diff --git a/loop.py b/loop.py index 887eeb6c..85768020 100644 --- a/loop.py +++ b/loop.py @@ -31,69 +31,222 @@ class APIProvider(StrEnum): ANTHROPIC = "anthropic" BEDROCK = "bedrock" VERTEX = "vertex" + BRICKS = "bricks" PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022", APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", + APIProvider.BRICKS: "claude-3-5-sonnet-20241022", } - # This system prompt is optimized for the Docker environment in this repository and # specific tool combinations enabled. # We encourage modifying this system prompt to ensure the model has context for the # environment it is running in, and to provide any additional information that may be # helpful for the task at hand. -SYSTEM_PROMPT = f""" -* You are utilizing a macOS Sonoma 15.7 environment using {platform.machine()} architecture with internet access. -* You can install applications using homebrew with your bash tool. Use curl instead of wget. -* To open Chrome, please just click on the Chrome icon in the Dock or use Spotlight. -* Using bash tool you can start GUI applications. GUI apps can be launched directly or with `open -a "Application Name"`. GUI apps will appear natively within macOS, but they may take some time to appear. Take a screenshot to confirm it did. -* When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B -A ` to confirm output. -* When viewing a page it can be helpful to zoom out so that you can see everything on the page. In Chrome, use Command + "-" to zoom out or Command + "+" to zoom in. -* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. -* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}. - - -* When using Chrome, if any first-time setup dialogs appear, IGNORE THEM. Instead, click directly in the address bar and enter the appropriate search term or URL there. -* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext (available via homebrew) to convert it to a text file, and then read that text file directly with your StrReplaceEditTool. -""" -# SYSTEM_PROMPT = f""" -# * You are utilizing a macOS Sonoma 15.7 environment using {platform.machine()} architecture with command line internet access. -# * Package management: -# - Use homebrew for package installation -# - Use curl for HTTP requests -# - Use npm/yarn for Node.js packages -# - Use pip for Python packages - -# * Browser automation available via Playwright: -# - Supports Chrome, Firefox, and WebKit -# - Can handle JavaScript-heavy applications -# - Capable of screenshots, navigation, and interaction -# - Handles dynamic content loading - -# * System automation: -# - cliclick for simulating mouse/keyboard input -# - osascript for AppleScript commands -# - launchctl for managing services -# - defaults for reading/writing system preferences - -# * Development tools: -# - Standard Unix/Linux command line utilities -# - Git for version control -# - Docker for containerization -# - Common build tools (make, cmake, etc.) - -# * Output handling: -# - For large output, redirect to tmp files: command > /tmp/output.txt -# - Use grep with context: grep -n -B -A -# - Stream processing with awk, sed, and other text utilities - -# * Note: Command line function calls may have latency. Chain multiple operations into single requests where feasible. - -# * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}. -# """ +SYSTEM_PROMPT = f""" +You are an advanced AI assistant operating within a macOS Sequoia Version 15.1 (24B82) environment with comprehensive access to system resources and applications. Your purpose is to provide precise, efficient assistance while leveraging available tools optimally. + + +1. Hardware Configuration: + - Model: MacBook Pro (15-inch, 2018) + - Processor: 2.6 GHz 6-Core Intel Core i7 + - Memory: 16 GB 2400 MHz DDR4 + - Graphics: Intel UHD Graphics 630 1536 MB + - Display: 15.4-inch Retina (2880 × 1800) + - Architecture: {platform.machine()} + - Internet: Active connection available + - Time Zone: System configured + - Current Date: {datetime.today().strftime('%A, %B %-d, %Y')} + + +1. Development Environment: + A. Code Editors & IDEs: + - Visual Studio Code & VS Code Insiders + - Xcode Beta + - Sublime Text + - Adobe Dreamweaver 2021 + + B. Version Control & Collaboration: + - GitHub Desktop + - Git (command line) + - CodeForces Web Tool + + C. Container & Virtual Environments: + - Docker.app + - Docker CLI tools + + D. Development Tools: + - Terminal + - Command Line Tools + - Developer.app + +2. Professional Suites: + A. Microsoft Office: + - Word + - Excel + - PowerPoint + - OneNote + - Outlook + + B. Adobe Creative Cloud: + - Creative Cloud Manager + - Dreamweaver 2021 + - Premiere Pro (Beta) + - Adobe UXP Developer Tools + +3. Web Browsers & Tools: + A. Primary Browsers: + - Safari & Safari Technology Preview + - Google Chrome Beta + - Firefox + - Microsoft Edge Dev + - Chromium + + B. Specialized Browsers: + - Tor Browser (Standard & Alpha) + + C. Browser Extensions: + - Grammarly for Safari + - Microsoft Bi for Safari + +4. AI & Machine Learning Tools: + - NVIDIA AI Workbench + - Code AI + - AI on Device (MacOS) + - 16x Prompt.app + +5. System Utilities: + A. File Management: + - Finder + - Preview + - The Unarchiver + - Unzip - RAR + + B. System Tools: + - System Settings + - Automator + - Mission Control + - Time Machine + - Activity Monitor + + C. Text Processing: + - TextEdit + - Notes + + D. Security: + - Passwords.app + - G Authenticator + - BitPay + - Wasabi Wallet + +6. Communication & Collaboration: + - Messages + - Mail + - FaceTime + - Discord + - Zoom + - Messenger + - TextNow + +7. Media & Entertainment: + - QuickTime Player + - Photos + - Music + - TV + - Podcasts + - Photo Booth + +8. Productivity & Organization: + - Calendar + - Reminders + - Stickies + - Clock + - Calculator + - Weather + - Maps + + +1. File System Access: + - Read/Write operations in user directories + - Application data access + - Temporary file creation + - Archive handling + +2. Network Operations: + - HTTP/HTTPS requests + - API interactions + - Download capabilities + - Network diagnostics + +3. Automation Framework: + A. System Automation: + - Shortcuts.app + - Automator workflows + - AppleScript execution + - Shell scripting + + B. Development Automation: + - Build tools + - Package managers + - Deployment scripts + +4. Security Protocols: + - Secure file operations + - Credential management + - Encryption capabilities + - Privacy controls + + +1. Resource Management: + - Monitor system resources + - Optimize heavy operations + - Cache management + - Background process awareness + +2. Error Handling: + - Graceful failure recovery + - User feedback + - Logging capabilities + - Debug information + +3. Operation Chaining: + - Minimize command calls + - Batch operations + - Efficient workflows + - Resource pooling + + +For each user interaction, I will: +1. Analyze request requirements +2. Identify optimal tools/applications +3. Validate resource availability +4. Plan execution strategy +5. Provide clear documentation +6. Monitor execution +7. Handle errors gracefully +8. Confirm successful completion + + +Each response will include: +1. tags for analysis +2. Task acknowledgment +3. Resource identification +4. Step-by-step execution plan +5. Clear documentation +6. Error handling procedures +7. Success confirmation + + +- Respect system permissions +- Handle resource constraints +- Consider operation timing +- Maintain security protocols +- Preserve user privacy +- Account for network latency""" + async def sampling_loop( *, @@ -106,7 +259,7 @@ async def sampling_loop( api_response_callback: Callable[[APIResponse[BetaMessage]], None], api_key: str, only_n_most_recent_images: int | None = None, - max_tokens: int = 4096, + max_tokens: int = 8192, ): """ Agentic sampling loop for the assistant/tool interaction of computer use. @@ -130,6 +283,13 @@ async def sampling_loop( client = AnthropicVertex() elif provider == APIProvider.BEDROCK: client = AnthropicBedrock() + elif provider == APIProvider.BRICKS: + client = Anthropic( + api_key=api_key, + base_url="https://api.trybricks.ai/api/providers/anthropic", + ) + else: + raise ValueError(f"Unsupported provider: {provider}") # Call the API # we use raw_response to provide debug information to streamlit. Your diff --git a/requirements.txt b/requirements.txt index 7457160b..f01e98fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ -streamlit>=1.38.0 -anthropic[bedrock,vertex]>=0.37.1 -jsonschema==4.22.0 -boto3>=1.28.57 -google-auth<3,>=2 -python-dotenv>=1.0.1 +anthropic==0.37.1 +python-dotenv>=1.0.0 +streamlit>=1.28.0 pyautogui>=0.9.54 +keyboard>=0.13.5 +boto3>=1.29.0 +google-auth>=2.23.4 +Pillow>=10.0.0 diff --git a/streamlit.py b/streamlit.py index a57a5607..9a095b92 100644 --- a/streamlit.py +++ b/streamlit.py @@ -10,30 +10,33 @@ from enum import StrEnum from functools import partial from pathlib import PosixPath -from typing import cast +from typing import cast, Any +import json -import streamlit as st from anthropic import APIResponse -from anthropic.types import ( - TextBlock, -) -from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock +from anthropic.types import Message +from anthropic.types.beta import BetaMessage, BetaToolUseBlock from anthropic.types.tool_use_block import ToolUseBlock -from streamlit.delta_generator import DeltaGenerator +from dotenv import load_dotenv +import streamlit as st +from streamlit.components.v1 import html from loop import ( PROVIDER_TO_DEFAULT_MODEL_NAME, APIProvider, sampling_loop, ) +from streamlit.delta_generator import DeltaGenerator from tools import ToolResult -from dotenv import load_dotenv load_dotenv() +# Rest of the file remains unchanged... CONFIG_DIR = PosixPath("~/.anthropic").expanduser() API_KEY_FILE = CONFIG_DIR / "api_key" + +# Custom CSS for styling and animations STREAMLIT_STYLE = """ + +
+ + """ WARNING_TEXT = "" @@ -58,16 +155,16 @@ class Sender(StrEnum): def setup_state(): + """Initialize session state variables""" if "messages" not in st.session_state: st.session_state.messages = [] if "api_key" not in st.session_state: - # Try to load API key from file first, then environment st.session_state.api_key = load_from_storage("api_key") or os.getenv( "ANTHROPIC_API_KEY", "" ) if "provider" not in st.session_state: st.session_state.provider = ( - os.getenv("API_PROVIDER", "anthropic") or APIProvider.ANTHROPIC + os.getenv("API_PROVIDER", "bricks") or APIProvider.BRICKS ) if "provider_radio" not in st.session_state: st.session_state.provider_radio = st.session_state.provider @@ -85,6 +182,8 @@ def setup_state(): st.session_state.custom_system_prompt = load_from_storage("system_prompt") or "" if "hide_images" not in st.session_state: st.session_state.hide_images = False + if "controls_enabled" not in st.session_state: + st.session_state.controls_enabled = True def _reset_model(): @@ -93,6 +192,10 @@ def _reset_model(): ] +def toggle_controls(): + st.session_state.controls_enabled = not st.session_state.controls_enabled + + async def main(): """Render loop for streamlit""" setup_state() @@ -101,7 +204,18 @@ async def main(): st.title("Claude Computer Use for Mac") - st.markdown("""This is from [Mac Computer Use](https://github.com/deedy/mac_computer_use), a fork of [Anthropic Computer Use](https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/README.md) to work natively on Mac.""") + # User controls toggle button (only visible to users) + col1, col2 = st.columns([3, 1]) + with col2: + st.button("Toggle Controls (⌘ + Space)", on_click=toggle_controls) + if st.session_state.controls_enabled: + st.success("Controls Enabled") + else: + st.error("Controls Disabled") + + st.markdown( + """This is from [Mac Computer Use](https://github.com/deedy/mac_computer_use), a fork of [Anthropic Computer Use](https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/README.md) to work natively on Mac.""" + ) with st.sidebar: @@ -122,9 +236,14 @@ def _reset_api_provider(): st.text_input("Model", key="model") - if st.session_state.provider == APIProvider.ANTHROPIC: + if st.session_state.provider in [APIProvider.ANTHROPIC, APIProvider.BRICKS]: + api_key_label = ( + "BricksAI Secret Key" + if st.session_state.provider == APIProvider.BRICKS + else "Anthropic API Key" + ) st.text_input( - "Anthropic API Key", + api_key_label, type="password", key="api_key", on_change=lambda: save_to_storage("api_key", st.session_state.api_key), @@ -151,7 +270,7 @@ def _reset_api_provider(): st.session_state.clear() setup_state() - subprocess.run("pkill Xvfb; pkill tint2", shell=True) # noqa: ASYNC221 + subprocess.run("pkill Xvfb; pkill tint2", shell=True, check=True) # noqa: ASYNC221 await asyncio.sleep(1) subprocess.run("./start_all.sh", shell=True) # noqa: ASYNC221 @@ -170,72 +289,84 @@ def _reset_api_provider(): ) with chat: - # render past chats - for message in st.session_state.messages: - if isinstance(message["content"], str): - _render_message(message["role"], message["content"]) - elif isinstance(message["content"], list): - for block in message["content"]: - # the tool result we send back to the Anthropic API isn't sufficient to render all details, - # so we store the tool use responses - if isinstance(block, dict) and block["type"] == "tool_result": - _render_message( - Sender.TOOL, st.session_state.tools[block["tool_use_id"]] - ) - else: - _render_message( - message["role"], - cast(BetaTextBlock | BetaToolUseBlock, block), - ) - - # render past http exchanges - for identity, response in st.session_state.responses.items(): - _render_api_response(response, identity, http_logs) - - # render past chats - if new_message: - st.session_state.messages.append( - { - "role": Sender.USER, - "content": [TextBlock(type="text", text=new_message)], - } - ) - _render_message(Sender.USER, new_message) - - try: - most_recent_message = st.session_state["messages"][-1] - except IndexError: - return - - if most_recent_message["role"] is not Sender.USER: - # we don't have a user message to respond to, exit early - return - - with st.spinner("Running Agent..."): - # run the agent sampling loop with the newest message - st.session_state.messages = await sampling_loop( - system_prompt_suffix=st.session_state.custom_system_prompt, - model=st.session_state.model, - provider=st.session_state.provider, - messages=st.session_state.messages, - output_callback=partial(_render_message, Sender.BOT), - tool_output_callback=partial( - _tool_output_callback, tool_state=st.session_state.tools - ), - api_response_callback=partial( - _api_response_callback, - tab=http_logs, - response_state=st.session_state.responses, - ), - api_key=st.session_state.api_key, - only_n_most_recent_images=st.session_state.only_n_most_recent_images, - ) + # Create a container for auto-scrolling + chat_container = st.container() + with chat_container: + # render past chats + for message in st.session_state.messages: + if isinstance(message["content"], str): + _render_message(message["role"], message["content"]) + elif isinstance(message["content"], list): + for block in message["content"]: + if isinstance(block, dict) and block["type"] == "tool_result": + _render_message( + Sender.TOOL, + st.session_state.tools[block["tool_use_id"]], + ) + else: + _render_message( + message["role"], + cast(Any, block), + ) + + # render past http exchanges + for identity, response in st.session_state.responses.items(): + _render_api_response(response, identity, http_logs) + + # render new message + if new_message: + st.session_state.messages.append( + { + "role": Sender.USER, + "content": [{"type": "text", "text": new_message}], + } + ) + _render_message(Sender.USER, new_message) + + try: + most_recent_message = st.session_state["messages"][-1] + except IndexError: + return + + if most_recent_message["role"] is not Sender.USER: + return + + with st.spinner("Running Agent..."): + st.session_state.messages = await sampling_loop( + system_prompt_suffix=st.session_state.custom_system_prompt, + model=st.session_state.model, + provider=st.session_state.provider, + messages=st.session_state.messages, + output_callback=partial(_render_message, Sender.BOT), + tool_output_callback=partial( + _tool_output_callback, tool_state=st.session_state.tools + ), + api_response_callback=partial( + _api_response_callback, + tab=http_logs, + response_state=st.session_state.responses, + ), + api_key=st.session_state.api_key, + only_n_most_recent_images=st.session_state.only_n_most_recent_images, + ) + + # Auto scroll after rendering + html(""" + + """) def validate_auth(provider: APIProvider, api_key: str | None): - if provider == APIProvider.ANTHROPIC: + if provider in [APIProvider.ANTHROPIC, APIProvider.BRICKS]: if not api_key: - return "Enter your Anthropic API key in the sidebar to continue." + key_type = ( + "BricksAI Secret Key" + if provider == APIProvider.BRICKS + else "Anthropic API Key" + ) + return f"Enter your {key_type} in the sidebar to continue." if provider == APIProvider.BEDROCK: import boto3 @@ -285,9 +416,7 @@ def _api_response_callback( tab: DeltaGenerator, response_state: dict[str, APIResponse[BetaMessage]], ): - """ - Handle an API response by storing it to state and rendering it. - """ + """Handle an API response by storing it to state and rendering it.""" response_id = datetime.now().isoformat() response_state[response_id] = response _render_api_response(response, response_id, tab) @@ -300,6 +429,33 @@ def _tool_output_callback( tool_state[tool_id] = tool_output _render_message(Sender.TOOL, tool_output) + # Update mouse tracker for mouse movements + if hasattr(tool_output, "output") and "cliclick m:" in str(tool_output.output): + coords = str(tool_output.output).split("m:")[1].strip().split(",") + if len(coords) == 2: + html(f""" + + """) + + # Show click animation for clicks + if hasattr(tool_output, "output") and any( + cmd in str(tool_output.output) for cmd in ["c:", "rc:", "dc:", "mc:"] + ): + # Get current mouse position from tracker + html(""" + + """) + def _render_api_response( response: APIResponse[BetaMessage], response_id: str, tab: DeltaGenerator @@ -320,10 +476,9 @@ def _render_api_response( def _render_message( sender: Sender, - message: str | BetaTextBlock | BetaToolUseBlock | ToolResult, + message: str | dict | BetaToolUseBlock | ToolResult, ): """Convert input from the user or output from the agent to a streamlit message.""" - # streamlit's hotreloading breaks isinstance checks, so we need to check for class names is_tool_result = not isinstance(message, str) and ( isinstance(message, ToolResult) or message.__class__.__name__ == "ToolResult" @@ -348,8 +503,8 @@ def _render_message( st.error(message.error) if message.base64_image and not st.session_state.hide_images: st.image(base64.b64decode(message.base64_image)) - elif isinstance(message, BetaTextBlock) or isinstance(message, TextBlock): - st.write(message.text) + elif isinstance(message, dict) and message.get("type") == "text": + st.write(message.get("text", "")) elif isinstance(message, BetaToolUseBlock) or isinstance(message, ToolUseBlock): st.code(f"Tool Use: {message.name}\nInput: {message.input}") else: diff --git a/tools/computer.py b/tools/computer.py index 0e7646fb..35655496 100644 --- a/tools/computer.py +++ b/tools/computer.py @@ -2,22 +2,26 @@ import base64 import os import shlex -import pyautogui -import keyboard from enum import StrEnum from pathlib import Path from typing import Literal, TypedDict from uuid import uuid4 +from io import BytesIO +from PIL import Image from anthropic.types.beta import BetaToolComputerUse20241022Param from .base import BaseAnthropicTool, ToolError, ToolResult from .run import run +# Constants OUTPUT_DIR = "/tmp/outputs" - TYPING_DELAY_MS = 12 TYPING_GROUP_SIZE = 50 +MAX_IMAGE_SIZE = 5 * 1024 * 1024 # 5MB in bytes + +# Check if we're running in a codespace environment +IS_CODESPACE = os.environ.get("CODESPACES") == "true" Action = Literal[ "key", @@ -63,11 +67,30 @@ def chunks(s: str, chunk_size: int) -> list[str]: return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)] +def compress_image(image_data: bytes, max_size: int = MAX_IMAGE_SIZE) -> bytes: + """Compress image data until it's under the specified max size.""" + img = Image.open(BytesIO(image_data)) + quality = 95 + output = BytesIO() + + while True: + output.seek(0) + output.truncate() + img.save(output, format="PNG", optimize=True, quality=quality) + size = output.tell() + + if size <= max_size or quality <= 5: + break + + quality -= 5 + + return output.getvalue() + + class ComputerTool(BaseAnthropicTool): """ - A tool that allows the agent to interact with the screen, keyboard, and mouse of the current macOS computer. + A tool that allows the agent to interact with the screen, keyboard, and mouse. The tool parameters are defined by Anthropic and are not editable. - Requires cliclick to be installed: brew install cliclick """ name: Literal["computer"] = "computer" @@ -76,7 +99,7 @@ class ComputerTool(BaseAnthropicTool): height: int display_num: int | None - _screenshot_delay = 1.0 # macOS is generally faster than X11 + _screenshot_delay = 1.0 _scaling_enabled = True @property @@ -93,9 +116,13 @@ def to_params(self) -> BetaToolComputerUse20241022Param: def __init__(self): super().__init__() - self.width, self.height = pyautogui.size() - assert self.width and self.height, "WIDTH, HEIGHT must be set" - self.display_num = None # macOS doesn't use X11 display numbers + # Set default dimensions + self.width = int(os.environ.get("WIDTH", 1366)) + self.height = int(os.environ.get("HEIGHT", 768)) + self.display_num = None + + if IS_CODESPACE: + print("Running in codespace environment - some features may be limited") async def __call__( self, @@ -106,6 +133,12 @@ async def __call__( **kwargs, ): print("Action: ", action, text, coordinate) + + if IS_CODESPACE: + return ToolResult( + error="This action is not supported in codespace environment. This tool is designed for macOS systems." + ) + if action in ("mouse_move", "left_click_drag"): if coordinate is None: raise ToolError(f"coordinate is required for {action}") @@ -116,7 +149,9 @@ async def __call__( if not all(isinstance(i, int) and i >= 0 for i in coordinate): raise ToolError(f"{coordinate} must be a tuple of non-negative ints") - x, y = self.scale_coordinates(ScalingSource.API, coordinate[0], coordinate[1]) + x, y = self.scale_coordinates( + ScalingSource.API, coordinate[0], coordinate[1] + ) if action == "mouse_move": return await self.shell(f"cliclick m:{x},{y}") @@ -129,45 +164,44 @@ async def __call__( if coordinate is not None: raise ToolError(f"coordinate is not accepted for {action}") if not isinstance(text, str): - raise ToolError(output=f"{text} must be a string") + raise ToolError("Text input must be a string") if action == "key": - # Convert common key names to pyautogui format + # Use cliclick for key presses key_map = { - "Return": "enter", - "space": "space", - "Tab": "tab", - "Left": "left", - "Right": "right", - "Up": "up", - "Down": "down", - "Escape": "esc", - "command": "command", - "cmd": "command", - "alt": "alt", - "shift": "shift", - "ctrl": "ctrl" + "Return": "kp:return", + "space": "kp:space", + "Tab": "kp:tab", + "Left": "kp:arrow-left", + "Right": "kp:arrow-right", + "Up": "kp:arrow-up", + "Down": "kp:arrow-down", + "Escape": "kp:esc", + "command": "kp:cmd", + "cmd": "kp:cmd", + "alt": "kp:alt", + "shift": "kp:shift", + "ctrl": "kp:ctrl", } try: if "+" in text: # Handle combinations like "ctrl+c" keys = text.split("+") - mapped_keys = [key_map.get(k.strip(), k.strip()) for k in keys] - await asyncio.get_event_loop().run_in_executor( - None, keyboard.press_and_release, '+'.join(mapped_keys) - ) + mapped_keys = [ + key_map.get(k.strip(), f"kp:{k.strip()}") for k in keys + ] + cmd = "cliclick " + " ".join(mapped_keys) else: # Handle single keys - mapped_key = key_map.get(text, text) - await asyncio.get_event_loop().run_in_executor( - None, keyboard.press_and_release, mapped_key - ) + mapped_key = key_map.get(text, f"kp:{text}") + cmd = f"cliclick {mapped_key}" - return ToolResult(output=f"Pressed key: {text}", error=None, base64_image=None) + return await self.shell(cmd) except Exception as e: - return ToolResult(output=None, error=str(e), base64_image=None) + return ToolResult(error=str(e)) + elif action == "type": results: list[ToolResult] = [] for chunk in chunks(text, TYPING_GROUP_SIZE): @@ -200,7 +234,6 @@ async def __call__( "cliclick p", take_screenshot=False, ) - import pdb; pdb.set_trace() if result.output: x, y = map(int, result.output.strip().split(",")) x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y) @@ -219,26 +252,35 @@ async def __call__( async def screenshot(self): """Take a screenshot of the current screen and return the base64 encoded image.""" + if IS_CODESPACE: + return ToolResult( + error="Screenshot functionality is not available in codespace environment" + ) + output_dir = Path(OUTPUT_DIR) output_dir.mkdir(parents=True, exist_ok=True) path = output_dir / f"screenshot_{uuid4().hex}.png" - # Use macOS native screencapture - screenshot_cmd = f"screencapture -x {path}" - result = await self.shell(screenshot_cmd, take_screenshot=False) + try: + # Use screencapture on macOS + result = await self.shell(f"screencapture -x {path}") + if result.error: + return result - if self._scaling_enabled: - x, y = SCALE_DESTINATION['width'], SCALE_DESTINATION['height'] - await self.shell( - f"sips -z {y} {x} {path}", # sips is macOS native image processor - take_screenshot=False - ) - - if path.exists(): - return result.replace( - base64_image=base64.b64encode(path.read_bytes()).decode() - ) - raise ToolError(f"Failed to take screenshot: {result.error}") + if path.exists(): + # Read the image and compress if necessary + image_data = path.read_bytes() + if len(image_data) > MAX_IMAGE_SIZE: + image_data = compress_image(image_data) + + return ToolResult(base64_image=base64.b64encode(image_data).decode()) + return ToolResult(error="Screenshot file was not created") + except Exception as e: + return ToolResult(error=f"Failed to take screenshot: {str(e)}") + finally: + # Clean up the temporary file + if path.exists(): + path.unlink() async def shell(self, command: str, take_screenshot=False) -> ToolResult: """Run a shell command and return the output, error, and optionally a screenshot.""" @@ -252,7 +294,9 @@ async def shell(self, command: str, take_screenshot=False) -> ToolResult: return ToolResult(output=stdout, error=stderr, base64_image=base64_image) - def scale_coordinates(self, source: ScalingSource, x: int, y: int) -> tuple[int, int]: + def scale_coordinates( + self, source: ScalingSource, x: int, y: int + ) -> tuple[int, int]: """ Scale coordinates between original resolution and target resolution (SCALE_DESTINATION). @@ -268,13 +312,15 @@ def scale_coordinates(self, source: ScalingSource, x: int, y: int) -> tuple[int, return x, y # Calculate scaling factors - x_scaling_factor = SCALE_DESTINATION['width'] / self.width - y_scaling_factor = SCALE_DESTINATION['height'] / self.height + x_scaling_factor = SCALE_DESTINATION["width"] / self.width + y_scaling_factor = SCALE_DESTINATION["height"] / self.height if source == ScalingSource.API: # Scale up from SCALE_DESTINATION to original resolution - if x > SCALE_DESTINATION['width'] or y > SCALE_DESTINATION['height']: - raise ToolError(f"Coordinates {x}, {y} are out of bounds for {SCALE_DESTINATION['width']}x{SCALE_DESTINATION['height']}") + if x > SCALE_DESTINATION["width"] or y > SCALE_DESTINATION["height"]: + raise ToolError( + f"Coordinates {x}, {y} are out of bounds for {SCALE_DESTINATION['width']}x{SCALE_DESTINATION['height']}" + ) return round(x / x_scaling_factor), round(y / y_scaling_factor) else: # Scale down from original resolution to SCALE_DESTINATION