diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..e085682f
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,10 @@
+{
+ "inlineChat.finishOnType": true,
+ "inlineChat.experimental.enableZoneToolbar": true,
+ "accessibility.voice.keywordActivation": "chatInContext",
+ "github.copilot.chat.experimental.inlineChatCompletionTrigger.enabled": true,
+ "github.copilot.chat.experimental.inlineChatHint.enabled": true,
+ "gitlens.ai.experimental.model": "anthropic:claude-3-5-sonnet-20240620",
+ "gitlens.ai.experimental.openai.url": "",
+ "diffEditor.codeLens": true
+}
diff --git a/activate.sh b/activate.sh
new file mode 100755
index 00000000..377e75aa
--- /dev/null
+++ b/activate.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+source venv/bin/activate
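+# Make the repository root importable so loop.py and the tools/ package resolve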
+export PYTHONPATH="$PYTHONPATH:$(pwd)"
+
+echo "Virtual environment activated!"
+echo ""
+echo "To start the application:"
+echo "1. Set your API key:"
+echo " export ANTHROPIC_API_KEY=your_api_key_here"
+echo "2. Set display dimensions (recommended):"
+echo " export WIDTH=1280"
+echo " export HEIGHT=800"
+echo "3. Run the Streamlit app:"
+echo " streamlit run streamlit.py"
diff --git a/loop.py b/loop.py
index 263328c6..89c3aa9f 100644
--- a/loop.py
+++ b/loop.py
@@ -31,69 +31,222 @@ class APIProvider(StrEnum):
ANTHROPIC = "anthropic"
BEDROCK = "bedrock"
VERTEX = "vertex"
+ BRICKS = "bricks"
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
+ APIProvider.BRICKS: "claude-3-5-sonnet-20241022",
}
-
# This system prompt is optimized for the Docker environment in this repository and
# specific tool combinations enabled.
# We encourage modifying this system prompt to ensure the model has context for the
# environment it is running in, and to provide any additional information that may be
# helpful for the task at hand.
-# SYSTEM_PROMPT = f"""
-# * You are utilizing a macOS Sonoma 15.7 environment using {platform.machine()} architecture with internet access.
-# * You can install applications using homebrew with your bash tool. Use curl instead of wget.
-# * To open Chrome, please just click on the Chrome icon in the Dock or use Spotlight.
-# * Using bash tool you can start GUI applications. GUI apps can be launched directly or with `open -a "Application Name"`. GUI apps will appear natively within macOS, but they may take some time to appear. Take a screenshot to confirm it did.
-# * When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B -A ` to confirm output.
-# * When viewing a page it can be helpful to zoom out so that you can see everything on the page. In Chrome, use Command + "-" to zoom out or Command + "+" to zoom in.
-# * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
-# * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
-#
-#
-# * When using Chrome, if any first-time setup dialogs appear, IGNORE THEM. Instead, click directly in the address bar and enter the appropriate search term or URL there.
-# * If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext (available via homebrew) to convert it to a text file, and then read that text file directly with your StrReplaceEditTool.
-# """
-SYSTEM_PROMPT = f"""
-* You are utilizing a macOS Sonoma 15.7 environment using {platform.machine()} architecture with command line internet access.
-* Package management:
- - Use homebrew for package installation
- - Use curl for HTTP requests
- - Use npm/yarn for Node.js packages
- - Use pip for Python packages
-
-* Browser automation available via Playwright:
- - Supports Chrome, Firefox, and WebKit
- - Can handle JavaScript-heavy applications
- - Capable of screenshots, navigation, and interaction
- - Handles dynamic content loading
-
-* System automation:
- - cliclick for simulating mouse/keyboard input
- - osascript for AppleScript commands
- - launchctl for managing services
- - defaults for reading/writing system preferences
-
-* Development tools:
- - Standard Unix/Linux command line utilities
- - Git for version control
- - Docker for containerization
- - Common build tools (make, cmake, etc.)
-
-* Output handling:
- - For large output, redirect to tmp files: command > /tmp/output.txt
- - Use grep with context: grep -n -B -A
- - Stream processing with awk, sed, and other text utilities
-
-* Note: Command line function calls may have latency. Chain multiple operations into single requests where feasible.
-
-* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
-"""
+
+SYSTEM_PROMPT = f"""
+You are an advanced AI assistant operating within a macOS Sequoia Version 15.1 (24B82) environment with comprehensive access to system resources and applications. Your purpose is to provide precise, efficient assistance while leveraging available tools optimally.
+
+System Configuration:
+1. Hardware Configuration:
+ - Model: MacBook Pro (15-inch, 2018)
+ - Processor: 2.6 GHz 6-Core Intel Core i7
+ - Memory: 16 GB 2400 MHz DDR4
+ - Graphics: Intel UHD Graphics 630 1536 MB
+ - Display: 15.4-inch Retina (2880 × 1800)
+ - Architecture: {platform.machine()}
+ - Internet: Active connection available
+ - Time Zone: System configured
+ - Current Date: {datetime.today().strftime('%A, %B %-d, %Y')}
+
+Installed Applications:
+1. Development Environment:
+ A. Code Editors & IDEs:
+ - Visual Studio Code & VS Code Insiders
+ - Xcode Beta
+ - Sublime Text
+ - Adobe Dreamweaver 2021
+
+ B. Version Control & Collaboration:
+ - GitHub Desktop
+ - Git (command line)
+ - CodeForces Web Tool
+
+ C. Container & Virtual Environments:
+ - Docker.app
+ - Docker CLI tools
+
+ D. Development Tools:
+ - Terminal
+ - Command Line Tools
+ - Developer.app
+
+2. Professional Suites:
+ A. Microsoft Office:
+ - Word
+ - Excel
+ - PowerPoint
+ - OneNote
+ - Outlook
+
+ B. Adobe Creative Cloud:
+ - Creative Cloud Manager
+ - Dreamweaver 2021
+ - Premiere Pro (Beta)
+ - Adobe UXP Developer Tools
+
+3. Web Browsers & Tools:
+ A. Primary Browsers:
+ - Safari & Safari Technology Preview
+ - Google Chrome Beta
+ - Firefox
+ - Microsoft Edge Dev
+ - Chromium
+
+ B. Specialized Browsers:
+ - Tor Browser (Standard & Alpha)
+
+ C. Browser Extensions:
+ - Grammarly for Safari
+ - Microsoft Bi for Safari
+
+4. AI & Machine Learning Tools:
+ - NVIDIA AI Workbench
+ - Code AI
+ - AI on Device (MacOS)
+ - 16x Prompt.app
+
+5. System Utilities:
+ A. File Management:
+ - Finder
+ - Preview
+ - The Unarchiver
+ - Unzip - RAR
+
+ B. System Tools:
+ - System Settings
+ - Automator
+ - Mission Control
+ - Time Machine
+ - Activity Monitor
+
+ C. Text Processing:
+ - TextEdit
+ - Notes
+
+ D. Security:
+ - Passwords.app
+ - G Authenticator
+ - BitPay
+ - Wasabi Wallet
+
+6. Communication & Collaboration:
+ - Messages
+ - Mail
+ - FaceTime
+ - Discord
+ - Zoom
+ - Messenger
+ - TextNow
+
+7. Media & Entertainment:
+ - QuickTime Player
+ - Photos
+ - Music
+ - TV
+ - Podcasts
+ - Photo Booth
+
+8. Productivity & Organization:
+ - Calendar
+ - Reminders
+ - Stickies
+ - Clock
+ - Calculator
+ - Weather
+ - Maps
+
+
+Operational Capabilities:
+1. File System Access:
+ - Read/Write operations in user directories
+ - Application data access
+ - Temporary file creation
+ - Archive handling
+
+2. Network Operations:
+ - HTTP/HTTPS requests
+ - API interactions
+ - Download capabilities
+ - Network diagnostics
+
+3. Automation Framework:
+ A. System Automation:
+ - Shortcuts.app
+ - Automator workflows
+ - AppleScript execution
+ - Shell scripting
+
+ B. Development Automation:
+ - Build tools
+ - Package managers
+ - Deployment scripts
+
+4. Security Protocols:
+ - Secure file operations
+ - Credential management
+ - Encryption capabilities
+ - Privacy controls
+
+Performance Guidelines:
+1. Resource Management:
+ - Monitor system resources
+ - Optimize heavy operations
+ - Cache management
+ - Background process awareness
+
+2. Error Handling:
+ - Graceful failure recovery
+ - User feedback
+ - Logging capabilities
+ - Debug information
+
+3. Operation Chaining:
+ - Minimize command calls
+ - Batch operations
+ - Efficient workflows
+ - Resource pooling
+
+
+For each user interaction, I will:
+1. Analyze request requirements
+2. Identify optimal tools/applications
+3. Validate resource availability
+4. Plan execution strategy
+5. Provide clear documentation
+6. Monitor execution
+7. Handle errors gracefully
+8. Confirm successful completion
+
+
+Each response will include:
+1. tags for analysis
+2. Task acknowledgment
+3. Resource identification
+4. Step-by-step execution plan
+5. Clear documentation
+6. Error handling procedures
+7. Success confirmation
+
+General Constraints:
+- Respect system permissions
+- Handle resource constraints
+- Consider operation timing
+- Maintain security protocols
+- Preserve user privacy
+- Account for network latency"""
async def sampling_loop(
*,
@@ -106,7 +259,7 @@ async def sampling_loop(
api_response_callback: Callable[[APIResponse[BetaMessage]], None],
api_key: str,
only_n_most_recent_images: int | None = None,
- max_tokens: int = 4096,
+ max_tokens: int = 8192,
):
"""
Agentic sampling loop for the assistant/tool interaction of computer use.
@@ -130,6 +283,13 @@ async def sampling_loop(
client = AnthropicVertex()
elif provider == APIProvider.BEDROCK:
client = AnthropicBedrock()
+ elif provider == APIProvider.BRICKS:
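+        # Bricks exposes an Anthropic-compatible endpoint, so the standard
+        # Anthropic client is simply pointed at the Bricks base URL.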
+ client = Anthropic(
+ api_key=api_key,
+ base_url="https://api.trybricks.ai/api/providers/anthropic",
+ )
+ else:
+ raise ValueError(f"Unsupported provider: {provider}")
# Call the API
# we use raw_response to provide debug information to streamlit. Your
diff --git a/streamlit.py b/streamlit.py
index a57a5607..9a095b92 100644
--- a/streamlit.py
+++ b/streamlit.py
@@ -10,30 +10,33 @@
from enum import StrEnum
from functools import partial
from pathlib import PosixPath
-from typing import cast
+from typing import cast, Any
+import json
-import streamlit as st
from anthropic import APIResponse
-from anthropic.types import (
- TextBlock,
-)
-from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
+from anthropic.types import Message
+from anthropic.types.beta import BetaMessage, BetaToolUseBlock
from anthropic.types.tool_use_block import ToolUseBlock
-from streamlit.delta_generator import DeltaGenerator
+from dotenv import load_dotenv
+import streamlit as st
+from streamlit.components.v1 import html
from loop import (
PROVIDER_TO_DEFAULT_MODEL_NAME,
APIProvider,
sampling_loop,
)
+from streamlit.delta_generator import DeltaGenerator
from tools import ToolResult
-from dotenv import load_dotenv
load_dotenv()
CONFIG_DIR = PosixPath("~/.anthropic").expanduser()
API_KEY_FILE = CONFIG_DIR / "api_key"
+
+# Custom CSS for styling and animations
STREAMLIT_STYLE = """
+
+
+
+
"""
WARNING_TEXT = ""
@@ -58,16 +155,16 @@ class Sender(StrEnum):
def setup_state():
+ """Initialize session state variables"""
if "messages" not in st.session_state:
st.session_state.messages = []
if "api_key" not in st.session_state:
- # Try to load API key from file first, then environment
st.session_state.api_key = load_from_storage("api_key") or os.getenv(
"ANTHROPIC_API_KEY", ""
)
if "provider" not in st.session_state:
st.session_state.provider = (
- os.getenv("API_PROVIDER", "anthropic") or APIProvider.ANTHROPIC
+ os.getenv("API_PROVIDER", "bricks") or APIProvider.BRICKS
)
if "provider_radio" not in st.session_state:
st.session_state.provider_radio = st.session_state.provider
@@ -85,6 +182,8 @@ def setup_state():
st.session_state.custom_system_prompt = load_from_storage("system_prompt") or ""
if "hide_images" not in st.session_state:
st.session_state.hide_images = False
+ if "controls_enabled" not in st.session_state:
+ st.session_state.controls_enabled = True
def _reset_model():
@@ -93,6 +192,10 @@ def _reset_model():
]
+def toggle_controls():
+ st.session_state.controls_enabled = not st.session_state.controls_enabled
+
+
async def main():
"""Render loop for streamlit"""
setup_state()
@@ -101,7 +204,18 @@ async def main():
st.title("Claude Computer Use for Mac")
- st.markdown("""This is from [Mac Computer Use](https://github.com/deedy/mac_computer_use), a fork of [Anthropic Computer Use](https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/README.md) to work natively on Mac.""")
+ # User controls toggle button (only visible to users)
+ col1, col2 = st.columns([3, 1])
+ with col2:
+ st.button("Toggle Controls (⌘ + Space)", on_click=toggle_controls)
+ if st.session_state.controls_enabled:
+ st.success("Controls Enabled")
+ else:
+ st.error("Controls Disabled")
+
+ st.markdown(
+ """This is from [Mac Computer Use](https://github.com/deedy/mac_computer_use), a fork of [Anthropic Computer Use](https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/README.md) to work natively on Mac."""
+ )
with st.sidebar:
@@ -122,9 +236,14 @@ def _reset_api_provider():
st.text_input("Model", key="model")
- if st.session_state.provider == APIProvider.ANTHROPIC:
+ if st.session_state.provider in [APIProvider.ANTHROPIC, APIProvider.BRICKS]:
+ api_key_label = (
+ "BricksAI Secret Key"
+ if st.session_state.provider == APIProvider.BRICKS
+ else "Anthropic API Key"
+ )
st.text_input(
- "Anthropic API Key",
+ api_key_label,
type="password",
key="api_key",
on_change=lambda: save_to_storage("api_key", st.session_state.api_key),
@@ -151,7 +270,7 @@ def _reset_api_provider():
st.session_state.clear()
setup_state()
- subprocess.run("pkill Xvfb; pkill tint2", shell=True) # noqa: ASYNC221
+        # pkill exits non-zero when nothing matches, so avoid check=True here
+        subprocess.run("pkill Xvfb; pkill tint2", shell=True, check=False)  # noqa: ASYNC221
await asyncio.sleep(1)
subprocess.run("./start_all.sh", shell=True) # noqa: ASYNC221
@@ -170,72 +289,84 @@ def _reset_api_provider():
)
with chat:
- # render past chats
- for message in st.session_state.messages:
- if isinstance(message["content"], str):
- _render_message(message["role"], message["content"])
- elif isinstance(message["content"], list):
- for block in message["content"]:
- # the tool result we send back to the Anthropic API isn't sufficient to render all details,
- # so we store the tool use responses
- if isinstance(block, dict) and block["type"] == "tool_result":
- _render_message(
- Sender.TOOL, st.session_state.tools[block["tool_use_id"]]
- )
- else:
- _render_message(
- message["role"],
- cast(BetaTextBlock | BetaToolUseBlock, block),
- )
-
- # render past http exchanges
- for identity, response in st.session_state.responses.items():
- _render_api_response(response, identity, http_logs)
-
- # render past chats
- if new_message:
- st.session_state.messages.append(
- {
- "role": Sender.USER,
- "content": [TextBlock(type="text", text=new_message)],
- }
- )
- _render_message(Sender.USER, new_message)
-
- try:
- most_recent_message = st.session_state["messages"][-1]
- except IndexError:
- return
-
- if most_recent_message["role"] is not Sender.USER:
- # we don't have a user message to respond to, exit early
- return
-
- with st.spinner("Running Agent..."):
- # run the agent sampling loop with the newest message
- st.session_state.messages = await sampling_loop(
- system_prompt_suffix=st.session_state.custom_system_prompt,
- model=st.session_state.model,
- provider=st.session_state.provider,
- messages=st.session_state.messages,
- output_callback=partial(_render_message, Sender.BOT),
- tool_output_callback=partial(
- _tool_output_callback, tool_state=st.session_state.tools
- ),
- api_response_callback=partial(
- _api_response_callback,
- tab=http_logs,
- response_state=st.session_state.responses,
- ),
- api_key=st.session_state.api_key,
- only_n_most_recent_images=st.session_state.only_n_most_recent_images,
- )
+ # Create a container for auto-scrolling
+ chat_container = st.container()
+ with chat_container:
+ # render past chats
+ for message in st.session_state.messages:
+ if isinstance(message["content"], str):
+ _render_message(message["role"], message["content"])
+ elif isinstance(message["content"], list):
+ for block in message["content"]:
+ if isinstance(block, dict) and block["type"] == "tool_result":
+ _render_message(
+ Sender.TOOL,
+ st.session_state.tools[block["tool_use_id"]],
+ )
+ else:
+ _render_message(
+ message["role"],
+ cast(Any, block),
+ )
+
+ # render past http exchanges
+ for identity, response in st.session_state.responses.items():
+ _render_api_response(response, identity, http_logs)
+
+ # render new message
+ if new_message:
+ st.session_state.messages.append(
+ {
+ "role": Sender.USER,
+ "content": [{"type": "text", "text": new_message}],
+ }
+ )
+ _render_message(Sender.USER, new_message)
+
+ try:
+ most_recent_message = st.session_state["messages"][-1]
+ except IndexError:
+ return
+
+ if most_recent_message["role"] is not Sender.USER:
+ return
+
+ with st.spinner("Running Agent..."):
+ st.session_state.messages = await sampling_loop(
+ system_prompt_suffix=st.session_state.custom_system_prompt,
+ model=st.session_state.model,
+ provider=st.session_state.provider,
+ messages=st.session_state.messages,
+ output_callback=partial(_render_message, Sender.BOT),
+ tool_output_callback=partial(
+ _tool_output_callback, tool_state=st.session_state.tools
+ ),
+ api_response_callback=partial(
+ _api_response_callback,
+ tab=http_logs,
+ response_state=st.session_state.responses,
+ ),
+ api_key=st.session_state.api_key,
+ only_n_most_recent_images=st.session_state.only_n_most_recent_images,
+ )
+
+ # Auto scroll after rendering
+ html("""
+
+ """)
def validate_auth(provider: APIProvider, api_key: str | None):
- if provider == APIProvider.ANTHROPIC:
+ if provider in [APIProvider.ANTHROPIC, APIProvider.BRICKS]:
if not api_key:
- return "Enter your Anthropic API key in the sidebar to continue."
+ key_type = (
+ "BricksAI Secret Key"
+ if provider == APIProvider.BRICKS
+ else "Anthropic API Key"
+ )
+ return f"Enter your {key_type} in the sidebar to continue."
if provider == APIProvider.BEDROCK:
import boto3
@@ -285,9 +416,7 @@ def _api_response_callback(
tab: DeltaGenerator,
response_state: dict[str, APIResponse[BetaMessage]],
):
- """
- Handle an API response by storing it to state and rendering it.
- """
+ """Handle an API response by storing it to state and rendering it."""
response_id = datetime.now().isoformat()
response_state[response_id] = response
_render_api_response(response, response_id, tab)
@@ -300,6 +429,33 @@ def _tool_output_callback(
tool_state[tool_id] = tool_output
_render_message(Sender.TOOL, tool_output)
+ # Update mouse tracker for mouse movements
+ if hasattr(tool_output, "output") and "cliclick m:" in str(tool_output.output):
+ coords = str(tool_output.output).split("m:")[1].strip().split(",")
+ if len(coords) == 2:
+ html(f"""
+
+ """)
+
+ # Show click animation for clicks
+ if hasattr(tool_output, "output") and any(
+ cmd in str(tool_output.output) for cmd in ["c:", "rc:", "dc:", "mc:"]
+ ):
+ # Get current mouse position from tracker
+ html("""
+
+ """)
+
def _render_api_response(
response: APIResponse[BetaMessage], response_id: str, tab: DeltaGenerator
@@ -320,10 +476,9 @@ def _render_api_response(
def _render_message(
sender: Sender,
- message: str | BetaTextBlock | BetaToolUseBlock | ToolResult,
+ message: str | dict | BetaToolUseBlock | ToolResult,
):
"""Convert input from the user or output from the agent to a streamlit message."""
- # streamlit's hotreloading breaks isinstance checks, so we need to check for class names
is_tool_result = not isinstance(message, str) and (
isinstance(message, ToolResult)
or message.__class__.__name__ == "ToolResult"
@@ -348,8 +503,8 @@ def _render_message(
st.error(message.error)
if message.base64_image and not st.session_state.hide_images:
st.image(base64.b64decode(message.base64_image))
- elif isinstance(message, BetaTextBlock) or isinstance(message, TextBlock):
- st.write(message.text)
+ elif isinstance(message, dict) and message.get("type") == "text":
+ st.write(message.get("text", ""))
elif isinstance(message, BetaToolUseBlock) or isinstance(message, ToolUseBlock):
st.code(f"Tool Use: {message.name}\nInput: {message.input}")
else:
diff --git a/tools/computer.py b/tools/computer.py
index 0e7646fb..35655496 100644
--- a/tools/computer.py
+++ b/tools/computer.py
@@ -2,22 +2,26 @@
import base64
import os
import shlex
-import pyautogui
-import keyboard
from enum import StrEnum
from pathlib import Path
from typing import Literal, TypedDict
from uuid import uuid4
+from io import BytesIO
+from PIL import Image
from anthropic.types.beta import BetaToolComputerUse20241022Param
from .base import BaseAnthropicTool, ToolError, ToolResult
from .run import run
+# Constants
OUTPUT_DIR = "/tmp/outputs"
-
TYPING_DELAY_MS = 12
TYPING_GROUP_SIZE = 50
+MAX_IMAGE_SIZE = 5 * 1024 * 1024 # 5MB in bytes
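+# Screenshots larger than this are compressed (see compress_image below) before being returned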
+
+# Check if we're running in a codespace environment
+IS_CODESPACE = os.environ.get("CODESPACES") == "true"
Action = Literal[
"key",
@@ -63,11 +67,30 @@ def chunks(s: str, chunk_size: int) -> list[str]:
return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]
+def compress_image(image_data: bytes, max_size: int = MAX_IMAGE_SIZE) -> bytes:
+    """Compress image data until it is under the specified max size."""
+    img = Image.open(BytesIO(image_data))
+    output = BytesIO()
+    img.save(output, format="PNG", optimize=True)
+
+    # Pillow's PNG encoder ignores JPEG-style quality settings, so halve the
+    # dimensions until the encoded size fits (or the image cannot shrink further).
+    while output.tell() > max_size and min(img.size) > 1:
+        img = img.resize(
+            (max(img.width // 2, 1), max(img.height // 2, 1)), Image.LANCZOS
+        )
+        output = BytesIO()
+        img.save(output, format="PNG", optimize=True)
+
+    return output.getvalue()
+
+
class ComputerTool(BaseAnthropicTool):
"""
- A tool that allows the agent to interact with the screen, keyboard, and mouse of the current macOS computer.
+ A tool that allows the agent to interact with the screen, keyboard, and mouse.
The tool parameters are defined by Anthropic and are not editable.
- Requires cliclick to be installed: brew install cliclick
"""
name: Literal["computer"] = "computer"
@@ -76,7 +99,7 @@ class ComputerTool(BaseAnthropicTool):
height: int
display_num: int | None
- _screenshot_delay = 1.0 # macOS is generally faster than X11
+ _screenshot_delay = 1.0
_scaling_enabled = True
@property
@@ -93,9 +116,13 @@ def to_params(self) -> BetaToolComputerUse20241022Param:
def __init__(self):
super().__init__()
- self.width, self.height = pyautogui.size()
- assert self.width and self.height, "WIDTH, HEIGHT must be set"
- self.display_num = None # macOS doesn't use X11 display numbers
+ # Set default dimensions
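+        # (WIDTH/HEIGHT environment variables override these defaults; see activate.sh)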
+ self.width = int(os.environ.get("WIDTH", 1366))
+ self.height = int(os.environ.get("HEIGHT", 768))
+ self.display_num = None
+
+ if IS_CODESPACE:
+ print("Running in codespace environment - some features may be limited")
async def __call__(
self,
@@ -106,6 +133,12 @@ async def __call__(
**kwargs,
):
print("Action: ", action, text, coordinate)
+
+ if IS_CODESPACE:
+ return ToolResult(
+ error="This action is not supported in codespace environment. This tool is designed for macOS systems."
+ )
+
if action in ("mouse_move", "left_click_drag"):
if coordinate is None:
raise ToolError(f"coordinate is required for {action}")
@@ -116,7 +149,9 @@ async def __call__(
if not all(isinstance(i, int) and i >= 0 for i in coordinate):
raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
- x, y = self.scale_coordinates(ScalingSource.API, coordinate[0], coordinate[1])
+ x, y = self.scale_coordinates(
+ ScalingSource.API, coordinate[0], coordinate[1]
+ )
if action == "mouse_move":
return await self.shell(f"cliclick m:{x},{y}")
@@ -129,45 +164,44 @@ async def __call__(
if coordinate is not None:
raise ToolError(f"coordinate is not accepted for {action}")
if not isinstance(text, str):
- raise ToolError(output=f"{text} must be a string")
+ raise ToolError("Text input must be a string")
if action == "key":
- # Convert common key names to pyautogui format
+ # Use cliclick for key presses
key_map = {
- "Return": "enter",
- "space": "space",
- "Tab": "tab",
- "Left": "left",
- "Right": "right",
- "Up": "up",
- "Down": "down",
- "Escape": "esc",
- "command": "command",
- "cmd": "command",
- "alt": "alt",
- "shift": "shift",
- "ctrl": "ctrl"
+                "Return": "kp:return",
+                "space": "kp:space",
+                "Tab": "kp:tab",
+                "Left": "kp:arrow-left",
+                "Right": "kp:arrow-right",
+                "Up": "kp:arrow-up",
+                "Down": "kp:arrow-down",
+                "Escape": "kp:esc",
+            }
+            # cliclick handles modifiers via hold/release (kd:/ku:), not key presses (kp:)
+            modifier_map = {
+                "command": "cmd",
+                "cmd": "cmd",
+                "option": "alt",
+                "alt": "alt",
+                "shift": "shift",
+                "ctrl": "ctrl",
}
try:
if "+" in text:
# Handle combinations like "ctrl+c"
keys = text.split("+")
- mapped_keys = [key_map.get(k.strip(), k.strip()) for k in keys]
- await asyncio.get_event_loop().run_in_executor(
- None, keyboard.press_and_release, '+'.join(mapped_keys)
- )
+                    *modifiers, key = [k.strip() for k in keys]
+                    mods = ",".join(modifier_map.get(m, m) for m in modifiers)
+                    # Hold the modifiers, press or type the final key, then release
+                    key_part = key_map.get(key, f"t:{key}")
+                    cmd = f"cliclick kd:{mods} {key_part} ku:{mods}"
else:
# Handle single keys
- mapped_key = key_map.get(text, text)
- await asyncio.get_event_loop().run_in_executor(
- None, keyboard.press_and_release, mapped_key
- )
+ mapped_key = key_map.get(text, f"kp:{text}")
+ cmd = f"cliclick {mapped_key}"
- return ToolResult(output=f"Pressed key: {text}", error=None, base64_image=None)
+ return await self.shell(cmd)
except Exception as e:
- return ToolResult(output=None, error=str(e), base64_image=None)
+ return ToolResult(error=str(e))
+
elif action == "type":
results: list[ToolResult] = []
for chunk in chunks(text, TYPING_GROUP_SIZE):
@@ -200,7 +234,6 @@ async def __call__(
"cliclick p",
take_screenshot=False,
)
- import pdb; pdb.set_trace()
if result.output:
x, y = map(int, result.output.strip().split(","))
x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
@@ -219,26 +252,35 @@ async def __call__(
async def screenshot(self):
"""Take a screenshot of the current screen and return the base64 encoded image."""
+ if IS_CODESPACE:
+ return ToolResult(
+ error="Screenshot functionality is not available in codespace environment"
+ )
+
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True)
path = output_dir / f"screenshot_{uuid4().hex}.png"
- # Use macOS native screencapture
- screenshot_cmd = f"screencapture -x {path}"
- result = await self.shell(screenshot_cmd, take_screenshot=False)
+ try:
+ # Use screencapture on macOS
+ result = await self.shell(f"screencapture -x {path}")
+ if result.error:
+ return result
- if self._scaling_enabled:
- x, y = SCALE_DESTINATION['width'], SCALE_DESTINATION['height']
- await self.shell(
- f"sips -z {y} {x} {path}", # sips is macOS native image processor
- take_screenshot=False
- )
-
- if path.exists():
- return result.replace(
- base64_image=base64.b64encode(path.read_bytes()).decode()
- )
- raise ToolError(f"Failed to take screenshot: {result.error}")
+ if path.exists():
+ # Read the image and compress if necessary
+ image_data = path.read_bytes()
+ if len(image_data) > MAX_IMAGE_SIZE:
+ image_data = compress_image(image_data)
+
+ return ToolResult(base64_image=base64.b64encode(image_data).decode())
+ return ToolResult(error="Screenshot file was not created")
+ except Exception as e:
+ return ToolResult(error=f"Failed to take screenshot: {str(e)}")
+ finally:
+ # Clean up the temporary file
+ if path.exists():
+ path.unlink()
async def shell(self, command: str, take_screenshot=False) -> ToolResult:
"""Run a shell command and return the output, error, and optionally a screenshot."""
@@ -252,7 +294,9 @@ async def shell(self, command: str, take_screenshot=False) -> ToolResult:
return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
- def scale_coordinates(self, source: ScalingSource, x: int, y: int) -> tuple[int, int]:
+ def scale_coordinates(
+ self, source: ScalingSource, x: int, y: int
+ ) -> tuple[int, int]:
"""
Scale coordinates between original resolution and target resolution (SCALE_DESTINATION).
@@ -268,13 +312,15 @@ def scale_coordinates(self, source: ScalingSource, x: int, y: int) -> tuple[int,
return x, y
# Calculate scaling factors
- x_scaling_factor = SCALE_DESTINATION['width'] / self.width
- y_scaling_factor = SCALE_DESTINATION['height'] / self.height
+ x_scaling_factor = SCALE_DESTINATION["width"] / self.width
+ y_scaling_factor = SCALE_DESTINATION["height"] / self.height
if source == ScalingSource.API:
# Scale up from SCALE_DESTINATION to original resolution
- if x > SCALE_DESTINATION['width'] or y > SCALE_DESTINATION['height']:
- raise ToolError(f"Coordinates {x}, {y} are out of bounds for {SCALE_DESTINATION['width']}x{SCALE_DESTINATION['height']}")
+ if x > SCALE_DESTINATION["width"] or y > SCALE_DESTINATION["height"]:
+ raise ToolError(
+ f"Coordinates {x}, {y} are out of bounds for {SCALE_DESTINATION['width']}x{SCALE_DESTINATION['height']}"
+ )
return round(x / x_scaling_factor), round(y / y_scaling_factor)
else:
# Scale down from original resolution to SCALE_DESTINATION