6 changes: 4 additions & 2 deletions README.md
@@ -93,9 +93,9 @@ pip install -r requirements.txt
### 🤖 Step 2. Configure the Agent

AppAgent needs to be powered by a multi-modal model which can receive both text and visual inputs. During our experiment
-, we used `gpt-4-vision-preview` as the model to make decisions on how to take actions to complete a task on the smartphone.
+, we used `gpt-4o` as the model to make decisions on how to take actions to complete a task on the smartphone.

-To configure your requests to GPT-4V, you should modify `config.yaml` in the root directory.
+To configure your requests to GPT-4, you should modify `config.yaml` in the root directory.
There are two key parameters that must be configured to try AppAgent:
1. OpenAI API key: you must purchase an eligible API key from OpenAI so that you can have access to GPT-4V.
2. Request interval: this is the time interval in seconds between consecutive GPT-4V requests to control the frequency
@@ -111,6 +111,8 @@ free to use but its performance in the context of AppAgent is poorer compared wi
To use it, you should create an Alibaba Cloud account and [create a Dashscope API key](https://help.aliyun.com/zh/dashscope/developer-reference/activate-dashscope-and-create-an-api-key?spm=a2c4g.11186623.0.i1) to fill in the `DASHSCOPE_API_KEY` field
in the `config.yaml` file. Change the `MODEL` field from `OpenAI` to `Qwen` as well.

+You can also try `Gemini` as an alternative multi-modal model to power AppAgent. To use it, create an API key in Google AI Studio and fill it into the `GEMINI_API_KEY` field in the `config.yaml` file. Change the `MODEL` field to `Gemini` as well.

If you want to test AppAgent using your own models, you should write a new model class in `scripts/model.py` accordingly.
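As a rough sketch, such a class only needs to subclass `BaseModel` and implement `get_model_response`, which returns a `(success, message)` tuple. Everything below other than that interface (the `MyModel` name, the endpoint URL, and the response field) is an illustrative placeholder:

```python
from typing import List, Tuple

import requests

from model import BaseModel


class MyModel(BaseModel):
    """Hypothetical adapter for a self-hosted multi-modal endpoint."""

    def __init__(self, api_key: str, model: str):
        super().__init__()
        self.api_key = api_key
        self.model = model

    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        # Forward the prompt and screenshot paths to your own backend and
        # return (True, answer) on success or (False, error) on failure.
        try:
            resp = requests.post(
                "https://example.com/v1/chat",  # placeholder endpoint
                headers={"Authorization": f"Bearer {self.api_key}"},
                json={"model": self.model, "prompt": prompt, "images": images},
            )
            resp.raise_for_status()
            return True, resp.json()["answer"]  # hypothetical response field
        except Exception as e:
            return False, str(e)
```

You would then add a matching `elif` branch to the model selection blocks in `scripts/self_explorer.py` and `scripts/document_generation.py`.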

### 🔍 Step 3. Exploration Phase
7 changes: 5 additions & 2 deletions config.yaml
@@ -1,15 +1,18 @@
MODEL: "OpenAI" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI or Qwen
MODEL: "Gemini" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI, Qwen or Gemini

OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
OPENAI_API_KEY: "sk-" # Set the value to sk-xxx if you host the openai interface for open llm model
OPENAI_API_MODEL: "gpt-4-vision-preview" # The only OpenAI model by now that accepts visual input
OPENAI_API_MODEL: "gpt-4o" # The only OpenAI model by now that accepts visual input
MAX_TOKENS: 300 # The max token limit for the response completion
TEMPERATURE: 0.0 # The temperature of the model: the lower the value, the more consistent the output of the model
REQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests

DASHSCOPE_API_KEY: "sk-" # The dashscope API key that gives you access to Qwen-VL model
QWEN_MODEL: "qwen-vl-max"

+GEMINI_API_KEY: "AI" # Your Gemini API key
+GEMINI_MODEL: "gemini-1.5-flash" # The Gemini model to use

ANDROID_SCREENSHOT_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate screenshots. Make sure the directory EXISTS on your phone!
ANDROID_XML_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate XML files used for determining locations of UI elements on your screen. Make sure the directory EXISTS on your phone!

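The scripts read these values through `load_config` from `config.py`, imported in the diffs below. As a minimal sketch of what such a loader does, assuming it simply parses the YAML with PyYAML (the real implementation may differ, e.g. by also reading environment variables):

```python
import yaml


def load_config(config_path: str = "./config.yaml") -> dict:
    # Parse config.yaml into a plain dict whose keys match the fields above,
    # e.g. configs["MODEL"], configs["GEMINI_API_KEY"], configs["GEMINI_MODEL"].
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
```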
2 changes: 1 addition & 1 deletion scripts/and_controller.py
@@ -175,6 +175,6 @@ def swipe(self, x, y, direction, dist="medium", quick=False):
    def swipe_precise(self, start, end, duration=400):
        start_x, start_y = start
        end_x, end_y = end
-        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_x} {end_x} {end_y} {duration}"
+        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_y} {end_x} {end_y} {duration}"
        ret = execute_adb(adb_command)
        return ret
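The one-character fix above is easy to miss: the old f-string interpolated `start_x` twice, so every precise swipe started from the wrong y-coordinate. A quick check with made-up values shows the difference:

```python
start, end, duration, device = (100, 200), (300, 400), 400, "emulator-5554"
start_x, start_y = start
end_x, end_y = end

buggy = f"adb -s {device} shell input swipe {start_x} {start_x} {end_x} {end_y} {duration}"
fixed = f"adb -s {device} shell input swipe {start_x} {start_y} {end_x} {end_y} {duration}"
print(buggy)  # adb -s emulator-5554 shell input swipe 100 100 300 400 400  (y1 wrong)
print(fixed)  # adb -s emulator-5554 shell input swipe 100 200 300 400 400
```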
5 changes: 4 additions & 1 deletion scripts/document_generation.py
@@ -8,7 +8,7 @@

import prompts
from config import load_config
-from model import OpenAIModel, QwenModel
+from model import OpenAIModel, QwenModel, GeminiModel
from utils import print_with_color

arg_desc = "AppAgent - Human Demonstration"
@@ -29,6 +29,9 @@
elif configs["MODEL"] == "Qwen":
    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                     model=configs["QWEN_MODEL"])
+elif configs["MODEL"] == "Gemini":
+    mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"],
+                       model=configs["GEMINI_MODEL"])
else:
    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
    sys.exit()
34 changes: 30 additions & 4 deletions scripts/model.py
@@ -1,6 +1,6 @@
import re
from abc import abstractmethod
-from typing import List
+from typing import List, Tuple
from http import HTTPStatus

import requests
@@ -14,7 +14,7 @@ def __init__(self):
        pass

    @abstractmethod
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        pass


@@ -27,7 +27,7 @@ def __init__(self, base_url: str, api_key: str, model: str, temperature: float,
        self.temperature = temperature
        self.max_tokens = max_tokens

-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        content = [
            {
                "type": "text",
@@ -76,7 +76,7 @@ def __init__(self, api_key: str, model: str):
        self.model = model
        dashscope.api_key = api_key

-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        content = [{
            "text": prompt
        }]
@@ -97,6 +97,32 @@ def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
        else:
            return False, response.message

+
+class GeminiModel(BaseModel):
+    def __init__(self, api_key: str, model: str):
+        super().__init__()
+        self.api_key = api_key
+        self.model = model
+
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
+        import base64  # local import; keeps this PR's changes self-contained
+        # Build a multi-part request: one text part, then one inline image part
+        # per screenshot (assumed PNG, as produced by the controller), following
+        # the Gemini generateContent REST API. The API key is passed as a query
+        # parameter rather than an Authorization header.
+        parts = [{"text": prompt}]
+        for img in images:
+            with open(img, "rb") as f:
+                data = base64.b64encode(f.read()).decode("utf-8")
+            parts.append({"inline_data": {"mime_type": "image/png", "data": data}})
+        payload = {"contents": [{"parts": parts}]}
+        url = (f"https://generativelanguage.googleapis.com/v1beta/models/"
+               f"{self.model}:generateContent?key={self.api_key}")
+        try:
+            response = requests.post(url, json=payload)
+            response.raise_for_status()
+            response_data = response.json()
+            return True, response_data["candidates"][0]["content"]["parts"][0]["text"]
+        except Exception as e:
+            return False, str(e)

def parse_explore_rsp(rsp):
    try:
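Wired up through the dispatch added below, or constructed by hand, the new class is used the same way as the other backends; the key and screenshot path here are placeholders:

```python
from model import GeminiModel

mllm = GeminiModel(api_key="AI...",  # placeholder API key
                   model="gemini-1.5-flash")
ok, answer = mllm.get_model_response("Describe what is on this screen.",
                                     ["./screenshot_1.png"])  # hypothetical path
print(answer if ok else f"Request failed: {answer}")
```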
5 changes: 4 additions & 1 deletion scripts/self_explorer.py
@@ -10,7 +10,7 @@
import prompts
from config import load_config
from and_controller import list_all_devices, AndroidController, traverse_tree
-from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
+from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel, GeminiModel
from utils import print_with_color, draw_bbox_multi

arg_desc = "AppAgent - Autonomous Exploration"
@@ -30,6 +30,9 @@
elif configs["MODEL"] == "Qwen":
    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                     model=configs["QWEN_MODEL"])
+elif configs["MODEL"] == "Gemini":
+    mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"],
+                       model=configs["GEMINI_MODEL"])
else:
    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
    sys.exit()