6 changes: 4 additions & 2 deletions README.md
@@ -93,9 +93,9 @@ pip install -r requirements.txt
### 🤖 Step 2. Configure the Agent

AppAgent needs to be powered by a multi-modal model which can receive both text and visual inputs. During our experiment
-, we used `gpt-4-vision-preview` as the model to make decisions on how to take actions to complete a task on the smartphone.
+, we used `gpt-4o` as the model to make decisions on how to take actions to complete a task on the smartphone.

-To configure your requests to GPT-4V, you should modify `config.yaml` in the root directory.
+To configure your requests to GPT-4, you should modify `config.yaml` in the root directory.
There are two key parameters that must be configured to try AppAgent:
1. OpenAI API key: you must purchase an eligible API key from OpenAI so that you can have access to GPT-4V.
2. Request interval: this is the time interval in seconds between consecutive GPT-4V requests to control the frequency
@@ -111,6 +111,8 @@ free to use but its performance in the context of AppAgent is poorer compared wi
To use it, you should create an Alibaba Cloud account and [create a Dashscope API key](https://help.aliyun.com/zh/dashscope/developer-reference/activate-dashscope-and-create-an-api-key?spm=a2c4g.11186623.0.i1) to fill in the `DASHSCOPE_API_KEY` field
in the `config.yaml` file. Change the `MODEL` field from `OpenAI` to `Qwen` as well.

+You can also try `Gemini` as an alternative multi-modal model to power AppAgent. To use it, create an API key in Google AI Studio and fill it into the `GEMINI_API_KEY` field in the `config.yaml` file. Change the `MODEL` field to `Gemini` as well.

If you want to test AppAgent using your own models, you should write a new model class in `scripts/model.py` accordingly.
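As a rough sketch, such a class only needs to subclass `BaseModel` and implement `get_model_response`, which returns a `(success, message)` tuple. Everything below other than that interface (the `MyModel` name, the endpoint URL, and the response field) is an illustrative placeholder:

```python
from typing import List, Tuple

import requests

from model import BaseModel


class MyModel(BaseModel):
    """Hypothetical adapter for a self-hosted multi-modal endpoint."""

    def __init__(self, api_key: str, model: str):
        super().__init__()
        self.api_key = api_key
        self.model = model

    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        # Forward the prompt and screenshot paths to your own backend and
        # return (True, answer) on success or (False, error) on failure.
        try:
            resp = requests.post(
                "https://example.com/v1/chat",  # placeholder endpoint
                headers={"Authorization": f"Bearer {self.api_key}"},
                json={"model": self.model, "prompt": prompt, "images": images},
            )
            resp.raise_for_status()
            return True, resp.json()["answer"]  # hypothetical response field
        except Exception as e:
            return False, str(e)
```

You would then add a matching `elif` branch to the model selection blocks in `scripts/self_explorer.py` and `scripts/document_generation.py`.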

### 🔍 Step 3. Exploration Phase
7 changes: 5 additions & 2 deletions config.yaml
@@ -1,15 +1,18 @@
MODEL: "OpenAI" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI or Qwen
MODEL: "Gemini" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI, Qwen or Gemini

OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
OPENAI_API_KEY: "sk-" # Set the value to sk-xxx if you host the openai interface for open llm model
OPENAI_API_MODEL: "gpt-4-vision-preview" # The only OpenAI model by now that accepts visual input
OPENAI_API_MODEL: "gpt-4o" # The only OpenAI model by now that accepts visual input
MAX_TOKENS: 300 # The max token limit for the response completion
TEMPERATURE: 0.0 # The temperature of the model: the lower the value, the more consistent the output of the model
REQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests

DASHSCOPE_API_KEY: "sk-" # The dashscope API key that gives you access to Qwen-VL model
QWEN_MODEL: "qwen-vl-max"

+GEMINI_API_KEY: "AI" # Your Gemini API key
+GEMINI_MODEL: "gemini-1.5-flash" # The Gemini model to use

ANDROID_SCREENSHOT_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate screenshots. Make sure the directory EXISTS on your phone!
ANDROID_XML_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate XML files used for determining locations of UI elements on your screen. Make sure the directory EXISTS on your phone!

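The scripts read these values through `load_config` from `config.py`, imported in the diffs below. As a minimal sketch of what such a loader does, assuming it simply parses the YAML with PyYAML (the real implementation may differ, e.g. by also reading environment variables):

```python
import yaml


def load_config(config_path: str = "./config.yaml") -> dict:
    # Parse config.yaml into a plain dict whose keys match the fields above,
    # e.g. configs["MODEL"], configs["GEMINI_API_KEY"], configs["GEMINI_MODEL"].
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
```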
2 changes: 1 addition & 1 deletion scripts/and_controller.py
@@ -175,6 +175,6 @@ def swipe(self, x, y, direction, dist="medium", quick=False):
    def swipe_precise(self, start, end, duration=400):
        start_x, start_y = start
        end_x, end_y = end
-        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_x} {end_x} {end_y} {duration}"
+        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_y} {end_x} {end_y} {duration}"
        ret = execute_adb(adb_command)
        return ret
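The one-character fix above is easy to miss: the old f-string interpolated `start_x` twice, so every precise swipe started from the wrong y-coordinate. A quick check with made-up values shows the difference:

```python
start, end, duration, device = (100, 200), (300, 400), 400, "emulator-5554"
start_x, start_y = start
end_x, end_y = end

buggy = f"adb -s {device} shell input swipe {start_x} {start_x} {end_x} {end_y} {duration}"
fixed = f"adb -s {device} shell input swipe {start_x} {start_y} {end_x} {end_y} {duration}"
print(buggy)  # adb -s emulator-5554 shell input swipe 100 100 300 400 400  (y1 wrong)
print(fixed)  # adb -s emulator-5554 shell input swipe 100 200 300 400 400
```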
5 changes: 4 additions & 1 deletion scripts/document_generation.py
@@ -8,7 +8,7 @@

import prompts
from config import load_config
-from model import OpenAIModel, QwenModel
+from model import OpenAIModel, QwenModel, GeminiModel
from utils import print_with_color

arg_desc = "AppAgent - Human Demonstration"
@@ -29,6 +29,9 @@
elif configs["MODEL"] == "Qwen":
    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                     model=configs["QWEN_MODEL"])
+elif configs["MODEL"] == "Gemini":
+    mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"],
+                       model=configs["GEMINI_MODEL"])
else:
    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
    sys.exit()
34 changes: 30 additions & 4 deletions scripts/model.py
@@ -1,6 +1,6 @@
import re
from abc import abstractmethod
-from typing import List
+from typing import List, Tuple
from http import HTTPStatus

import requests
@@ -14,7 +14,7 @@ def __init__(self):
        pass

    @abstractmethod
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        pass


@@ -27,7 +27,7 @@ def __init__(self, base_url: str, api_key: str, model: str, temperature: float,
        self.temperature = temperature
        self.max_tokens = max_tokens

-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        content = [
            {
                "type": "text",
@@ -76,7 +76,7 @@ def __init__(self, api_key: str, model: str):
        self.model = model
        dashscope.api_key = api_key

-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        content = [{
            "text": prompt
        }]
@@ -97,6 +97,32 @@ def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
        else:
            return False, response.message

+
+class GeminiModel(BaseModel):
+    def __init__(self, api_key: str, model: str):
+        super().__init__()
+        self.api_key = api_key
+        self.model = model
+
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
+        import base64  # local import; keeps this PR's changes self-contained
+        # Build a multi-part request: one text part, then one inline image part
+        # per screenshot (assumed PNG, as produced by the controller), following
+        # the Gemini generateContent REST API. The API key is passed as a query
+        # parameter rather than an Authorization header.
+        parts = [{"text": prompt}]
+        for img in images:
+            with open(img, "rb") as f:
+                data = base64.b64encode(f.read()).decode("utf-8")
+            parts.append({"inline_data": {"mime_type": "image/png", "data": data}})
+        payload = {"contents": [{"parts": parts}]}
+        url = (f"https://generativelanguage.googleapis.com/v1beta/models/"
+               f"{self.model}:generateContent?key={self.api_key}")
+        try:
+            response = requests.post(url, json=payload)
+            response.raise_for_status()
+            response_data = response.json()
+            return True, response_data["candidates"][0]["content"]["parts"][0]["text"]
+        except Exception as e:
+            return False, str(e)

def parse_explore_rsp(rsp):
    try:
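Wired up through the dispatch added below, or constructed by hand, the new class is used the same way as the other backends; the key and screenshot path here are placeholders:

```python
from model import GeminiModel

mllm = GeminiModel(api_key="AI...",  # placeholder API key
                   model="gemini-1.5-flash")
ok, answer = mllm.get_model_response("Describe what is on this screen.",
                                     ["./screenshot_1.png"])  # hypothetical path
print(answer if ok else f"Request failed: {answer}")
```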
5 changes: 4 additions & 1 deletion scripts/self_explorer.py
@@ -10,7 +10,7 @@
import prompts
from config import load_config
from and_controller import list_all_devices, AndroidController, traverse_tree
-from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
+from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel, GeminiModel
from utils import print_with_color, draw_bbox_multi

arg_desc = "AppAgent - Autonomous Exploration"
@@ -30,6 +30,9 @@
elif configs["MODEL"] == "Qwen":
    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                     model=configs["QWEN_MODEL"])
+elif configs["MODEL"] == "Gemini":
+    mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"],
+                       model=configs["GEMINI_MODEL"])
else:
    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
    sys.exit()