Loading...
Loading...
Expert skill for Open-AutoGLM, an AI phone agent framework that controls Android/HarmonyOS/iOS devices via natural language using the AutoGLM vision-language model
npx skill4agent add aradotso/trending-skills open-autoglm-phone-agent
Skill by ara.so — Daily 2026 Skills collection.
User Natural Language → AutoGLM VLM → Screen Perception → ADB/HDC/WebDriverAgent → Device Actions
git clone https://github.com/zai-org/Open-AutoGLM.git
cd Open-AutoGLM
pip install -r requirements.txt
pip install -e .
# Android
adb devices
# Expected: emulator-5554 device
# HarmonyOS NEXT
hdc list targets
# Expected: 7001005458323933328a01bce01c2500
export BIGMODEL_API_KEY="your-bigmodel-api-key"
python main.py \
--base-url https://open.bigmodel.cn/api/paas/v4 \
--model "autoglm-phone" \
--apikey $BIGMODEL_API_KEY \
"打开美团搜索附近的火锅店"
export MODELSCOPE_API_KEY="your-modelscope-api-key"
python main.py \
--base-url https://api-inference.modelscope.cn/v1 \
--model "ZhipuAI/AutoGLM-Phone-9B" \
--apikey $MODELSCOPE_API_KEY \
"open Meituan and find nearby hotpot"
# Install vLLM (or use official Docker: docker pull vllm/vllm-openai:v0.12.0)
pip install vllm
# Start model server (strictly follow these parameters)
python3 -m vllm.entrypoints.openai.api_server \
--served-model-name autoglm-phone-9b \
--allowed-local-media-path / \
--mm-encoder-tp-mode data \
--mm_processor_cache_type shm \
--mm_processor_kwargs '{"max_pixels":5000000}' \
--max-model-len 25480 \
--chat-template-content-format string \
--limit-mm-per-prompt '{"image":10}' \
--model zai-org/AutoGLM-Phone-9B \
--port 8000
# Install SGLang or use: docker pull lmsysorg/sglang:v0.5.6.post1
# Inside container: pip install nvidia-cudnn-cu12==9.16.0.29
python3 -m sglang.launch_server \
--model-path zai-org/AutoGLM-Phone-9B \
--served-model-name autoglm-phone-9b \
--context-length 25480 \
--mm-enable-dp-encoder \
--mm-process-config '{"image":{"max_pixels":5000000}}' \
--port 8000
python scripts/check_deployment_cn.py \
--base-url http://localhost:8000/v1 \
--model autoglm-phone-9b
<think>...</think><answer>do(action="Launch", app="...")
# Android device (default)
python main.py \
--base-url http://localhost:8000/v1 \
--model autoglm-phone-9b \
"打开小红书搜索美食"
# HarmonyOS device
python main.py \
--base-url http://localhost:8000/v1 \
--model autoglm-phone-9b \
--device-type hdc \
"打开设置查看WiFi"
# Multilingual model for English apps
python main.py \
--base-url http://localhost:8000/v1 \
--model autoglm-phone-9b-multilingual \
"Open Instagram and search for travel photos"
| Parameter | Description | Default |
|---|---|---|
| `--base-url` | Model service endpoint | Required |
| `--model` | Model name on server | Required |
| `--apikey` | API key for third-party services | None |
| `--device-type` | Device backend: `adb` (Android), `hdc` (HarmonyOS), or `ios` | `adb` |
| `--device-id` | Specific device serial number | Auto-detect |
from phone_agent import PhoneAgent
from phone_agent.config import AgentConfig
config = AgentConfig(
base_url="http://localhost:8000/v1",
model="autoglm-phone-9b",
device_type="adb", # or "hdc" for HarmonyOS
)
agent = PhoneAgent(config)
# Run a task
result = agent.run("打开淘宝搜索蓝牙耳机")
print(result)
from phone_agent import PhoneAgent
from phone_agent.config import AgentConfig
import os
config = AgentConfig(
base_url=os.environ["MODEL_BASE_URL"],
model=os.environ["MODEL_NAME"],
apikey=os.environ.get("MODEL_API_KEY"),
device_type="adb",
device_id="emulator-5554", # specific device
)
agent = PhoneAgent(config)
# Task with sensitive operation confirmation
result = agent.run(
"在京东购买最便宜的蓝牙耳机",
confirm_sensitive=True # prompt user before purchase actions
)
import openai
import base64
import os
from pathlib import Path
# Client for the OpenAI-compatible model endpoint; a placeholder key is sent
# when MODEL_API_KEY is not set.
client = openai.OpenAI(
    base_url=os.environ["MODEL_BASE_URL"],
    api_key=os.environ.get("MODEL_API_KEY", "dummy"),
)

# Load screenshot and base64-encode it so it can travel as an inline data URL
screenshot_path = "screenshot.png"
image_b64 = base64.b64encode(Path(screenshot_path).read_bytes()).decode()

# A single user turn: the screenshot first, then the task description
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
}
text_part = {
    "type": "text",
    "text": "Task: 搜索附近的咖啡店\nCurrent step: Navigate to search",
}
response = client.chat.completions.create(
    model="autoglm-phone-9b",
    messages=[{"role": "user", "content": [image_part, text_part]}],
)
print(response.choices[0].message.content)
# Output format: <think>...</think>\n<answer>do(action="...", ...)
import re
def parse_action(model_output: str) -> dict:
    """Parse AutoGLM model output into a structured action.

    The expected reply looks like
    ``<think>...</think><answer>do(action="Name", key="value", ...)``;
    the closing ``</answer>`` tag is optional.

    Args:
        model_output: Raw text returned by the model.

    Returns:
        ``{"action": "unknown"}`` when there is no ``<answer>`` block;
        ``{"action": "unknown", "raw": <answer text>}`` when the answer
        contains no ``do(action="...")`` call; otherwise
        ``{"action": <name>}`` merged with every ``key="value"`` argument
        of the call. Only double-quoted string arguments are captured;
        other argument forms are ignored.
    """
    import re  # function-scope import keeps this helper self-contained

    # Extract answer block; the model may omit the closing </answer> tag.
    answer_match = re.search(r'<answer>(.*?)(?:</answer>|$)', model_output, re.DOTALL)
    if not answer_match:
        return {"action": "unknown"}
    answer = answer_match.group(1).strip()

    # Parse do() call.
    # Format: do(action="ActionName", param1="value1", param2="value2")
    # The argument list is matched quote-aware, so a ")" inside a quoted
    # value (e.g. text="hello (world)") does not end the call early.
    action_match = re.search(r'do\(action="([^"]+)"((?:[^")]|"[^"]*")*)\)', answer)
    if not action_match:
        # Malformed calls (e.g. an unterminated quote) still yield their
        # action name through the lenient "up to the first ')'" pattern.
        action_match = re.search(r'do\(action="([^"]+)"(.*?)\)', answer, re.DOTALL)
    if not action_match:
        return {"action": "unknown", "raw": answer}

    action_name, params_str = action_match.groups()

    # Parse parameters
    params = {
        m.group(1): m.group(2)
        for m in re.finditer(r'(\w+)="([^"]*)"', params_str)
    }
    return {"action": action_name, **params}
# Example usage
output = '<think>需要启动京东</think>\n<answer>do(action="Launch", app="京东")'
action = parse_action(output)
# {"action": "Launch", "app": "京东"}
import subprocess
def take_screenshot(device_id: str = None) -> bytes:
    """Capture the current device screen.

    Runs ``adb exec-out screencap -p`` (adding ``-s <device_id>`` when a
    device id is given) and returns whatever the command wrote to stdout.
    The command's exit status is not checked.
    """
    selector = ["-s", device_id] if device_id else []
    argv = ["adb", *selector, "exec-out", "screencap", "-p"]
    return subprocess.run(argv, capture_output=True).stdout
def send_tap(x: int, y: int, device_id: str = None):
    """Tap the screen at coordinates (x, y) via ``adb shell input tap``.

    ``-s <device_id>`` is added when a device id is given.
    """
    selector = ["-s", device_id] if device_id else []
    subprocess.run(["adb", *selector, "shell", "input", "tap", str(x), str(y)])
def send_text_adb_keyboard(text: str, device_id: str = None):
    """Send text via ADB Keyboard (must be installed and enabled).

    First switches the active input method to
    ``com.android.adbkeyboard/.AdbIME``, then delivers ``text`` with an
    ``ADB_INPUT_TEXT`` broadcast.

    Args:
        text: Text to type on the device.
        device_id: Optional device serial passed to ``adb -s``.
    """
    import shlex  # function-scope import: only this helper needs it

    adb = ["adb"]
    if device_id:
        adb.extend(["-s", device_id])

    # Enable ADB keyboard first
    subprocess.run(adb + ["shell", "ime", "set", "com.android.adbkeyboard/.AdbIME"])

    # `adb shell` joins its arguments with spaces and the device shell
    # re-parses the result, so unquoted text containing spaces or shell
    # metacharacters would be split or interpreted. Quoting keeps it as a
    # single literal argument.
    subprocess.run(
        adb
        + ["shell", "am", "broadcast", "-a", "ADB_INPUT_TEXT",
           "--es", "msg", shlex.quote(text)]
    )
def swipe(x1: int, y1: int, x2: int, y2: int, duration_ms: int = 300, device_id: str = None):
    """Swipe from (x1, y1) to (x2, y2) via ``adb shell input swipe``.

    ``duration_ms`` is passed as the final argument of the command;
    ``-s <device_id>`` is added when a device id is given.
    """
    selector = ["-s", device_id] if device_id else []
    gesture = [str(value) for value in (x1, y1, x2, y2, duration_ms)]
    subprocess.run(["adb", *selector, "shell", "input", "swipe", *gesture])
def press_back(device_id: str = None):
    """Press the Android back button by sending ``KEYCODE_BACK``.

    ``-s <device_id>`` is added when a device id is given.
    """
    selector = ["-s", device_id] if device_id else []
    subprocess.run(["adb", *selector, "shell", "input", "keyevent", "KEYCODE_BACK"])
def launch_app(package_name: str, device_id: str = None):
"""Launch app by package name."""
cmd = ["adb"]
if device_id:
cmd.extend(["-s", device_id])
cmd.extend(["shell", "monkey", "-p", package_name, "-c",
"android.intent.category.LAUNCHER", "1"])
subprocess.run(cmd)// .env configuration
// MIDSCENE_MODEL_NAME=autoglm-phone
// MIDSCENE_OPENAI_BASE_URL=https://open.bigmodel.cn/api/paas/v4
// MIDSCENE_OPENAI_API_KEY=your-api-key
import { AndroidAgent } from "@midscene/android";
const agent = new AndroidAgent();
await agent.aiAction("打开微信发送消息给张三");
await agent.aiQuery("当前页面显示的消息内容是什么?");
# Connect device via USB first, then enable TCP/IP mode
adb tcpip 5555
# Get device IP address
adb shell ip addr show wlan0
# Connect wirelessly (disconnect USB after this)
adb connect 192.168.1.100:5555
# Verify connection
adb devices
# 192.168.1.100:5555 device
# Use with agent
python main.py \
--base-url http://model-server:8000/v1 \
--model autoglm-phone-9b \
--device-id "192.168.1.100:5555" \
"打开支付宝查看余额"
| Action | Description | Example |
|---|---|---|
| Open an app | |
| Tap screen element | |
| Input text | |
| Scroll/swipe | |
| Press back button | |
| Go to home screen | |
| Task complete | |
| Model | Use Case | Languages |
|---|---|---|
| AutoGLM-Phone-9B | Chinese apps (WeChat, Taobao, Meituan) | Chinese-optimized |
| AutoGLM-Phone-9B-Multilingual | International apps, mixed content | Chinese + English + others |
zai-org/AutoGLM-Phone-9B
zai-org/AutoGLM-Phone-9B-Multilingual
ZhipuAI/AutoGLM-Phone-9B
ZhipuAI/AutoGLM-Phone-9B-Multilingual
# Model service
export MODEL_BASE_URL="http://localhost:8000/v1"
export MODEL_NAME="autoglm-phone-9b"
export MODEL_API_KEY="" # Required for BigModel/ModelScope APIs
# BigModel API
export BIGMODEL_API_KEY=""
export BIGMODEL_BASE_URL="https://open.bigmodel.cn/api/paas/v4"
# ModelScope API
export MODELSCOPE_API_KEY=""
export MODELSCOPE_BASE_URL="https://api-inference.modelscope.cn/v1"
# Device configuration
export ADB_DEVICE_ID="" # Leave empty for auto-detect
export HDC_DEVICE_ID="" # HarmonyOS device ID
--chat-template-content-format string
--mm-process-config
max_pixels:5000000
adb devices
adb kill-server && adb start-server
adb shell ime enable com.android.adbkeyboard/.AdbIME
adb shell ime set com.android.adbkeyboard/.AdbIME
confirm_sensitive=True
--tensor-parallel-size 2
# Test connectivity
curl http://YOUR_SERVER_IP:8000/v1/models
# Should return model list JSON
# After configuring WebDriverAgent per docs/ios_setup/ios_setup.md
python main.py \
--base-url http://localhost:8000/v1 \
--model autoglm-phone-9b-multilingual \
--device-type ios \
"Open Maps and navigate to Central Park"