Loading...
Loading...
Compare original and translation side by side
Skill by ara.so — Daily 2026 Skills collection.
由ara.so开发的技能 — 属于Daily 2026技能合集。
User Natural Language → AutoGLM VLM → Screen Perception → ADB/HDC/WebDriverAgent → Device Actions
用户自然语言 → AutoGLM多模态大模型 → 屏幕感知 → ADB/HDC/WebDriverAgent → 设备操作
git clone https://github.com/zai-org/Open-AutoGLM.git
cd Open-AutoGLM
pip install -r requirements.txt
pip install -e .
git clone https://github.com/zai-org/Open-AutoGLM.git
cd Open-AutoGLM
pip install -r requirements.txt
pip install -e .
export BIGMODEL_API_KEY="your-bigmodel-api-key"
python main.py \
--base-url https://open.bigmodel.cn/api/paas/v4 \
--model "autoglm-phone" \
--apikey $BIGMODEL_API_KEY \
"打开美团搜索附近的火锅店"
export MODELSCOPE_API_KEY="your-modelscope-api-key"
python main.py \
--base-url https://api-inference.modelscope.cn/v1 \
--model "ZhipuAI/AutoGLM-Phone-9B" \
--apikey $MODELSCOPE_API_KEY \
"open Meituan and find nearby hotpot"
export BIGMODEL_API_KEY="你的BigModel API密钥"
python main.py \
--base-url https://open.bigmodel.cn/api/paas/v4 \
--model "autoglm-phone" \
--apikey $BIGMODEL_API_KEY \
"打开美团搜索附近的火锅店"
export MODELSCOPE_API_KEY="你的ModelScope API密钥"
python main.py \
--base-url https://api-inference.modelscope.cn/v1 \
--model "ZhipuAI/AutoGLM-Phone-9B" \
--apikey $MODELSCOPE_API_KEY \
"open Meituan and find nearby hotpot"
python scripts/check_deployment_cn.py \
--base-url http://localhost:8000/v1 \
--model autoglm-phone-9b
<think>...</think><answer>do(action="Launch", app="...")
python scripts/check_deployment_cn.py \
--base-url http://localhost:8000/v1 \
--model autoglm-phone-9b
<think>...</think><answer>do(action="Launch", app="...")
| Parameter | Description | Default |
|---|---|---|
| Model service endpoint | Required |
| Model name on server | Required |
| API key for third-party services | None |
| | |
| Specific device serial number | Auto-detect |
| 参数 | 描述 | 默认值 |
|---|---|---|
| 模型服务端点 | 必填 |
| 服务端的模型名称 | 必填 |
| 第三方服务的API密钥 | 无 |
| 设备类型: | |
| 特定设备序列号 | 自动检测 |
from phone_agent import PhoneAgent
from phone_agent.config import AgentConfig
# Build the agent configuration for a model served at localhost:8000.
config = AgentConfig(
    base_url="http://localhost:8000/v1",
    model="autoglm-phone-9b",
    device_type="adb",  # or "hdc" for HarmonyOS
)
# Hand the configuration to the agent.
agent = PhoneAgent(config)
from phone_agent import PhoneAgent
from phone_agent.config import AgentConfig
# Build the agent configuration for a model served at localhost:8000.
config = AgentConfig(
    base_url="http://localhost:8000/v1",
    model="autoglm-phone-9b",
    device_type="adb",  # switch to "hdc" for HarmonyOS devices
)
# Hand the configuration to the agent.
agent = PhoneAgent(config)
from phone_agent import PhoneAgent
from phone_agent.config import AgentConfig
import os
# MODEL_BASE_URL and MODEL_NAME must be set (a missing one raises KeyError);
# MODEL_API_KEY is optional and becomes None when unset.
config = AgentConfig(
    base_url=os.environ["MODEL_BASE_URL"],
    model=os.environ["MODEL_NAME"],
    apikey=os.environ.get("MODEL_API_KEY"),
    device_type="adb",
    device_id="emulator-5554",  # specific device
)
agent = PhoneAgent(config)
from phone_agent import PhoneAgent
from phone_agent.config import AgentConfig
import os
# MODEL_BASE_URL and MODEL_NAME must be set (a missing one raises KeyError);
# MODEL_API_KEY is optional and becomes None when unset.
config = AgentConfig(
    base_url=os.environ["MODEL_BASE_URL"],
    model=os.environ["MODEL_NAME"],
    apikey=os.environ.get("MODEL_API_KEY"),
    device_type="adb",
    device_id="emulator-5554",  # pin a specific device
)
agent = PhoneAgent(config)
import openai
import base64
import os
from pathlib import Path
# The endpoint is mandatory (KeyError when MODEL_BASE_URL is unset); the API
# key falls back to the placeholder "dummy" when MODEL_API_KEY is unset.
client = openai.OpenAI(
    base_url=os.environ["MODEL_BASE_URL"],
    api_key=os.environ.get("MODEL_API_KEY", "dummy"),
)
import openai
import base64
import os
from pathlib import Path
# The endpoint is mandatory (KeyError when MODEL_BASE_URL is unset); the API
# key falls back to the placeholder "dummy" when MODEL_API_KEY is unset.
client = openai.OpenAI(
    base_url=os.environ["MODEL_BASE_URL"],
    api_key=os.environ.get("MODEL_API_KEY", "dummy"),
)
import re
def parse_action(model_output: str) -> dict:
    """Parse AutoGLM model output into a structured action.

    The output is expected to contain an ``<answer>`` block holding a call of
    the form ``do(action="Name", key="value", ...)``. The closing
    ``</answer>`` tag is optional; without it the answer runs to the end of
    the text.

    Args:
        model_output: Raw text produced by the model.

    Returns:
        ``{"action": "unknown"}`` when no ``<answer>`` block is present;
        ``{"action": "unknown", "raw": <answer text>}`` when the block holds
        no ``do(action="...")`` call; otherwise ``{"action": <name>}`` merged
        with every ``key="value"`` pair of the call. Only double-quoted
        parameters are extracted, and their values stay strings.
    """
    answer_match = re.search(r'<answer>(.*?)(?:</answer>|$)', model_output, re.DOTALL)
    if answer_match is None:
        return {"action": "unknown"}
    answer = answer_match.group(1).strip()

    # Quoted values are consumed as whole units so that a ")" inside a value
    # (e.g. text="see you :)") no longer ends the call early and drops the
    # parameter.
    call_match = re.search(
        r'do\(action="([^"]+)"((?:"[^"]*"|[^")])*)\)', answer, re.DOTALL
    )
    if call_match is None:
        # Unbalanced quotes defeat the strict pattern; fall back to cutting
        # the call at the first ")" so such input is still recognised.
        call_match = re.search(r'do\(action="([^"]+)"(.*?)\)', answer, re.DOTALL)
    if call_match is None:
        return {"action": "unknown", "raw": answer}

    action_name, params_str = call_match.groups()
    # For a repeated key the last occurrence wins.
    params = dict(re.findall(r'(\w+)="([^"]*)"', params_str))
    return {"action": action_name, **params}

import re
def parse_action(model_output: str) -> dict:
    """Parse AutoGLM model output into a structured action.

    The output is expected to contain an ``<answer>`` block holding a call of
    the form ``do(action="Name", key="value", ...)``. The closing
    ``</answer>`` tag is optional; without it the answer runs to the end of
    the text.

    Args:
        model_output: Raw text produced by the model.

    Returns:
        ``{"action": "unknown"}`` when no ``<answer>`` block is present;
        ``{"action": "unknown", "raw": <answer text>}`` when the block holds
        no ``do(action="...")`` call; otherwise ``{"action": <name>}`` merged
        with every ``key="value"`` pair of the call. Only double-quoted
        parameters are extracted, and their values stay strings.
    """
    # Extract the answer block.
    answer_match = re.search(r'<answer>(.*?)(?:</answer>|$)', model_output, re.DOTALL)
    if answer_match is None:
        return {"action": "unknown"}
    answer = answer_match.group(1).strip()

    # Locate the do() call. Quoted values are consumed as whole units so that
    # a ")" inside a value (e.g. text="see you :)") no longer ends the call
    # early and drops the parameter.
    call_match = re.search(
        r'do\(action="([^"]+)"((?:"[^"]*"|[^")])*)\)', answer, re.DOTALL
    )
    if call_match is None:
        # Unbalanced quotes defeat the strict pattern; fall back to cutting
        # the call at the first ")" so such input is still recognised.
        call_match = re.search(r'do\(action="([^"]+)"(.*?)\)', answer, re.DOTALL)
    if call_match is None:
        return {"action": "unknown", "raw": answer}

    # Collect the parameters; for a repeated key the last occurrence wins.
    action_name, params_str = call_match.groups()
    params = dict(re.findall(r'(\w+)="([^"]*)"', params_str))
    return {"action": action_name, **params}

import subprocess
def take_screenshot(device_id: str = None) -> bytes:
    """Return the bytes that ``adb exec-out screencap -p`` writes to stdout.

    Args:
        device_id: Serial passed to ``adb -s``. When falsy, no ``-s`` option
            is added and adb selects the device itself.

    Returns:
        The captured stdout of the adb process. The exit status is not
        inspected, so a failed capture may come back as empty bytes.
    """
    selector = ["-s", device_id] if device_id else []
    completed = subprocess.run(
        ["adb", *selector, "exec-out", "screencap", "-p"],
        capture_output=True,
    )
    return completed.stdout
def send_tap(x: int, y: int, device_id: str = None):
    """Issue ``adb shell input tap`` at the given screen coordinates.

    Args:
        x: Horizontal coordinate, forwarded to adb as a string.
        y: Vertical coordinate, forwarded to adb as a string.
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    selector = ["-s", device_id] if device_id else []
    subprocess.run(["adb", *selector, "shell", "input", "tap", str(x), str(y)])
def send_text_adb_keyboard(text: str, device_id: str = None):
    """Send text via ADB Keyboard (must be installed and enabled).

    The input method is first switched to ``com.android.adbkeyboard/.AdbIME``
    and the text is then delivered with an ``ADB_INPUT_TEXT`` broadcast.

    Args:
        text: Text to type. It is shell-quoted before being handed to
            ``adb shell`` so spaces and shell metacharacters arrive intact.
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    import shlex  # local import: only this helper needs it

    adb = ["adb", "-s", device_id] if device_id else ["adb"]

    # Switch the active input method to ADB Keyboard first.
    subprocess.run(adb + ["shell", "ime", "set", "com.android.adbkeyboard/.AdbIME"])

    # `adb shell` joins its arguments with spaces and lets the device's shell
    # re-parse the result. Unquoted, "hello world" would reach `am` as two
    # arguments and characters such as ; & $ would be interpreted.
    subprocess.run(
        adb + ["shell", "am", "broadcast", "-a", "ADB_INPUT_TEXT",
               "--es", "msg", shlex.quote(text)]
    )
def swipe(x1: int, y1: int, x2: int, y2: int, duration_ms: int = 300, device_id: str = None):
    """Issue ``adb shell input swipe`` from (x1, y1) to (x2, y2).

    Args:
        x1: Start x coordinate.
        y1: Start y coordinate.
        x2: End x coordinate.
        y2: End y coordinate.
        duration_ms: Value passed as the final ``input swipe`` argument.
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    selector = ["-s", device_id] if device_id else []
    gesture = [str(value) for value in (x1, y1, x2, y2, duration_ms)]
    subprocess.run(["adb", *selector, "shell", "input", "swipe", *gesture])
def press_back(device_id: str = None):
    """Send the ``KEYCODE_BACK`` key event through ``adb shell input``.

    Args:
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    selector = ["-s", device_id] if device_id else []
    subprocess.run(["adb", *selector, "shell", "input", "keyevent", "KEYCODE_BACK"])
def launch_app(package_name: str, device_id: str = None):
    """Start an app by package name using ``adb shell monkey``.

    ``monkey`` is restricted to ``package_name`` and the LAUNCHER category
    and asked for a single event.

    Args:
        package_name: Package passed to ``monkey -p``.
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    selector = ["-s", device_id] if device_id else []
    monkey = ["monkey", "-p", package_name, "-c", "android.intent.category.LAUNCHER", "1"]
    subprocess.run(["adb", *selector, "shell", *monkey])

import subprocess
def take_screenshot(device_id: str = None) -> bytes:
    """Capture the device's current screen.

    Args:
        device_id: Serial passed to ``adb -s``. When falsy, no ``-s`` option
            is added and adb selects the device itself.

    Returns:
        Whatever ``adb exec-out screencap -p`` wrote to stdout. The exit
        status is not inspected.
    """
    adb_prefix = ["adb"] + (["-s", device_id] if device_id else [])
    capture = subprocess.run(adb_prefix + ["exec-out", "screencap", "-p"], capture_output=True)
    return capture.stdout
def send_tap(x: int, y: int, device_id: str = None):
    """Tap the screen at the given coordinates.

    Args:
        x: Horizontal coordinate, forwarded to adb as a string.
        y: Vertical coordinate, forwarded to adb as a string.
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    adb_prefix = ["adb"] + (["-s", device_id] if device_id else [])
    subprocess.run(adb_prefix + ["shell", "input", "tap", str(x), str(y)])
def send_text_adb_keyboard(text: str, device_id: str = None):
    """Send text via ADB Keyboard (must be installed and enabled).

    The input method is first switched to ``com.android.adbkeyboard/.AdbIME``
    and the text is then delivered with an ``ADB_INPUT_TEXT`` broadcast.

    Args:
        text: Text to type. It is shell-quoted before being handed to
            ``adb shell`` so spaces and shell metacharacters arrive intact.
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    import shlex  # local import: only this helper needs it

    adb_prefix = ["adb"] + (["-s", device_id] if device_id else [])

    # Enable ADB Keyboard as the active input method first.
    enable_ime = adb_prefix + ["shell", "ime", "set", "com.android.adbkeyboard/.AdbIME"]
    subprocess.run(enable_ime)

    # Send the text. `adb shell` joins its arguments with spaces and lets the
    # device's shell re-parse them, so the message must be quoted or it is
    # split on whitespace and its metacharacters are interpreted.
    broadcast = adb_prefix + ["shell", "am", "broadcast", "-a", "ADB_INPUT_TEXT",
                              "--es", "msg", shlex.quote(text)]
    subprocess.run(broadcast)
def swipe(x1: int, y1: int, x2: int, y2: int, duration_ms: int = 300, device_id: str = None):
    """Perform a swipe gesture on the screen.

    Args:
        x1: Start x coordinate.
        y1: Start y coordinate.
        x2: End x coordinate.
        y2: End y coordinate.
        duration_ms: Value passed as the final ``input swipe`` argument.
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    adb_prefix = ["adb"] + (["-s", device_id] if device_id else [])
    numbers = list(map(str, (x1, y1, x2, y2, duration_ms)))
    subprocess.run(adb_prefix + ["shell", "input", "swipe"] + numbers)
def press_back(device_id: str = None):
    """Press the Android back button.

    Args:
        device_id: Serial passed to ``adb -s``; omitted when falsy.
    """
    adb_prefix = ["adb"] + (["-s", device_id] if device_id else [])
    subprocess.run(adb_prefix + ["shell", "input", "keyevent", "KEYCODE_BACK"])
def launch_app(package_name: str, device_id: str = None):
"""通过包名启动应用。"""
cmd = ["adb"]
if device_id:
cmd.extend(["-s", device_id])
cmd.extend(["shell", "monkey", "-p", package_name, "-c",
"android.intent.category.LAUNCHER", "1"])
subprocess.run(cmd)
// .env configuration
// Settings for the .env file (presumably read by Midscene at startup — confirm);
// the API key below is a placeholder.
// MIDSCENE_MODEL_NAME=autoglm-phone
// MIDSCENE_OPENAI_BASE_URL=https://open.bigmodel.cn/api/paas/v4
// MIDSCENE_OPENAI_API_KEY=your-api-key
import { AndroidAgent } from "@midscene/android";
// Create the agent with its default constructor arguments.
const agent = new AndroidAgent();
// Natural-language instruction: "Open WeChat and send a message to Zhang San".
await agent.aiAction("打开微信发送消息给张三");
await agent.aiQuery("当前页面显示的消息内容是什么?");
// .env配置
// Settings for the .env file (presumably read by Midscene at startup — confirm);
// the API key below is a placeholder.
// MIDSCENE_MODEL_NAME=autoglm-phone
// MIDSCENE_OPENAI_BASE_URL=https://open.bigmodel.cn/api/paas/v4
// MIDSCENE_OPENAI_API_KEY=your-api-key
import { AndroidAgent } from "@midscene/android";
// Create the agent with its default constructor arguments.
const agent = new AndroidAgent();
// Natural-language instruction: "Open WeChat and send a message to Zhang San".
await agent.aiAction("打开微信发送消息给张三");
await agent.aiQuery("当前页面显示的消息内容是什么?");
| Action | Description | Example |
|---|---|---|
| Open an app | |
| Tap screen element | |
| Input text | |
| Scroll/swipe | |
| Press back button | |
| Go to home screen | |
| Task complete | |
| 操作 | 描述 | 示例 |
|---|---|---|
| 打开应用 | |
| 点击屏幕元素 | |
| 输入文本 | |
| 滑动屏幕 | |
| 按下返回键 | |
| 返回主屏幕 | |
| 任务完成 | |
| Model | Use Case | Languages |
|---|---|---|
| Chinese apps (WeChat, Taobao, Meituan) | Chinese-optimized |
| International apps, mixed content | Chinese + English + others |
zai-org/AutoGLM-Phone-9B
zai-org/AutoGLM-Phone-9B-Multilingual
ZhipuAI/AutoGLM-Phone-9B
ZhipuAI/AutoGLM-Phone-9B-Multilingual
| 模型 | 使用场景 | 语言支持 |
|---|---|---|
| 中文应用(微信、淘宝、美团等) | 中文优化 |
| 国际应用、多语言内容 | 中文+英文+其他语言 |
zai-org/AutoGLM-Phone-9B
zai-org/AutoGLM-Phone-9B-Multilingual
ZhipuAI/AutoGLM-Phone-9B
ZhipuAI/AutoGLM-Phone-9B-Multilingual
--chat-template-content-format string
--mm-process-config
max_pixels:5000000
--chat-template-content-format string
max_pixels:5000000
--mm-process-config
adb devices
adb devices
adb kill-server && adb start-server
adb kill-server && adb start-server
adb shell ime enable com.android.adbkeyboard/.AdbIME
adb shell ime set com.android.adbkeyboard/.AdbIME
adb shell ime enable com.android.adbkeyboard/.AdbIME
adb shell ime set com.android.adbkeyboard/.AdbIME
confirm_sensitive=True
confirm_sensitive=True
--tensor-parallel-size 2
--tensor-parallel-size 2