# Terminal-Bench

Terminal-Bench integration for Mux agent benchmarking and failure analysis.
npx skill4agent add coder/mux tbench

# Run full benchmark suite
make benchmark-terminal
# Run specific tasks
make benchmark-terminal TB_TASK_NAMES="hello-world chess-best-move"
# Run with specific model
make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic/claude-opus-4-5"
# Run on Daytona cloud (high parallelism)
TB_ENV=daytona TB_CONCURRENCY=48 make benchmark-terminal

# Set API key (get from https://app.daytona.io)
export DAYTONA_API_KEY="your-api-key"
# Run with 48 concurrent cloud sandboxes (~6x faster than local)
make benchmark-terminal TB_ENV=daytona TB_CONCURRENCY=48
# Run specific tasks on Daytona
make benchmark-terminal TB_ENV=daytona TB_CONCURRENCY=48 TB_TASK_NAMES="chess-best-move stockfish-elo"

| Environment | Concurrency | Full suite time |
|---|---|---|
| Local Docker | 4 | ~90 min |
| Daytona Cloud | 48 | ~10-15 min |
Configuration variables (defaults/examples in parentheses):

- `TB_DATASET` (`terminal-bench@2.0`)
- `TB_CONCURRENCY`
- `TB_TIMEOUT`
- `TB_ENV` (`local` or `daytona`)
- `TB_TASK_NAMES`
- `TB_ARGS`
- `MUX_RUN_ARGS` — flags forwarded to `mux run` (e.g. `--thinking high --use-1m --budget 5.00`)

Some tasks (e.g. `blind-maze-explorer-algorithm.hard`) need a longer `mux run` timeout.

# Run with 60 minute timeout for very complex tasks
TB_TIMEOUT=3600 make benchmark-terminal
# Run with shorter 10 minute timeout for quick iteration
TB_TIMEOUT=600 make benchmark-terminal TB_SAMPLE_SIZE=5

Besides `TB_TIMEOUT`, the agent accepts `--agent-kwarg` settings: `model_name`
(e.g. `anthropic/claude-sonnet-4-5`, `openai/gpt-5-codex`) and `experiments`
(e.g. `programmatic-tool-calling`). Extra `mux run` flags go through `MUX_RUN_ARGS`.

# Run with model, thinking, and 1M context
gh workflow run terminal-bench.yml \
-f model_name=anthropic/claude-opus-4-6 \
-f mux_run_args="--thinking xhigh --use-1m"
# Run with budget cap
gh workflow run terminal-bench.yml \
-f model_name=anthropic/claude-opus-4-6 \
  -f mux_run_args="--thinking high --budget 5.00"

# Pass flags via MUX_RUN_ARGS env var
MUX_RUN_ARGS="--thinking high --use-1m" make benchmark-terminal
# Model and experiments via TB_ARGS
make benchmark-terminal TB_ARGS="--agent-kwarg model_name=openai/gpt-5-codex --agent-kwarg experiments=programmatic-tool-calling"

Results are written under `runs/YYYY-MM-DD__HH-MM-SS/`: `results.json`,
`run_metadata.json`, and per-task `<task-id>/sessions/agent.log`,
`sessions/agent.cast`, and `sessions/tests.log`.

`results.json` data is queryable with `bq` (authenticate via `gcloud auth login`,
project `mux-benchmarks`) in the `mux-benchmarks.benchmarks.tbench_results` table,
with columns: `run_id`, `task_id`, `model_name`, `thinking_level`, `mode`,
`dataset`, `experiments`, `passed`, `score`, `n_input_tokens`, `n_output_tokens`,
`github_run_id`, `github_sha`, `ingested_at`.

CI workflows: `.github/workflows/terminal-bench.yml` and
`.github/workflows/nightly-terminal-bench.yml`.

# Download latest 5 successful nightly runs (recommended for submission)
python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --n-runs 5
# Use specific run IDs (each becomes a separate job folder)
python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --run-id 111 222 333 444 555
# Use multiple existing artifact directories
python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --artifacts-dir ./run1 ./run2
# Download latest single run (quick iteration)
python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py
# Only prepare specific models
python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --n-runs 5 --models anthropic/claude-opus-4-5

Output layout under `leaderboard_submission/submissions/terminal-bench/2.0/Mux__<model>/`:
metadata.yaml # Agent and model info
<job-folder-1>/ # Results from run 1
config.json
result.json
<trial-1>/
config.json
result.json
agent/
verifier/
...
<job-folder-2>/ # Results from run 2
    ...

Upload with `hf upload`:

# Install huggingface_hub (via uv or pip)
pip install huggingface_hub
# Authenticate (one-time setup)
hf auth login

import httpx
from huggingface_hub import HfApi
from huggingface_hub.utils import configure_http_backend
configure_http_backend(
backend_factory=lambda: httpx.Client(timeout=httpx.Timeout(300.0, connect=60.0))
)
api = HfApi()
api.upload_folder(
repo_id="alexgshaw/terminal-bench-2-leaderboard",
folder_path="./leaderboard_submission/submissions",
path_in_repo="submissions",
repo_type="dataset",
create_pr=True,
commit_message="Add Mux + <Model> submission",
commit_description="- Agent: Mux (Coder)\n- Model: <model>\n- <N> tasks × <K> attempts",
)

Notes: exclude `*.log` files from `--artifacts-dir` contents before upload. With
`create_pr=True` the submission lands on a PR branch, `revision="refs/pr/<N>"`;
use `api.delete_folder(..., revision="refs/pr/<N>")` to remove files from an
open PR.

Key files: `mux_agent.py` (a `BaseInstalledAgent`), `mux-run.sh`,
`mux_payload.py`, `mux_setup.sh.j2`, `prepare_leaderboard_submission.py`,
`analyze_failure_rates.py`, `download_run_logs.py`.

# Find tasks where Mux underperforms (high M/O ratio = Mux fails more than others)
python benchmarks/terminal_bench/analyze_failure_rates.py --top 20

# Authenticate and set project
gcloud auth login && gcloud config set project mux-benchmarks
# Query pass/fail by model for specific task (strip __hash suffix mentally)
bq query --use_legacy_sql=false '
SELECT model_name, passed, COUNT(*) as runs
FROM `mux-benchmarks.benchmarks.tbench_results`
WHERE REGEXP_REPLACE(task_id, r"__[a-zA-Z0-9]+$", "") = "TASK_NAME_HERE"
AND github_workflow = "Nightly Terminal-Bench"
AND passed IS NOT NULL
GROUP BY model_name, passed
ORDER BY model_name, passed
'

# List recent nightly runs
python benchmarks/terminal_bench/download_run_logs.py --list-runs
# Download latest run and filter to failing task
python benchmarks/terminal_bench/download_run_logs.py --task TASK_NAME --failures-only
# Download specific run, filter to specific model
python benchmarks/terminal_bench/download_run_logs.py --run-id 21230456195 --model opus --task TASK_NAME
# Verbose mode shows stderr from agent execution
python benchmarks/terminal_bench/download_run_logs.py --task TASK_NAME -v

Downloads land in `.run_logs/<run-id>/`: `agent/command-0/stdout.txt`,
`agent/command-0/stderr.txt`, and `result.json` (inspect its `verifier_result`
and `exception_info` fields).

# Clone leaderboard repo from HuggingFace (cached in .leaderboard_cache/)
cd benchmarks/terminal_bench
git clone https://huggingface.co/datasets/alexgshaw/terminal-bench-2-leaderboard .leaderboard_cache/terminal-bench-2-leaderboard 2>/dev/null
# Find passing submissions for the task
find .leaderboard_cache -path "*TASK_NAME*" -name "result.json" -exec sh -c '
agent=$(echo "$1" | cut -d/ -f5)
reward=$(cat "$1" | python3 -c "import json,sys; print(json.load(sys.stdin).get(\"verifier_result\",{}).get(\"rewards\",{}).get(\"reward\",0))")
echo "$agent: reward=$reward"
' _ {} \;

# Run analysis (requires bq CLI for Mux results, git for leaderboard data)
python benchmarks/terminal_bench/analyze_failure_rates.py
# Show more results
python benchmarks/terminal_bench/analyze_failure_rates.py --top 50
# Filter to specific Mux model
python benchmarks/terminal_bench/analyze_failure_rates.py --mux-model sonnet
# Force refresh of cached data
python benchmarks/terminal_bench/analyze_failure_rates.py --refresh
# Output as JSON for further processing
python benchmarks/terminal_bench/analyze_failure_rates.py --json > opportunities.json

M/O ratio = Mux failure rate / Average failure rate of top 10 agents

================================================================================
OPTIMIZATION OPPORTUNITIES (sorted by M/O ratio)
================================================================================
Task ID Mux Fail% Avg Other% M/O Ratio Agent
--------------------------------------------------------------------------------
some-difficult-task 100.0% 10.0% 9.09 Mux__Claude-Sonnet-4.5
another-task 80.0% 20.0% 3.64 Mux__Claude-Sonnet-4.5
...
================================================================================
SUMMARY
================================================================================
Total tasks with Mux failures: 42
High priority (M/O > 2.0): 12
Medium priority (1.0 < M/O ≤ 2.0): 8