Loading...
Loading...
Create a new Harbor task for evaluating agents. Use when the user wants to scaffold, build, or design a new task, benchmark problem, or eval. Guides through instruction writing, environment setup, verifier design (pytest vs Reward Kit vs custom), and solution scripting.
npx skill4agent add harbor-framework/harbor create-task
harbor task init "<org>/<task-name>"
--description "..."
--author "Jane Doe <jane@example.com>"
--no-pytest
--no-solution
--metadata-template path.toml
<task-name>/
├── instruction.md # Task prompt for the agent
├── task.toml # Config and metadata
├── environment/Dockerfile # Container definition
├── solution/solve.sh # Reference solution (optional)
└── tests/test.sh # Verifier script
steps/
# SSH Key Pair Generation
Generate an SSH key pair in the files `~/.ssh/id_rsa` and `~/.ssh/id_rsa.pub`.
Don't make them password protected.
environment/Dockerfile
FROM ubuntu:24.04
WORKDIR /app
# Install what the task requires — NOT the solution
RUN apt-get update && apt-get install -y openssh-client && rm -rf /var/lib/apt/lists/*environment/docker-compose.yamlharbor task start-env -p "<task-path>" -e docker -a -irewardkitfile_containscommand_succeedsjson_key_equalstests/test.sh#!/bin/bash
uvx --from 'harbor-rewardkit==0.1.*' rewardkit /testsharbor-rewardkitrewardkit--from 'harbor-rewardkit==0.1.*' rewardkituvx harbor-rewardkittests/checks.pytests/judge.tomlrewardkit--no-pytesttests/test.sh#!/bin/bash
apt-get update && apt-get install -y curl
curl -LsSf https://astral.sh/uv/0.9.7/install.sh | sh
source $HOME/.local/bin/env
uvx --with pytest==8.4.1 pytest /tests/test_outputs.py
if [ $? -eq 0 ]; then
echo 1 > /logs/verifier/reward.txt
else
echo 0 > /logs/verifier/reward.txt
fi
tests/test_outputs.py
from pathlib import Path
def test_file_exists():
assert (Path.home() / ".ssh" / "id_rsa").exists()
#!/bin/bash
if diff -q /app/output.txt /tests/expected.txt; then
echo 1 > /logs/verifier/reward.txt
else
echo 0 > /logs/verifier/reward.txt
fi
/logs/verifier/reward.txt
0
1
/logs/verifier/reward.json
{"accuracy": 0.95, "runtime_sec": 1.2}
test.sh
solution/solve.sh
#!/bin/bash
ssh-keygen -t rsa -f ~/.ssh/id_rsa -N ""
chmod +x solution/solve.sh
[task]
name = "<org>/<task-name>"
description = "One-line description"
keywords = ["jax", "mnist", "rewardkit"] # always populate — used for search/filtering
[metadata]
difficulty = "easy" # one of: "easy", "medium", "hard"
category = "programming" # e.g. "programming", "machine-learning", "gpu", ...
tags = ["..."]
[environment]
cpus = 1 # CPU cores
memory_mb = 2048 # RAM in MB
storage_mb = 10240 # Disk in MB
allow_internet = true # Network access
[agent]
timeout_sec = 120.0 # How long the agent has
[verifier]
timeout_sec = 600.0 # How long tests have
keywords
rewardkit
judge-grading
pytest
gpu
harbor datasets list
[verifier.env]
ANTHROPIC_API_KEY = "${ANTHROPIC_API_KEY}"
harbor run -p "<task-path>" -a oracle
solution/solve.sh
1.0
solve.sh
start-env -a -i
/logs/verifier/
harbor run -p "<task-path>" -a terminus-2 -m anthropic/claude-sonnet-4-6
harbor task init
README.md
instruction.md
harbor run
instruction.md
tests/
solution/
steps/
<task-name>/
├── task.toml
├── environment/Dockerfile # Built once, shared across all steps
├── steps/
│ ├── scaffold/
│ │ ├── instruction.md # Prompt for this step
│ │ ├── workdir/ # Uploaded to WORKDIR before the agent runs
│ │ │ └── setup.sh # Optional pre-agent hook (reserved filename)
│ │ ├── tests/test.sh # Per-step verifier
│ │ └── solution/solve.sh # Per-step Oracle solution (optional)
│ ├── implement/
│ │ └── ...
│ └── document/
│ └── ...
└── tests/ # Optional shared helpers + fallback test.sh
tests/
/tests
tests/
steps/{name}/workdir/setup.sh
workdir/
rm -- "$0"
schema_version = "1.1"
[task]
name = "<org>/<task-name>"
# How per-step rewards roll up into the trial-level verifier_result.
# "mean" (default): per-key mean across steps that produced a result.
# "final": the last step's verifier_result verbatim.
multi_step_reward_strategy = "mean"
[[steps]]
name = "scaffold" # Must match the directory under steps/
min_reward = 1.0 # Abort trial if this step's reward < 1.0
[steps.agent]
timeout_sec = 60.0 # Overrides task-level [agent].timeout_sec
[steps.verifier]
timeout_sec = 30.0
[[steps]]
name = "implement"
# Dict form gates on specific keys from a multi-dim reward:
min_reward = { correctness = 0.8, style = 0.5 }
[steps.agent]
timeout_sec = 120.0
[steps.verifier]
timeout_sec = 30.0
[[steps]]
name = "document"
[steps.agent]
timeout_sec = 60.0
[steps.verifier]
timeout_sec = 30.0agent.timeout_secagent.userverifier.timeout_secverifier.envverifier.userhealthcheck.*artifacts"mean""final"min_reward"final"artifactssteps/{name}/artifacts/harbor run -p "<task-path>" -a oraclesolution/solve.sh1.0docs/content/docs/tasks/multi-step.mdxexamples/tasks/hello-multi-step-advanced/[[environment.mcp_servers]][environment.healthcheck]environment.gpusenvironment.gpu_typesenvironment.docker_imageagent.userverifier.usertest.shinstruction.mdchmod +x solution/solve.shkeywords = []README.md