# eval-recipes-runner
Run Microsoft's eval-recipes benchmarks to validate amplihack improvements against baseline agents. Auto-activates when testing improvements, running evals, or benchmarking changes.
# Install the eval-recipes-runner skill for this agent
npx skill4agent add rysweet/amplihack eval-recipes-runner

# Clone eval-recipes from Microsoft
git clone https://github.com/microsoft/eval-recipes.git ~/eval-recipes
cd ~/eval-recipes || exit 1

# Copy our agent configs
# NOTE(review): $(pwd) here is ~/eval-recipes after the cd above — presumably this
# should be the amplihack repo checkout containing .claude/; verify the intended path.
cp -r "$(pwd)/.claude/agents/eval-recipes/"* data/agents/

# Install dependencies
uv sync

# Update install.dockerfile to use specific branch
# Then run benchmark
cd ~/eval-recipes || exit 1
uv run eval_recipes/main.py --agent amplihack --task linkedin_drafting --trials 3

# Test baseline (main)
uv run eval_recipes/main.py --agent amplihack --task linkedin_drafting
# Test PR branch (edit install.dockerfile to checkout PR branch)
uv run eval_recipes/main.py --agent amplihack_pr1443 --task linkedin_drafting
# Compare scores

Available tasks (found under `~/eval-recipes/data/tasks/`):
- linkedin_drafting
- email_drafting
- arxiv_paper_summarizer
- github_docs_extractor

# In .claude/agents/eval-recipes/amplihack/install.dockerfile
RUN git clone https://github.com/rysweet/...git /tmp/amplihack && \
    cd /tmp/amplihack && \
    git checkout BRANCH_NAME && \
    pip install -e .

Copy agent configs and run a benchmark task:

cp -r .claude/agents/eval-recipes/* ~/eval-recipes/data/agents/
cd ~/eval-recipes
uv run eval_recipes/main.py --agent amplihack --task TASK_NAME --trials 3

Example branch: feat/issue-1435-task-classification

cp -r .claude/agents/eval-recipes/* ~/eval-recipes/data/agents/
cd ~/eval-recipes && uv run eval_recipes/main.py --agent amplihack --task linkedin_drafting --trials 3

Prerequisites (eval-recipes cloned to `~/eval-recipes`):

export ANTHROPIC_API_KEY=sk-ant-...
curl -LsSf https://astral.sh/uv/install.sh | sh

Results are written to `.benchmark_results/`.

# Test suite for a PR
# Run each benchmark task with 3 trials; a bash array avoids relying on
# unquoted word-splitting and keeps each task name intact.
tasks=(linkedin_drafting email_drafting arxiv_paper_summarizer)
for task in "${tasks[@]}"; do
  uv run eval_recipes/main.py --agent amplihack --task "$task" --trials 3
done

# Compare results across all runs
cat .benchmark_results/*/amplihack/*/score.txt