Loading...
Loading...
Measure and improve how well your AI works. Use when AI gives wrong answers, accuracy is bad, responses are unreliable, you need to test AI quality, evaluate your AI, write metrics, benchmark performance, optimize prompts, improve results, or systematically make your AI better. Covers DSPy evaluation, metrics, and optimization.
npx skill4agent add lebsral/dspy-programming-not-prompting-lms-skills ai-improving-accuracydef metric(example, prediction, trace=None):
return prediction.answer == example.answerdef metric(example, prediction, trace=None):
return prediction.answer.strip().lower() == example.answer.strip().lower()def metric(example, prediction, trace=None):
fields = ["name", "email", "phone"]
correct = sum(
1 for f in fields
if getattr(prediction, f, "").lower() == getattr(example, f, "").lower()
)
return correct / len(fields)def metric(example, prediction, trace=None):
gold_tokens = set(example.answer.lower().split())
pred_tokens = set(prediction.answer.lower().split())
if not gold_tokens or not pred_tokens:
return float(gold_tokens == pred_tokens)
precision = len(gold_tokens & pred_tokens) / len(pred_tokens)
recall = len(gold_tokens & pred_tokens) / len(gold_tokens)
if precision + recall == 0:
return 0.0
return 2 * (precision * recall) / (precision + recall)class AssessQuality(dspy.Signature):
"""Assess if the predicted answer is correct and complete."""
question: str = dspy.InputField()
gold_answer: str = dspy.InputField()
predicted_answer: str = dspy.InputField()
is_correct: bool = dspy.OutputField()
def metric(example, prediction, trace=None):
judge = dspy.Predict(AssessQuality)
result = judge(
question=example.question,
gold_answer=example.answer,
predicted_answer=prediction.answer,
)
return result.is_correctdef metric(example, prediction, trace=None):
correct = float(prediction.answer.lower() == example.answer.lower())
concise = float(len(prediction.answer.split()) < 50)
has_reasoning = float(len(getattr(prediction, 'reasoning', '')) > 20)
return 0.7 * correct + 0.2 * concise + 0.1 * has_reasoningtraceNonedef metric(example, prediction, trace=None):
correct = prediction.answer == example.answer
if trace is not None:
# During optimization, also require good reasoning
has_reasoning = len(prediction.reasoning) > 50
return correct and has_reasoning
return correct/ai-generating-dataimport dspy
# Manual creation
devset = [
    dspy.Example(question="What is DSPy?", answer="A framework for LM programs").with_inputs("question"),
    # 20-100+ examples for reliable evaluation
]

# From CSV/JSON
import json
with open("test_data.json") as f:
    data = json.load(f)
devset = [dspy.Example(**x).with_inputs("question") for x in data]

# From HuggingFace
from datasets import load_dataset
dataset = load_dataset("squad", split="validation[:100]")
devset = [
    dspy.Example(question=x["question"], answer=x["answers"]["text"][0]).with_inputs("question")
    for x in dataset
]

from dspy.evaluate import Evaluate
# Build the evaluator once, then score any program against the same devset/metric.
evaluator = Evaluate(
    devset=devset,
    metric=metric,
    num_threads=4,
    display_progress=True,
    display_table=5,  # show 5 example results
)
baseline_score = evaluator(my_program)
print(f"Baseline: {baseline_score}")

| Training examples | Recommended optimizer | Expected improvement | Typical cost |
|---|---|---|---|
| <20 | GEPA (instruction tuning) | 5-15% | ~$0.50 |
| 20-50 | BootstrapFewShot | 5-20% | ~$0.50-2 |
| 50-200 | BootstrapFewShot, then MIPROv2 | 15-35% | ~$2-10 |
| 200-500 | MIPROv2 (auto="medium") | 20-40% | ~$5-15 |
| 500+ | MIPROv2 (auto="heavy") or BootstrapFinetune | 25-50% | ~$15-50+ |
Start here
|
+- Just getting started (<50 examples)? -> BootstrapFewShot
| Quick, cheap, usually gives a solid boost.
|
+- Want better prompts (50+ examples)? -> MIPROv2
| Optimizes both instructions and examples.
| Best general-purpose prompt optimizer.
|
+- Want to tune instructions only (<50 examples)? -> GEPA
| Good when you have few examples.
|
+- Need maximum quality (500+ examples)? -> BootstrapFinetune
| Fine-tunes the model weights.
| Best for production with smaller/cheaper models.
|
+- Want to combine approaches? -> BetterTogether
Jointly optimizes prompts and weights./ai-switching-modelsoptimizer = dspy.BootstrapFewShot(
metric=metric,
max_bootstrapped_demos=4,
max_labeled_demos=4,
)
optimized = optimizer.compile(my_program, trainset=trainset)optimizer = dspy.MIPROv2(
metric=metric,
auto="medium", # "light", "medium", "heavy"
)
optimized = optimizer.compile(my_program, trainset=trainset)"light""medium""heavy"optimizer = dspy.GEPA()
optimized = optimizer.compile(my_program, trainset=trainset, metric=metric)optimizer = dspy.BootstrapFinetune(metric=metric, num_threads=24)
optimized = optimizer.compile(my_program, trainset=trainset)/ai-fine-tuning| Symptom | Likely cause | Fix |
|---|---|---|
| Score stuck at 60-70% despite optimization | Task too complex for single step | Decompose into multiple modules (see /ai-decomposing-tasks) |
| Optimizer overfits (train score high, dev score flat) | Too little training data | Add more training examples (see /ai-generating-data) |
| Score varies wildly between runs | Non-deterministic metric or small devset | Increase devset to 100+, set temperature=0 |
| Diminishing returns from more demos | Prompt is maxed out; model is the limit | Fine-tune the model weights (see /ai-fine-tuning) |
| Score high but real users complain | Metric doesn't match real quality | Rewrite metric based on actual failure patterns |
optimized_score = evaluator(optimized)
print(f"Baseline: {baseline_score:.1f}%")
print(f"Optimized: {optimized_score:.1f}%")
print(f"Improvement: {optimized_score - baseline_score:.1f}%")optimized.save("optimized_program.json")
# Load later
my_program = MyProgram()
my_program.load("optimized_program.json")display_table/ai-cutting-costs/ai-monitoring/ai-tracking-experiments/ai-decomposing-tasks/ai-fixing-errors