Track which optimization experiment was best. Use when you've run multiple optimization passes, need to compare experiments, want to reproduce past results, need to pick the best prompt configuration, track experiment costs, manage optimization artifacts, decide which optimized program to deploy, or justify your choice to stakeholders. Covers experiment logging, comparison, and promotion to production.
npx skill4agent add lebsral/dspy-programming-not-prompting-lms-skills ai-tracking-experiments

| | Improving accuracy | Tracking experiments (this skill) |
|---|---|---|
| Focus | Running a single optimization pass | Managing the full experimental lifecycle |
| Output | An optimized program | A comparison of all runs with the winner promoted |
| Question | "How do I make this better?" | "Which of our 8 optimization runs was best?" |
import json
from datetime import datetime

EXPERIMENT_LOG = "experiments.jsonl"

def log_experiment(run):
    """Log a single experiment run."""
    run["timestamp"] = datetime.now().isoformat()
    with open(EXPERIMENT_LOG, "a") as f:
        f.write(json.dumps(run) + "\n")

def load_experiments(path=EXPERIMENT_LOG):
    """Load all experiment runs."""
    with open(path) as f:
        return [json.loads(line) for line in f]
"name": "mipro-medium-gpt4o-mini", # Human-readable name
"optimizer": "MIPROv2", # Which optimizer
"optimizer_config": {"auto": "medium"}, # Optimizer settings
"model": "openai/gpt-4o-mini", # Which LM
"trainset_size": 200, # Training examples used
"devset_size": 50, # Evaluation examples
"metric": "answer_quality", # Which metric
"score": 0.84, # Score on devset
"baseline_score": 0.65, # Score before optimization
"improvement": 0.19, # Delta
"cost_usd": 4.50, # API cost for this run
"duration_minutes": 12, # Wall clock time
"artifact_path": "artifacts/mipro_medium_gpt4o_mini.json", # Saved program
"notes": "Best so far. Instruction quality seems high.",
}
log_experiment(run)
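Once runs accumulate in the log, picking the current best is a one-line query. A minimal sketch using the load_experiments helper above:

# Highest-scoring run logged so far (assumes the log is non-empty)
best = max(load_experiments(), key=lambda r: r.get("score", 0))
print(best["name"], best["score"], best["artifact_path"])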
import dspy
import time
from dspy.evaluate import Evaluate

def run_experiment(
    name,
    program_class,
    optimizer_class,
    optimizer_kwargs,
    trainset,
    devset,
    metric,
    model="openai/gpt-4o-mini",
    artifact_dir="artifacts",
):
    """Run one optimization experiment and log results."""
    import os
    os.makedirs(artifact_dir, exist_ok=True)

    # Configure
    lm = dspy.LM(model)
    dspy.configure(lm=lm)
    program = program_class()

    # Baseline
    evaluator = Evaluate(devset=devset, metric=metric, num_threads=4)
    baseline_score = evaluator(program)

    # Optimize
    start = time.time()
    optimizer = optimizer_class(**optimizer_kwargs)
    if optimizer_class == dspy.GEPA:
        optimized = optimizer.compile(program, trainset=trainset, metric=metric)
    else:
        optimized = optimizer.compile(program, trainset=trainset)
    duration = (time.time() - start) / 60

    # Evaluate optimized
    score = evaluator(optimized)

    # Save artifact
    artifact_path = f"{artifact_dir}/{name}.json"
    optimized.save(artifact_path)

    # Log (store callables by name so the record stays JSON-serializable)
    config = {k: (v.__name__ if callable(v) else v) for k, v in optimizer_kwargs.items()}
    run = {
        "name": name,
        "optimizer": optimizer_class.__name__,
        "optimizer_config": config,
        "model": model,
        "trainset_size": len(trainset),
        "devset_size": len(devset),
        "metric": metric.__name__,
        "baseline_score": baseline_score,
        "score": score,
        "improvement": score - baseline_score,
        "duration_minutes": round(duration, 1),
        "artifact_path": artifact_path,
    }
    log_experiment(run)
    print(f"[{name}] {baseline_score:.1f}% -> {score:.1f}% (+{score - baseline_score:.1f}%)")
    return optimized, run
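run_experiment does not fill in the cost_usd field from the schema above. If your DSPy version records per-call cost on lm.history (via LiteLLM), a rough total can be summed as below; treat the "cost" key as an assumption about your installed version and verify it before trusting the numbers.

def estimate_cost_usd(lm):
    """Sum per-call costs recorded on this LM's history; entries without a cost count as zero."""
    return round(sum(entry.get("cost") or 0.0 for entry in lm.history), 2)

# Inside run_experiment, before log_experiment(run):
#     run["cost_usd"] = estimate_cost_usd(lm)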
experiments = [
    {
        "name": "bootstrap-4demos",
        "optimizer_class": dspy.BootstrapFewShot,
        "optimizer_kwargs": {"metric": metric, "max_bootstrapped_demos": 4},
    },
    {
        "name": "bootstrap-8demos",
        "optimizer_class": dspy.BootstrapFewShot,
        "optimizer_kwargs": {"metric": metric, "max_bootstrapped_demos": 8},
    },
    {
        "name": "mipro-light",
        "optimizer_class": dspy.MIPROv2,
        "optimizer_kwargs": {"metric": metric, "auto": "light"},
    },
    {
        "name": "mipro-medium",
        "optimizer_class": dspy.MIPROv2,
        "optimizer_kwargs": {"metric": metric, "auto": "medium"},
    },
]
results = []
for exp in experiments:
    optimized, run = run_experiment(
        name=exp["name"],
        program_class=MyProgram,
        optimizer_class=exp["optimizer_class"],
        optimizer_kwargs=exp["optimizer_kwargs"],
        trainset=trainset,
        devset=devset,
        metric=metric,
    )
    results.append(run)
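Sweeps get interrupted. A small guard, sketched below on top of the helpers above (it is not part of run_experiment itself), lets you re-run the loop without repeating experiments that are already logged:

import os

# Names of runs already in the log (empty set on a fresh start)
done = {r["name"] for r in load_experiments()} if os.path.exists(EXPERIMENT_LOG) else set()

for exp in experiments:
    if exp["name"] in done:
        continue  # finished in an earlier session; don't re-run or re-log it
    optimized, run = run_experiment(name=exp["name"], program_class=MyProgram,
                                    optimizer_class=exp["optimizer_class"],
                                    optimizer_kwargs=exp["optimizer_kwargs"],
                                    trainset=trainset, devset=devset, metric=metric)
    results.append(run)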
"""Load experiments and display a comparison table."""
runs = load_experiments(path)
runs.sort(key=lambda r: r.get(sort_by, 0), reverse=True)
# Header
print(f"{'Name':<30} {'Optimizer':<20} {'Model':<22} {'Score':>7} {'Improve':>8} {'Cost':>7}")
print("-" * 120)
for r in runs:
name = r.get("name", "?")[:29]
opt = r.get("optimizer", "?")[:19]
model = r.get("model", "?")[:21]
score = r.get("score", 0)
improvement = r.get("improvement", 0)
cost = r.get("cost_usd", 0)
print(f"{name:<30} {opt:<20} {model:<22} {score:>6.1f}% {improvement:>+7.1f}% ${cost:>5.2f}")
compare_experiments()
# Name                           Optimizer            Model                    Score  Improve    Cost
# ------------------------------------------------------------------------------------------------------------------------
# mipro-medium                   MIPROv2              openai/gpt-4o-mini       84.0%   +19.0% $ 4.50
# mipro-light                    MIPROv2              openai/gpt-4o-mini       78.0%   +13.0% $ 1.20
# bootstrap-8demos               BootstrapFewShot     openai/gpt-4o-mini       74.0%    +9.0% $ 0.30
# bootstrap-4demos               BootstrapFewShot     openai/gpt-4o-mini       71.0%    +6.0% $ 0.15
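When you need to justify the pick to stakeholders, the same log can be exported to CSV. A minimal sketch using only the standard library and the fields from the schema above (export_comparison is not part of the skill's helpers):

import csv

def export_comparison(path=EXPERIMENT_LOG, out_path="experiments.csv"):
    """Write the run log to CSV so the comparison can travel beyond the terminal."""
    runs = load_experiments(path)
    fields = ["name", "optimizer", "model", "score", "baseline_score",
              "improvement", "cost_usd", "duration_minutes", "artifact_path"]
    with open(out_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(runs)

export_comparison()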
"""Filter experiments by any field."""
runs = load_experiments(path)
for key, value in filters.items():
if key == "min_score":
runs = [r for r in runs if r.get("score", 0) >= value]
elif key == "optimizer":
runs = [r for r in runs if r.get("optimizer") == value]
elif key == "model":
runs = [r for r in runs if r.get("model") == value]
return runs
# Only MIPROv2 runs
mipro_runs = filter_experiments(optimizer="MIPROv2")

# Runs scoring above 80%
good_runs = filter_experiments(min_score=80.0)
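The log also makes past results reproducible: every run records the artifact it saved, so reloading any experiment is a lookup plus program.load. A small sketch (load_experiment is an illustration, not one of the skill's helpers):

def load_experiment(name, program_class):
    """Reload the program saved by a named run so its score can be reproduced."""
    run = next(r for r in load_experiments() if r["name"] == name)
    program = program_class()
    program.load(run["artifact_path"])
    return program

# e.g. re-score the mipro-light run on today's devset
reloaded = load_experiment("mipro-light", MyProgram)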
import shutil

def promote_experiment(name, production_path="production/optimized.json"):
    """Copy the winning experiment's artifact to the production path."""
    import os
    runs = load_experiments()
    run = next((r for r in runs if r["name"] == name), None)
    if not run:
        print(f"Experiment '{name}' not found")
        return

    os.makedirs(os.path.dirname(production_path), exist_ok=True)
    shutil.copy2(run["artifact_path"], production_path)

    # Log the promotion
    promotion = {
        "event": "promotion",
        "experiment_name": name,
        "score": run["score"],
        "source_artifact": run["artifact_path"],
        "production_path": production_path,
        "timestamp": datetime.now().isoformat(),
    }
    with open("promotions.jsonl", "a") as f:
        f.write(json.dumps(promotion) + "\n")

    print(f"Promoted '{name}' (score: {run['score']:.1f}%) to {production_path}")
# Promote the best experiment
promote_experiment("mipro-medium")
# Promoted 'mipro-medium' (score: 84.0%) to production/optimized.json
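Because every promotion is appended to promotions.jsonl, rolling back means re-copying whatever was live before. A sketch (rollback is an illustration built on the files above, not part of the helpers):

def rollback(production_path="production/optimized.json"):
    """Restore the artifact from the second-most-recent promotion, if there is one."""
    with open("promotions.jsonl") as f:
        events = [json.loads(line) for line in f]
    if len(events) < 2:
        print("Nothing to roll back to")
        return
    previous = events[-2]
    shutil.copy2(previous["source_artifact"], production_path)
    print(f"Rolled back to '{previous['experiment_name']}' (score: {previous['score']:.1f}%)")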
# In your production code
program = MyProgram()
program.load("production/optimized.json")
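If deployments can happen before anything has been promoted, a guarded loader avoids crashing on a missing artifact. A minimal sketch; load_production_program is a hypothetical helper, not DSPy API:

import os

def load_production_program(path="production/optimized.json"):
    """Use the promoted artifact when it exists, otherwise fall back to the unoptimized program."""
    program = MyProgram()
    if os.path.exists(path):
        program.load(path)
    return program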
pip install weave

import weave
weave.init("my-project")

@weave.op()
def run_optimization(optimizer_name, model, trainset, devset, metric):
    """Tracked optimization run; Weave logs inputs, outputs, and cost."""
    lm = dspy.LM(model)
    dspy.configure(lm=lm)
    program = MyProgram()
    optimizer = dspy.MIPROv2(metric=metric, auto="medium")
    optimized = optimizer.compile(program, trainset=trainset)
    evaluator = Evaluate(devset=devset, metric=metric, num_threads=4)
    score = evaluator(optimized)
    return {"score": score, "optimizer": optimizer_name, "model": model}

# Weave auto-tracks everything; view at wandb.ai
result = run_optimization("mipro-medium", "openai/gpt-4o-mini", trainset, devset, metric)

pip install langwatch

import langwatch
langwatch.init()
# LangWatch tracks DSPy optimizer steps in real-time
optimizer = dspy.MIPROv2(metric=metric, auto="heavy")
optimized = optimizer.compile(program, trainset=trainset)
# Watch progress at app.langwatch.ai