Loading...
Loading...
Generate synthetic training data when you don't have enough real examples. Use when you're starting from scratch with no data, need a proof of concept fast, have too few examples for optimization, can't use real customer data for privacy or compliance, need to fill gaps in edge cases, have unbalanced categories, added new categories, or changed your schema. Covers DSPy synthetic data generation, quality filtering, and bootstrapping from zero.
npx skill4agent add lebsral/dspy-programming-not-prompting-lms-skills ai-generating-data
import dspy
# Your task — what the AI will do in production
class ClassifyTicket(dspy.Signature):
    """Classify a support ticket into a category."""

    # Input: the text of the support ticket.
    ticket_text: str = dspy.InputField()
    # Output: the category assigned to the ticket.
    category: str = dspy.OutputField()
# Generator — produces examples for your task
class GenerateTicketExample(dspy.Signature):
    """Generate a realistic support ticket with its correct category."""

    # The label is the input and the ticket text is the output, so every
    # generated ticket comes with a known category.
    category: str = dspy.InputField(desc="the target category to generate an example for")
    ticket_text: str = dspy.OutputField(desc="a realistic support ticket for this category")


# Task: extract structured data from text
class ExtractContact(dspy.Signature):
    """Extract contact info from a message."""

    # Input: the free-text message.
    message: str = dspy.InputField()
    # Outputs: the three contact fields pulled out of the message.
    name: str = dspy.OutputField()
    email: str = dspy.OutputField()
    phone: str = dspy.OutputField()
# Generator: produce realistic messages with known contact info
class GenerateContactExample(dspy.Signature):
    """Generate a realistic message that contains contact information."""

    # The known contact fields are inputs; the generated message is the output.
    name: str = dspy.InputField(desc="the person's name to embed in the message")
    email: str = dspy.InputField(desc="the email address to embed in the message")
    phone: str = dspy.InputField(desc="the phone number to embed in the message")
    message: str = dspy.OutputField(desc="a realistic message containing this contact info")


# Five hand-written labeled tickets, each with a different category.
# Only ticket_text is marked as an input; category is the label.
seeds = [
    dspy.Example(
        ticket_text="I was charged twice for my subscription this month. Order #4521.",
        category="billing",
    ).with_inputs("ticket_text"),
    dspy.Example(
        ticket_text="The app crashes when I try to upload a profile photo on Android.",
        category="bug",
    ).with_inputs("ticket_text"),
    dspy.Example(
        ticket_text="How do I export my data to CSV? I can't find the option anywhere.",
        category="how-to",
    ).with_inputs("ticket_text"),
    dspy.Example(
        ticket_text="I'd love to see dark mode added. The white background hurts my eyes.",
        category="feature-request",
    ).with_inputs("ticket_text"),
    dspy.Example(
        ticket_text="My account got locked after too many login attempts. Please help.",
        category="account",
    ).with_inputs("ticket_text"),
]

# NOTE(review): the stray tokens "n=N" and "n" were fused onto this line in the
# source; they look like inline-code fragments from dropped prose - confirm.
# Ask for 20 completions from a single call.
generator = dspy.Predict(GenerateTicketExample, n=20)
response = generator(category="billing")
# Wrap each returned completion as a labeled example whose only input is ticket_text.
examples = [
    dspy.Example(ticket_text=text, category="billing").with_inputs("ticket_text")
    for text in response.completions.ticket_text
]

# Start of the next snippet: an empty list that the loops below append to.
examples = []
categories = ["billing", "bug", "how-to", "feature-request", "account"]

# The category is passed per call, so one predictor serves every category;
# it is built once instead of once per category.
generator = dspy.Predict(GenerateTicketExample)
for category in categories:
    for _ in range(40):
        # NOTE(review): repeated calls with identical inputs may return identical
        # (cached) completions depending on the LM/cache configuration - confirm.
        result = generator(category=category)
        examples.append(
            dspy.Example(ticket_text=result.ticket_text, category=category)
            .with_inputs("ticket_text")
        )
print(f"Generated {len(examples)} examples")

# NOTE(review): a stray "n" was fused in front of this loop in the source;
# it looks like an inline-code fragment from dropped prose - confirm.
for category in categories:
    for _ in range(50):
        result = generator(category=category)
        examples.append(
            dspy.Example(ticket_text=result.ticket_text, category=category)
            .with_inputs("ticket_text")
        )


class GenerateVariation(dspy.Signature):
    """Generate a variation of this support ticket with a different tone and phrasing."""

    original_ticket: str = dspy.InputField(desc="the original ticket to vary")
    variation_type: str = dspy.InputField(desc="how to vary it: tone, length, complexity, or language")
    ticket_text: str = dspy.OutputField(desc="a new ticket with the same meaning but different style")


vary = dspy.Predict(GenerateVariation)
# Each seed is rewritten in four styles; the variation keeps the seed's category.
for seed in seeds:
    for variation in ["angry tone", "very brief", "verbose and detailed", "non-native English"]:
        result = vary(original_ticket=seed.ticket_text, variation_type=variation)
        examples.append(
            dspy.Example(ticket_text=result.ticket_text, category=seed.category)
            .with_inputs("ticket_text")
        )


class GenerateScenarioTicket(dspy.Signature):
    """Generate a support ticket matching a specific scenario."""

    category: str = dspy.InputField()
    scenario: str = dspy.InputField(desc="the specific scenario to generate")
    ticket_text: str = dspy.OutputField()


gen = dspy.Predict(GenerateScenarioTicket)
# (category, scenario) pairs, one generated ticket per pair.
scenarios = [
    ("billing", "customer charged in wrong currency"),
    ("billing", "refund for a cancelled subscription"),
    ("bug", "issue only happens on slow network connections"),
    ("bug", "multi-step reproduction involving two features"),
    ("how-to", "customer is non-technical and confused by jargon"),
]
for category, scenario in scenarios:
    result = gen(category=category, scenario=scenario)
    examples.append(
        dspy.Example(ticket_text=result.ticket_text, category=category)
        .with_inputs("ticket_text")
    )


class GenerateByDifficulty(dspy.Signature):
    """Generate a support ticket at a specific difficulty level for classification."""

    category: str = dspy.InputField()
    difficulty: str = dspy.InputField(desc="easy (clear-cut), medium (some ambiguity), or hard (could be multiple categories)")
    ticket_text: str = dspy.OutputField()


gen = dspy.Predict(GenerateByDifficulty)
# 15 tickets for every (category, difficulty) combination.
for category in categories:
    for difficulty in ["easy", "medium", "hard"]:
        for _ in range(15):
            result = gen(category=category, difficulty=difficulty)
            examples.append(
                dspy.Example(ticket_text=result.ticket_text, category=category)
                .with_inputs("ticket_text")
            )

# NOTE(review): the stray token "sindex" was fused onto this line in the source;
# it looks like an inline-code fragment from dropped prose - confirm.
import random
class GenerateDiverse(dspy.Signature):
    """Generate a unique and realistic support ticket."""

    category: str = dspy.InputField()
    sindex: str = dspy.InputField(desc="a unique seed index for diversity")
    ticket_text: str = dspy.OutputField()


gen = dspy.Predict(GenerateDiverse)
for category in categories:
    for _ in range(50):
        # A random sindex makes the inputs differ from call to call. The values
        # are random rather than guaranteed unique, so a repeat is possible.
        result = gen(category=category, sindex=str(random.randint(0, 1_000_000)))
        examples.append(
            dspy.Example(ticket_text=result.ticket_text, category=category)
            .with_inputs("ticket_text")
        )

# NOTE(review): the stray token "sindex" was fused onto this line in the source;
# it looks like an inline-code fragment from dropped prose - confirm.
# Task program used to filter the generated examples.
program = dspy.ChainOfThought(ClassifyTicket)
# Keep only the examples for which the task program's prediction passes the metric.
# NOTE(review): `metric` is not defined in the visible source; presumably it is
# defined elsewhere - confirm.
filtered = []
for ex in examples:
    pred = program(**ex.inputs())
    if metric(ex, pred):
        filtered.append(ex)
# Guard the percentage so an empty `examples` list does not raise ZeroDivisionError.
kept_pct = 100 * len(filtered) // len(examples) if examples else 0
print(f"Kept {len(filtered)}/{len(examples)} ({kept_pct}%)")


class AssessExample(dspy.Signature):
    """Is this a realistic and correctly labeled example?"""

    ticket_text: str = dspy.InputField()
    category: str = dspy.InputField()
    is_realistic: bool = dspy.OutputField(desc="true if this looks like a real support ticket")
    is_correctly_labeled: bool = dspy.OutputField(desc="true if the category matches the ticket")


assessor = dspy.Predict(AssessExample)
# Keep only the examples the assessor judges both realistic and correctly labeled.
filtered = []
for ex in examples:
    result = assessor(ticket_text=ex.ticket_text, category=ex.category)
    if result.is_realistic and result.is_correctly_labeled:
        filtered.append(ex)
kept_pct = 100 * len(filtered) // len(examples) if examples else 0
print(f"Kept {len(filtered)}/{len(examples)} ({kept_pct}%)")


# NOTE(review): the stray tokens "dspy.Suggest" and "Suggest" were fused in front
# of this class in the source; they look like inline-code fragments from dropped
# prose - confirm.
class QualityGenerator(dspy.Module):
    """Generate a ticket for a category and assess it in the same forward pass."""

    def __init__(self):
        super().__init__()
        self.generate = dspy.ChainOfThought(GenerateTicketExample)
        self.assess = dspy.Predict(AssessExample)

    def forward(self, category):
        """Return the generated ticket prediction for `category`."""
        result = self.generate(category=category)
        assessment = self.assess(ticket_text=result.ticket_text, category=category)
        # NOTE(review): dspy.Suggest may not exist in every DSPy release -
        # confirm against the installed version.
        dspy.Suggest(assessment.is_realistic, "Generated ticket should be realistic")
        dspy.Suggest(assessment.is_correctly_labeled, "Category label should be correct")
        return result


generator = QualityGenerator()
# DSPy retries generation when Suggest constraints fail

# Start of the next snippet: normalized ticket texts already seen.
seen = set()
unique = []
for ex in filtered:
    # Normalize and check. Only texts that are identical after trimming and
    # lowercasing are treated as duplicates.
    key = ex.ticket_text.strip().lower()
    if key not in seen:
        seen.add(key)
        unique.append(ex)
print(f"Removed {len(filtered) - len(unique)} near-duplicates")
filtered = unique


class DataGenerator(dspy.Module):
    """Wrap the ticket generator in a module so an optimizer can compile it."""

    def __init__(self):
        super().__init__()
        self.generate = dspy.ChainOfThought(GenerateTicketExample)

    def forward(self, category):
        """Return the generated ticket prediction for `category`."""
        return self.generate(category=category)


# Define a metric that measures generated data quality
def generator_metric(example, prediction, trace=None):
    """Return True when a classifier recovers the intended category.

    Args:
        example: carries the target `category` the ticket was generated for.
        prediction: generator output carrying the generated `ticket_text`.
        trace: accepted for the metric signature; not used here.
    """
    # Check if a downstream classifier gets the right answer on this generated example
    classifier = dspy.Predict(ClassifyTicket)
    task_pred = classifier(ticket_text=prediction.ticket_text)
    # Ignore case and surrounding whitespace when comparing labels.
    return task_pred.category.strip().lower() == example.category.strip().lower()


# Optimize the generator's prompts
optimizer = dspy.BootstrapFewShot(metric=generator_metric)
# The seeds mark ticket_text as their input, but DataGenerator.forward takes
# `category`. Re-mark the inputs so the optimizer calls forward(category=...).
generator_trainset = [seed.with_inputs("category") for seed in seeds]
optimized_generator = optimizer.compile(DataGenerator(), trainset=generator_trainset)

# Now generate with the optimized generator
better_examples = []
for category in categories:
    for _ in range(50):
        result = optimized_generator(category=category)
        better_examples.append(
            dspy.Example(ticket_text=result.ticket_text, category=category).with_inputs("ticket_text")
        )

import random
from dspy.evaluate import Evaluate

# Shuffle and split (80% train / 20% dev); the shuffle reorders `filtered` in place.
random.shuffle(filtered)
split = int(len(filtered) * 0.8)
trainset = filtered[:split]
devset = filtered[split:]
print(f"Train: {len(trainset)}, Dev: {len(devset)}")

# Configure your task LM (can be cheaper than the generator LM)
lm = dspy.LM("openai/gpt-4o-mini")
dspy.configure(lm=lm)

# Build and optimize your task program
# NOTE(review): `metric` is not defined in the visible source; presumably it is
# defined elsewhere - confirm.
program = dspy.ChainOfThought(ClassifyTicket)
optimizer = dspy.MIPROv2(metric=metric, auto="medium")
optimized = optimizer.compile(program, trainset=trainset)

# Evaluate
evaluator = Evaluate(devset=devset, metric=metric, num_threads=4, display_progress=True)
score = evaluator(optimized)
# float() leaves a plain float unchanged. NOTE(review): newer DSPy releases
# appear to return a result object from Evaluate rather than a bare float;
# float() is intended to cover both - confirm against the installed version.
print(f"Score on synthetic dev set: {float(score):.1f}%")

# Save
optimized.save("optimized_program.json")

# NOTE(review): the source had the residue "dspy.Suggest", "n" and the paths below
# fused onto the line above; they look like inline-code fragments and
# related-skill links from the page - confirm.
# /ai-improving-accuracy /ai-fine-tuning /ai-kickoff