# Content-moderation pipelines built with DSPy: single-label, multi-label,
# regex-gated, and confidence-aware moderator variants plus evaluation metrics.
import re
from typing import Literal

import dspy
# Closed set of single-label moderation outcomes; "safe" doubles as the
# no-violation label. Used as the type of ModerateContent.violation_type.
VIOLATIONS = Literal[
"safe", "spam", "hate_speech", "harassment",
"violence", "nsfw", "self_harm", "illegal",
]
class ModerateContent(dspy.Signature):
    """Assess user-generated content against platform policies."""

    # NOTE: the docstring above is sent to the LM as task instructions —
    # change it deliberately, never cosmetically.
    content: str = dspy.InputField(desc="user-generated content to moderate")
    platform_context: str = dspy.InputField(desc="where this content appears, e.g. 'product review'")
    # Single best-fit label from the closed VIOLATIONS set; "safe" = no violation.
    violation_type: VIOLATIONS = dspy.OutputField()
    severity: Literal["none", "low", "medium", "high"] = dspy.OutputField()
    explanation: str = dspy.OutputField(desc="brief reason for the decision")
class ContentModerator(dspy.Module):
    """Single-label moderation with severity-based routing.

    Returns a Prediction carrying the LM's violation_type, severity and
    explanation plus a routing ``decision``:
    "remove" / "human_review" / "warn" / "approve".
    """

    # Severity → platform action; anything unrecognized (including "none")
    # falls through to "approve", matching the original if/elif chain.
    _DECISIONS = {"high": "remove", "medium": "human_review", "low": "warn"}

    def __init__(self):
        super().__init__()  # required so dspy.Module can register sub-modules
        self.assess = dspy.ChainOfThought(ModerateContent)

    def forward(self, content, platform_context="social media post"):
        result = self.assess(content=content, platform_context=platform_context)
        # Route based on severity
        decision = self._DECISIONS.get(result.severity, "approve")
        return dspy.Prediction(
            violation_type=result.violation_type,
            severity=result.severity,
            decision=decision,
            explanation=result.explanation,
        )
# NOTE(review): everything from here down repeats the definitions above
# verbatim — this looks like a copy/paste or scrape artifact. Consider
# deleting the duplicated section; re-executing it only rebinds the names.
from typing import Literal
VIOLATIONS = Literal[
"safe", "spam", "hate_speech", "harassment",
"violence", "nsfw", "self_harm", "illegal",
]
# NOTE(review): duplicate of the ModerateContent signature defined earlier
# (scrape artifact); re-executing it merely rebinds the same name.
class ModerateContent(dspy.Signature):
    """Assess user-generated content against platform policies."""

    content: str = dspy.InputField(desc="user-generated content to moderate")
    platform_context: str = dspy.InputField(desc="where this content appears, e.g. 'product review'")
    violation_type: VIOLATIONS = dspy.OutputField()
    severity: Literal["none", "low", "medium", "high"] = dspy.OutputField()
    explanation: str = dspy.OutputField(desc="brief reason for the decision")
# NOTE(review): duplicate of ContentModerator defined earlier (scrape artifact).
class ContentModerator(dspy.Module):
    """Single-label moderation with severity-based routing."""

    def __init__(self):
        super().__init__()  # required so dspy.Module can register sub-modules
        self.assess = dspy.ChainOfThought(ModerateContent)

    def forward(self, content, platform_context="social media post"):
        result = self.assess(content=content, platform_context=platform_context)
        # Route based on severity
        if result.severity == "high":
            decision = "remove"
        elif result.severity == "medium":
            decision = "human_review"
        elif result.severity == "low":
            decision = "warn"
        else:
            decision = "approve"
        return dspy.Prediction(
            violation_type=result.violation_type,
            severity=result.severity,
            decision=decision,
            explanation=result.explanation,
        )


# Allowed labels for the multi-label signatures below. Unlike VIOLATIONS this
# is a plain list so it can be interpolated into field descriptions and used
# for post-hoc validation of the LM output.
VIOLATION_TYPES = ["safe", "spam", "hate_speech", "harassment", "violence", "nsfw", "self_harm", "illegal"]
class MultiLabelModerate(dspy.Signature):
    """Flag all policy violations in user content. Content may have multiple violations."""

    # Docstring above is the LM task instruction; field descs are part of the
    # prompt too, so keep them stable.
    content: str = dspy.InputField()
    platform_context: str = dspy.InputField()
    violations: list[str] = dspy.OutputField(desc=f"all that apply from: {VIOLATION_TYPES}")
    severity: Literal["none", "low", "medium", "high"] = dspy.OutputField(
        desc="overall severity based on the worst violation"
    )
    explanation: str = dspy.OutputField()
class MultiLabelModerator(dspy.Module):
    """Multi-label moderation that validates the returned label set."""

    def __init__(self):
        super().__init__()  # required for dspy.Module sub-module bookkeeping
        self.assess = dspy.ChainOfThought(MultiLabelModerate)

    def forward(self, content, platform_context=""):
        result = self.assess(content=content, platform_context=platform_context)
        # Validate that returned violations are from the allowed set.
        # NOTE(review): dspy.Assert was removed in DSPy 2.6+ — if upgrading,
        # replace with a plain check (raise/retry) or the refine API; verify
        # against the pinned dspy version.
        dspy.Assert(
            all(v in VIOLATION_TYPES for v in result.violations),
            f"Violations must be from: {VIOLATION_TYPES}",
        )
        return result
# NOTE(review): duplicate of MultiLabelModerate defined earlier (scrape artifact).
class MultiLabelModerate(dspy.Signature):
    """Flag all policy violations in user content. Content may have multiple violations."""

    content: str = dspy.InputField()
    platform_context: str = dspy.InputField()
    violations: list[str] = dspy.OutputField(desc=f"all that apply from: {VIOLATION_TYPES}")
    severity: Literal["none", "low", "medium", "high"] = dspy.OutputField(
        desc="overall severity based on the worst violation"
    )
    explanation: str = dspy.OutputField()
# NOTE(review): duplicate of MultiLabelModerator defined earlier (scrape artifact).
class MultiLabelModerator(dspy.Module):
    """Multi-label moderation that validates the returned label set."""

    def __init__(self):
        super().__init__()  # required for dspy.Module sub-module bookkeeping
        self.assess = dspy.ChainOfThought(MultiLabelModerate)

    def forward(self, content, platform_context=""):
        result = self.assess(content=content, platform_context=platform_context)
        # Validate that returned violations are from the allowed set
        dspy.Assert(
            all(v in VIOLATION_TYPES for v in result.violations),
            f"Violations must be from: {VIOLATION_TYPES}",
        )
        return result


# Kept in place so the regex-based moderators below still work if the
# top-of-file imports are not consolidated; ideally hoist to the file header.
import re
class StrictModerator(dspy.Module):
    """Cheap regex pre-filters for PII, then LM-based assessment.

    The dspy.Assert calls fire before any LM call when the content matches
    an SSN, email-address, or 16-digit card-number pattern.
    """

    # Patterns compiled once at class-creation time instead of per forward().
    _SSN_RE = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")
    # BUG FIX: the original TLD character class was [A-Z|a-z], which also
    # matched a literal "|"; corrected to [A-Za-z].
    _EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
    # Only unseparated 16-digit runs; dashed/spaced card numbers are not caught.
    _CARD_RE = re.compile(r"\b\d{16}\b")

    def __init__(self):
        super().__init__()  # required for dspy.Module sub-module bookkeeping
        self.assess = dspy.ChainOfThought(ModerateContent)

    def forward(self, content, platform_context=""):
        # Pattern-based hard blocks (instant, no LM needed)
        dspy.Assert(
            not self._SSN_RE.search(content),
            "Content contains SSN pattern — auto-reject",
        )
        dspy.Assert(
            not self._EMAIL_RE.search(content),
            "Content contains email addresses — redact before posting",
        )
        dspy.Assert(
            not self._CARD_RE.search(content),
            "Content contains potential credit card number — auto-reject",
        )
        # LM-based assessment for everything else
        return self.assess(content=content, platform_context=platform_context)
# NOTE(review): duplicate of StrictModerator defined earlier (scrape artifact).
class StrictModerator(dspy.Module):
    """Cheap regex pre-filters for PII, then LM-based assessment."""

    def __init__(self):
        super().__init__()  # required for dspy.Module sub-module bookkeeping
        self.assess = dspy.ChainOfThought(ModerateContent)

    def forward(self, content, platform_context=""):
        # Pattern-based hard blocks (instant, no LM needed)
        dspy.Assert(
            not re.search(r"\b\d{3}-\d{2}-\d{4}\b", content),
            "Content contains SSN pattern — auto-reject",
        )
        # BUG FIX: [A-Z|a-z] also matched a literal "|" in the TLD;
        # corrected to [A-Za-z].
        dspy.Assert(
            not re.search(
                r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
                content,
            ),
            "Content contains email addresses — redact before posting",
        )
        dspy.Assert(
            not re.search(r"\b\d{16}\b", content),
            "Content contains potential credit card number — auto-reject",
        )
        # LM-based assessment for everything else
        return self.assess(content=content, platform_context=platform_context)


class ConfidentModerate(dspy.Signature):
    """Moderate content and rate your confidence in the assessment."""

    content: str = dspy.InputField()
    platform_context: str = dspy.InputField()
    violation_type: VIOLATIONS = dspy.OutputField()
    severity: Literal["none", "low", "medium", "high"] = dspy.OutputField()
    confidence: float = dspy.OutputField(desc="0.0 to 1.0 — how sure are you about this assessment?")
    explanation: str = dspy.OutputField()
class ConfidentModerator(dspy.Module):
    """Severity routing that escalates low-confidence assessments.

    Any assessment whose self-reported confidence falls below
    ``confidence_threshold`` goes to human review regardless of severity;
    otherwise severity determines the action.
    """

    def __init__(self, confidence_threshold=0.7):
        super().__init__()  # required for dspy.Module sub-module bookkeeping
        self.assess = dspy.ChainOfThought(ConfidentModerate)
        self.confidence_threshold = confidence_threshold

    def forward(self, content, platform_context=""):
        result = self.assess(content=content, platform_context=platform_context)
        # Validate confidence range before trusting it for routing.
        # NOTE(review): dspy.Assert was removed in DSPy 2.6+ — verify against
        # the pinned dspy version.
        dspy.Assert(
            0.0 <= result.confidence <= 1.0,
            "Confidence must be between 0.0 and 1.0",
        )
        # Route based on confidence + severity
        if result.confidence < self.confidence_threshold:
            decision = "human_review"  # uncertain → always escalate
        elif result.severity == "high":
            decision = "remove"
        elif result.severity == "medium":
            decision = "human_review"
        elif result.severity == "low":
            decision = "warn"
        else:
            decision = "approve"
        return dspy.Prediction(
            violation_type=result.violation_type,
            severity=result.severity,
            confidence=result.confidence,
            decision=decision,
            explanation=result.explanation,
        )


# NOTE(review): duplicate of the ConfidentModerate signature above (scrape artifact).
class ConfidentModerate(dspy.Signature):
    """Moderate content and rate your confidence in the assessment."""

    content: str = dspy.InputField()
    platform_context: str = dspy.InputField()
    violation_type: VIOLATIONS = dspy.OutputField()
    severity: Literal["none", "low", "medium", "high"] = dspy.OutputField()
    confidence: float = dspy.OutputField(desc="0.0 to 1.0 — how sure are you about this assessment?")
    explanation: str = dspy.OutputField()
# NOTE(review): duplicate of ConfidentModerator defined earlier (scrape artifact).
class ConfidentModerator(dspy.Module):
    """Severity routing that escalates low-confidence assessments."""

    def __init__(self, confidence_threshold=0.7):
        super().__init__()  # required for dspy.Module sub-module bookkeeping
        self.assess = dspy.ChainOfThought(ConfidentModerate)
        self.confidence_threshold = confidence_threshold

    def forward(self, content, platform_context=""):
        result = self.assess(content=content, platform_context=platform_context)
        # Validate confidence range
        dspy.Assert(
            0.0 <= result.confidence <= 1.0,
            "Confidence must be between 0.0 and 1.0",
        )
        # Route based on confidence + severity
        if result.confidence < self.confidence_threshold:
            decision = "human_review"  # uncertain → always escalate
        elif result.severity == "high":
            decision = "remove"
        elif result.severity == "medium":
            decision = "human_review"
        elif result.severity == "low":
            decision = "warn"
        else:
            decision = "approve"
        return dspy.Prediction(
            violation_type=result.violation_type,
            severity=result.severity,
            confidence=result.confidence,
            decision=decision,
            explanation=result.explanation,
        )


def moderation_metric(example, prediction, trace=None):
    """Weighted score: type matters more than severity.

    Returns a float in [0, 1]: 0.7 credit for matching the gold
    violation_type plus 0.3 credit for matching the gold severity.
    """
    type_correct = float(prediction.violation_type == example.violation_type)
    severity_correct = float(prediction.severity == example.severity)
    return 0.7 * type_correct + 0.3 * severity_correct
"""Weighted score: type matters more than severity."""
type_correct = float(prediction.violation_type == example.violation_type)
severity_correct = float(prediction.severity == example.severity)
return 0.7 * type_correct + 0.3 * severity_correctdef make_category_metric(category):
"""Create a precision metric for a specific violation category."""
def metric(example, prediction, trace=None):
# Did we correctly identify this category?
if example.violation_type == category:
return float(prediction.violation_type == category) # recall
else:
return float(prediction.violation_type != category) # precision
return metricdef make_category_metric(category):
"""Create a precision metric for a specific violation category."""
def metric(example, prediction, trace=None):
# Did we correctly identify this category?
if example.violation_type == category:
return float(prediction.violation_type == category) # recall
else:
return float(prediction.violation_type != category) # precision
return metricundefinedundefinedundefinedundefinedundefinedundefinedplatform_context/ai-testing-safetyplatform_context/ai-testing-safety/ai-monitoring/ai-testing-safety/ai-monitoring/ai-testing-safety/ai-sorting/ai-checking-outputs/ai-testing-safety/ai-monitoringexamples.md/ai-sorting/ai-checking-outputs/ai-testing-safety/ai-monitoringexamples.md