guardrails-safety-filter-builder
Compare original and translation side by side
🇺🇸
Original
English🇨🇳
Translation
ChineseGuardrails & Safety Filter Builder
Guardrails与安全过滤器构建器
Build comprehensive safety systems for LLM applications.
为LLM应用构建全面的安全系统。
Safety Layers
安全防护层
- Input filtering: Block malicious prompts
- Output filtering: Redact sensitive data
- Topic constraints: Policy-based refusals
- PII detection: Mask personal information
- Prompt injection: Detect manipulation attempts
- 输入过滤:拦截恶意提示
- 输出过滤:脱敏敏感数据
- 主题约束:基于策略的拒绝
- PII检测:掩码个人信息
- 提示注入检测:识别操纵尝试
PII Detection & Redaction
PII检测与脱敏
python
import re
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
def redact_pii(text: str) -> str:
# Detect PII
results = analyzer.analyze(
text=text,
language='en',
entities=["EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "SSN"]
)
# Anonymize
anonymized = anonymizer.anonymize(text, results)
return anonymized.textpython
import re
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
def redact_pii(text: str) -> str:
# Detect PII
results = analyzer.analyze(
text=text,
language='en',
entities=["EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "SSN"]
)
# Anonymize
anonymized = anonymizer.anonymize(text, results)
return anonymized.textExample: "My email is john@example.com" → "My email is <EMAIL_ADDRESS>"
Example: "My email is john@example.com" → "My email is <EMAIL_ADDRESS>"
undefinedundefinedPrompt Injection Detection
提示注入检测
python
def detect_prompt_injection(user_input: str) -> bool:
"""Detect common prompt injection patterns"""
patterns = [
r'ignore (previous|above) instructions',
r'disregard (all|any) (prior|previous)',
r'you are now',
r'new instructions',
r'system:',
r'override',
]
for pattern in patterns:
if re.search(pattern, user_input, re.IGNORECASE):
return True
return Falsepython
def detect_prompt_injection(user_input: str) -> bool:
"""Detect common prompt injection patterns"""
patterns = [
r'ignore (previous|above) instructions',
r'disregard (all|any) (prior|previous)',
r'you are now',
r'new instructions',
r'system:',
r'override',
]
for pattern in patterns:
if re.search(pattern, user_input, re.IGNORECASE):
return True
return FalseBlock if detected
Block if detected
if detect_prompt_injection(user_input):
return "I cannot process that request."
undefinedif detect_prompt_injection(user_input):
return "I cannot process that request."
undefinedTopic Constraints
主题约束
python
undefinedpython
undefinedDefine allowed/disallowed topics
Define allowed/disallowed topics
POLICY = {
"allowed_topics": [
"product_features",
"troubleshooting",
"billing",
"account_management"
],
"disallowed_topics": [
"medical_advice",
"legal_advice",
"financial_advice",
"politics",
"violence"
],
"requires_disclaimer": [
"security_practices",
"data_privacy"
]
}
POLICY = {
"allowed_topics": [
"product_features",
"troubleshooting",
"billing",
"account_management"
],
"disallowed_topics": [
"medical_advice",
"legal_advice",
"financial_advice",
"politics",
"violence"
],
"requires_disclaimer": [
"security_practices",
"data_privacy"
]
}
Classify topic
Classify topic
def classify_topic(query: str) -> str:
classification_prompt = f"""
Classify this query into one of these topics:
{', '.join(POLICY['allowed_topics'] + POLICY['disallowed_topics'])}
Query: {query}
Return only the topic name.
"""
return llm(classification_prompt)def classify_topic(query: str) -> str:
classification_prompt = f"""
Classify this query into one of these topics:
{', '.join(POLICY['allowed_topics'] + POLICY['disallowed_topics'])}
Query: {query}
Return only the topic name.
"""
return llm(classification_prompt)Check policy
Check policy
def check_policy(query: str) -> dict:
topic = classify_topic(query)
if topic in POLICY["disallowed_topics"]:
return {
"allowed": False,
"reason": f"Cannot provide {topic}",
"refusal": REFUSAL_TEMPLATES[topic]
}
return {"allowed": True, "topic": topic}undefineddef check_policy(query: str) -> dict:
topic = classify_topic(query)
if topic in POLICY["disallowed_topics"]:
return {
"allowed": False,
"reason": f"Cannot provide {topic}",
"refusal": REFUSAL_TEMPLATES[topic]
}
return {"allowed": True, "topic": topic}undefinedRefusal Templates
拒绝模板
python
REFUSAL_TEMPLATES = {
"medical_advice": """
I cannot provide medical advice. Please consult with a healthcare
professional for medical concerns.
""",
"legal_advice": """
I cannot provide legal advice. For legal matters, please consult
with a qualified attorney.
""",
"out_of_scope": """
I'm designed to help with product documentation and support.
This question is outside my area of expertise.
""",
}
def refuse_safely(reason: str) -> str:
return REFUSAL_TEMPLATES.get(reason, REFUSAL_TEMPLATES["out_of_scope"])python
REFUSAL_TEMPLATES = {
"medical_advice": """
I cannot provide medical advice. Please consult with a healthcare
professional for medical concerns.
""",
"legal_advice": """
I cannot provide legal advice. For legal matters, please consult
with a qualified attorney.
""",
"out_of_scope": """
I'm designed to help with product documentation and support.
This question is outside my area of expertise.
""",
}
def refuse_safely(reason: str) -> str:
return REFUSAL_TEMPLATES.get(reason, REFUSAL_TEMPLATES["out_of_scope"])Output Validation
输出验证
python
def validate_output(output: str) -> dict:
"""Check output before returning to user"""
issues = []
# Check for PII
pii_results = analyzer.analyze(output, language='en')
if pii_results:
issues.append("Contains PII")
output = redact_pii(output)
# Check for banned phrases
banned_phrases = ["password", "api key", "secret"]
for phrase in banned_phrases:
if phrase.lower() in output.lower():
issues.append(f"Contains banned phrase: {phrase}")
# Check toxicity
toxicity_score = toxicity_classifier(output)
if toxicity_score > 0.7:
issues.append("High toxicity detected")
return {
"safe": len(issues) == 0,
"issues": issues,
"sanitized_output": output
}python
def validate_output(output: str) -> dict:
"""Check output before returning to user"""
issues = []
# Check for PII
pii_results = analyzer.analyze(output, language='en')
if pii_results:
issues.append("Contains PII")
output = redact_pii(output)
# Check for banned phrases
banned_phrases = ["password", "api key", "secret"]
for phrase in banned_phrases:
if phrase.lower() in output.lower():
issues.append(f"Contains banned phrase: {phrase}")
# Check toxicity
toxicity_score = toxicity_classifier(output)
if toxicity_score > 0.7:
issues.append("High toxicity detected")
return {
"safe": len(issues) == 0,
"issues": issues,
"sanitized_output": output
}Complete Guardrail Pipeline
完整的Guardrails防护流程
python
def apply_guardrails(user_input: str) -> dict:
# 1. Input validation
if detect_prompt_injection(user_input):
return {
"allowed": False,
"response": "Invalid request detected."
}
# 2. Policy check
policy_check = check_policy(user_input)
if not policy_check["allowed"]:
return {
"allowed": False,
"response": policy_check["refusal"]
}
# 3. Generate response
response = llm(user_input)
# 4. Output validation
validation = validate_output(response)
if not validation["safe"]:
return {
"allowed": True,
"response": validation["sanitized_output"],
"warnings": validation["issues"]
}
return {
"allowed": True,
"response": response
}python
def apply_guardrails(user_input: str) -> dict:
# 1. Input validation
if detect_prompt_injection(user_input):
return {
"allowed": False,
"response": "Invalid request detected."
}
# 2. Policy check
policy_check = check_policy(user_input)
if not policy_check["allowed"]:
return {
"allowed": False,
"response": policy_check["refusal"]
}
# 3. Generate response
response = llm(user_input)
# 4. Output validation
validation = validate_output(response)
if not validation["safe"]:
return {
"allowed": True,
"response": validation["sanitized_output"],
"warnings": validation["issues"]
}
return {
"allowed": True,
"response": response
}Best Practices
最佳实践
- Layer multiple defenses
- Log all blocked requests
- Provide helpful refusals
- Redact, don't reject when possible
- Regular pattern updates
- Human review of edge cases
- 多层防御机制
- 记录所有拦截请求
- 提供友好的拒绝回复
- 尽可能脱敏而非直接拒绝
- 定期更新检测规则
- 人工审核边缘案例
Output Checklist
输出检查清单
- PII detection implemented
- Prompt injection detection
- Topic classification
- Policy constraints defined
- Refusal templates written
- Output validation
- Logging/monitoring
- Test cases for bypasses
- 已实现PII检测
- 已实现提示注入检测
- 已实现主题分类
- 已定义策略约束
- 已编写拒绝模板
- 已实现输出验证
- 已配置日志/监控
- 已准备绕过测试用例