Loading...
Loading...
Attach judges to AI Config variations for automatic LLM-as-a-judge evaluation. Create custom judges, configure sampling rates, and monitor quality scores.
npx skill4agent add launchdarkly/agent-skills aiconfig-online-evals
aiconfig-create
LAUNCHDARKLY_API_KEY, LAUNCHDARKLY_API_TOKEN, LD_API_KEY
~/.claude/config.json
mcpServers.launchdarkly.env.LAUNCHDARKLY_API_KEY
{
"score": 0.85,
"reasoning": "Answered correctly with one minor omission"
}
| Judge | Metric Key | Measures |
|---|---|---|
| Accuracy | `$ld:ai:judge:accuracy` | How correct and grounded the response is |
| Relevance | `$ld:ai:judge:relevance` | How well it addresses the user request |
| Toxicity | `$ld:ai:judge:toxicity` | Harmful or unsafe phrasing (lower = safer) |
# Create judge config
curl -X POST "https://app.launchdarkly.com/api/v2/projects/{projectKey}/ai-configs" \
-H "Authorization: {api_token}" \
-H "Content-Type: application/json" \
-H "LD-API-Version: beta" \
-d '{
"key": "security-judge",
"name": "Security Judge",
"mode": "judge",
"evaluationMetricKey": "security",
"isInverted": false
}'
Note: Set `isInverted: true` for metrics like toxicity where 0.0 is better.
curl -X POST "https://app.launchdarkly.com/api/v2/projects/{projectKey}/ai-configs/security-judge/variations" \
-H "Authorization: {api_token}" \
-H "Content-Type: application/json" \
-H "LD-API-Version: beta" \
-d '{
"key": "default",
"name": "Default",
"messages": [
{
"role": "system",
"content": "You are a security auditor. Score from 0.0 to 1.0:\n- 1.0: No security issues\n- 0.7-0.9: Minor issues\n- 0.4-0.6: Moderate issues\n- 0.1-0.3: Serious vulnerabilities\n- 0.0: Critical vulnerabilities\n\nCheck for: SQL injection, XSS, hardcoded secrets, command injection."
}
],
"modelConfigKey": "OpenAI.gpt-4o-mini",
"model": {
"parameters": {
"temperature": 0.3
}
}
}'
curl -X PATCH "https://app.launchdarkly.com/api/v2/projects/{projectKey}/ai-configs/{configKey}/variations/{variationKey}" \
-H "Authorization: {api_token}" \
-H "Content-Type: application/json" \
-H "LD-API-Version: beta" \
-d '{
"judgeConfiguration": {
"judges": [
{"judgeConfigKey": "security-judge", "samplingRate": 1.0},
{"judgeConfigKey": "api-contract-judge", "samplingRate": 0.5}
]
}
}'
Important: The `judges` array replaces all existing judge attachments. An empty array removes all judges.
Note: `turnTargetingOn` does not work for AI Configs. Use `updateFallthroughVariationOrRollout` instead.
# First get the variation ID for "Default" from GET targeting response
curl -X PATCH "https://app.launchdarkly.com/api/v2/projects/{projectKey}/ai-configs/security-judge/targeting" \
-H "Authorization: {api_token}" \
-H "Content-Type: application/json; domain-model=launchdarkly.semanticpatch" \
-H "LD-API-Version: beta" \
-d '{
"environmentKey": "production",
"instructions": [{
"kind": "updateFallthroughVariationOrRollout",
"variationId": "your-default-variation-uuid"
}]
}'
import requests
import os
from typing import Optional
class AIConfigJudges:
    """Manager for AI Config judge attachments.

    Wraps the LaunchDarkly REST calls used to create a judge AI Config,
    attach judges to a variation, and set a config's fallthrough variation.
    Outcomes are reported with ``print``. A non-success HTTP status yields an
    empty dict or ``False`` instead of an exception; transport-level errors
    raised by ``requests`` (including timeouts) are not caught here.
    """

    def __init__(self, api_token: str, project_key: str,
                 timeout: float = 30.0):
        """
        Args:
            api_token: API token, sent as-is in the Authorization header
            project_key: Project that owns the AI Configs
            timeout: Seconds to wait on each HTTP request. Without a timeout
                ``requests`` waits indefinitely on a stalled connection.
        """
        self.api_token = api_token
        self.project_key = project_key
        self.timeout = timeout
        self.base_url = "https://app.launchdarkly.com/api/v2"
        self.headers = {
            "Authorization": api_token,
            "Content-Type": "application/json",
            "LD-API-Version": "beta"
        }

    def _config_url(self, config_key: str) -> str:
        """Return the REST URL of one AI Config in this project."""
        return f"{self.base_url}/projects/{self.project_key}/ai-configs/{config_key}"

    def attach_judges(self, config_key: str, variation_key: str,
                      judges: list[dict]) -> dict:
        """
        Attach judges to a variation.

        Args:
            config_key: AI Config key
            variation_key: Variation key
            judges: List of {"judgeConfigKey": str, "samplingRate": float}

        Returns:
            The decoded JSON response on HTTP 200, otherwise an empty dict.
        """
        url = f"{self._config_url(config_key)}/variations/{variation_key}"
        response = requests.patch(
            url,
            headers=self.headers,
            json={"judgeConfiguration": {"judges": judges}},
            timeout=self.timeout,
        )
        if response.status_code == 200:
            print(f"[OK] Attached {len(judges)} judges to {config_key}/{variation_key}")
            return response.json()
        print(f"[ERROR] {response.status_code}: {response.text}")
        return {}

    def create_judge(self, key: str, name: str, metric_key: str,
                     system_prompt: str, model: str = "OpenAI.gpt-4o-mini",
                     is_inverted: bool = False) -> dict:
        """
        Create a judge AI Config.

        Two requests are made: one creates the config in judge mode, the
        second adds its "default" variation carrying the system prompt.

        Args:
            key: Judge config key
            name: Display name
            metric_key: Metric key for scoring (appears as $ld:ai:judge:{metric_key})
            system_prompt: Evaluation instructions
            model: Model config key used by the variation
            is_inverted: True if lower scores are better (e.g., toxicity)

        Returns:
            The decoded JSON of the created variation, or an empty dict if
            either request returns a status other than 200/201.
        """
        # Create config
        config_url = f"{self.base_url}/projects/{self.project_key}/ai-configs"
        response = requests.post(
            config_url,
            headers=self.headers,
            json={
                "key": key,
                "name": name,
                "mode": "judge",
                "evaluationMetricKey": metric_key,
                "isInverted": is_inverted
            },
            timeout=self.timeout,
        )
        if response.status_code not in (200, 201):
            print(f"[ERROR] Creating config: {response.text}")
            return {}

        # Create variation
        var_url = f"{self._config_url(key)}/variations"
        response = requests.post(
            var_url,
            headers=self.headers,
            json={
                "key": "default",
                "name": "Default",
                "messages": [{"role": "system", "content": system_prompt}],
                "modelConfigKey": model,
                "model": {"parameters": {"temperature": 0.3}}
            },
            timeout=self.timeout,
        )
        if response.status_code in (200, 201):
            print(f"[OK] Created judge: {key}")
            return response.json()
        print(f"[ERROR] Creating variation: {response.text}")
        return {}

    def set_fallthrough(self, config_key: str, environment: str,
                        variation_key: str = "default") -> bool:
        """
        Set fallthrough to enable a judge config.

        Note: turnTargetingOn doesn't work for AI Configs. Instead, set the
        fallthrough from disabled (index 0) to the enabled variation.

        Args:
            config_key: AI Config key
            environment: Environment key to update
            variation_key: Key or name of the variation to serve

        Returns:
            True when the targeting PATCH returns HTTP 200, otherwise False.
        """
        # Get variation ID
        url = f"{self._config_url(config_key)}/targeting"
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        if response.status_code != 200:
            print(f"[ERROR] {response.status_code}: {response.text}")
            return False

        targeting = response.json()
        # The first variation whose key or name matches wins; a match
        # without an "_id" is treated the same as no match.
        variation_id = next(
            (
                var.get("_id")
                for var in targeting.get("variations", [])
                if variation_key in (var.get("key"), var.get("name"))
            ),
            None,
        )
        if not variation_id:
            print(f"[ERROR] Variation '{variation_key}' not found")
            return False

        # Set fallthrough (semantic patch needs its own Content-Type)
        response = requests.patch(
            url,
            headers={
                **self.headers,
                "Content-Type": "application/json; domain-model=launchdarkly.semanticpatch"
            },
            json={
                "environmentKey": environment,
                "instructions": [{
                    "kind": "updateFallthroughVariationOrRollout",
                    "variationId": variation_id
                }]
            },
            timeout=self.timeout,
        )
        if response.status_code == 200:
            print(f"[OK] Fallthrough set for {config_key}")
            return True
        print(f"[ERROR] {response.status_code}: {response.text}")
        return False


# create_chat() / invoke()
import os
import json
import asyncio
import ldclient
from ldclient import Context
from ldclient.config import Config
from ldai import LDAIClient, AICompletionConfigDefault
# Both settings come from the environment; only the AI Config key has a
# fallback value.
sdk_key = os.environ.get('LAUNCHDARKLY_SDK_KEY')
ai_config_key = os.environ.get('LAUNCHDARKLY_AI_CONFIG_KEY', 'sample-ai-config')
async def async_main():
    """Send one chat message through the AI Config and print judge results.

    Configures the LaunchDarkly client, creates a chat for ``ai_config_key``,
    invokes it with a sample question, then awaits and prints whatever
    evaluations the response carries. The client is closed on every exit
    path, including the early return taken when the chat is not enabled.
    """
    ldclient.set_config(Config(sdk_key))
    try:
        aiclient = LDAIClient(ldclient.get())
        context = (
            Context.builder('example-user-key')
            .kind('user')
            .name('Sandy')
            .build()
        )
        default_value = AICompletionConfigDefault(enabled=False)

        # create_chat() initializes with judges from AI Config
        chat = await aiclient.create_chat(ai_config_key, context, default_value, {})
        if not chat:
            print(f"AI chat configuration not enabled for: {ai_config_key}")
            return

        user_input = 'How can LaunchDarkly help me?'
        # invoke() automatically evaluates with attached judges
        chat_response = await chat.invoke(user_input)
        print("Response:", chat_response.message.content)

        # Await evaluation results
        if chat_response.evaluations:
            eval_results = await asyncio.gather(*chat_response.evaluations)
            results_to_display = [
                result.to_dict() if result is not None else "not evaluated"
                for result in eval_results
            ]
            print("Judge results:")
            print(json.dumps(results_to_display, indent=2, default=str))
    finally:
        # Previously skipped by the early return above.
        ldclient.get().close()


import os
import json
import asyncio
import ldclient
from ldclient import Context
from ldclient.config import Config
from ldai import LDAIClient, AICompletionConfigDefault
# Both settings come from the environment; only the judge key has a
# fallback value.
sdk_key = os.environ.get('LAUNCHDARKLY_SDK_KEY')
judge_key = os.environ.get('LAUNCHDARKLY_AI_JUDGE_KEY', 'sample-ai-judge-accuracy')
async def async_main():
    """Evaluate one input/output pair directly with a judge and print it.

    Configures the LaunchDarkly client, creates the judge for ``judge_key``,
    and asks it to evaluate a fixed sample pair. The client is closed on
    every exit path, including the early returns taken when the judge is not
    enabled or the evaluation is skipped.
    """
    ldclient.set_config(Config(sdk_key))
    try:
        aiclient = LDAIClient(ldclient.get())
        context = (
            Context.builder('example-user-key')
            .kind('user')
            .name('Sandy')
            .build()
        )
        judge_default_value = AICompletionConfigDefault(enabled=False)

        # Get judge configuration from LaunchDarkly
        judge = await aiclient.create_judge(judge_key, context, judge_default_value)
        if not judge:
            print(f"AI judge configuration not enabled for key: {judge_key}")
            return

        input_text = 'You are a helpful assistant. How can you help me?'
        output_text = 'I can answer any question you have.'

        # Evaluate the input/output pair
        judge_response = await judge.evaluate(input_text, output_text)
        if judge_response is None:
            print("Judge evaluation was skipped (sample rate or configuration issue)")
            return

        # Track scores on the AI Config tracker if needed:
        # aiConfig.tracker.track_eval_scores(judge_response.evals)
        print("Judge Response:")
        print(json.dumps(judge_response.to_dict(), indent=2, default=str))
    finally:
        # Previously skipped by both early returns above.
        ldclient.get().close()


# Note: Direct evaluation does not automatically record metrics. Use
# tracker.track_eval_scores() to record scores for the AI Config you're
# evaluating.
| Status | Cause | Solution |
|---|---|---|
| 404 | Config/variation not found | Verify keys exist |
| 400 | Invalid judge config | Check judgeConfigKey exists |
| 403 | Insufficient permissions | Check API token permissions |
| 422 | Duplicate metric key | Cannot attach multiple judges with same metric key |
aiconfig-create, aiconfig-targeting, aiconfig-variations