Loading...
Loading...
Use this skill for ANY question about CREATING evaluators. Covers creating custom metrics, LLM as Judge evaluators, code-based evaluators, and uploading evaluation logic to LangSmith. Includes basic usage of evaluators to run evaluations.
npx skill4agent add langchain-ai/langchain-skills langsmith-evaluator
LANGSMITH_API_KEY=lsv2_pt_your_api_key_here # Required
LANGSMITH_WORKSPACE_ID=your-workspace-id # Optional: for org-scoped keys
OPENAI_API_KEY=your_openai_key # For LLM as Judgepip install langsmith langchain-openai python-dotenvdef evaluator_name(inputs: dict, outputs: dict, reference_outputs: dict = None) -> dict:
"""Evaluate a single prediction."""
# inputs: the dataset example's inputs; outputs: the agent's actual outputs.
user_query = inputs.get("query", "")
agent_response = outputs.get("expected_response", "")
# reference_outputs may be None (unlabeled dataset) — guard before reading.
expected = reference_outputs.get("expected_response", "") if reference_outputs else None
return {
"key": "metric_name", # Metric identifier
"score": 0.85, # Number or boolean
"comment": "Reason..." # Optional explanation
}def evaluator_name(run, example):
"""Evaluate using run/example dicts.
Args:
run: Dict with run["outputs"] containing agent outputs
example: Dict with example["outputs"] containing expected outputs
"""
# "expected_response" is the key the agent wrapper stores its answer under.
agent_response = run["outputs"].get("expected_response", "")
expected = example["outputs"].get("expected_response", "")
return {
# Alternative return shape: the metric name is used directly as the dict key.
"metric_name": 0.85, # Metric name as key directly
"comment": "Reason..." # Optional explanation
}from typing import TypedDict, Annotated
from langchain_openai import ChatOpenAI
# Structured-output schema: constrains the judge model to return exactly these fields.
class AccuracyGrade(TypedDict):
"""Structured evaluation output."""
reasoning: Annotated[str, ..., "Explain your reasoning"]
is_accurate: Annotated[bool, ..., "True if response is accurate"]
confidence: Annotated[float, ..., "Confidence 0.0-1.0"]
# Configure model with structured output
# temperature=0 keeps grading deterministic; strict json_schema enforces the TypedDict shape.
judge = ChatOpenAI(model="gpt-4o-mini", temperature=0).with_structured_output(
AccuracyGrade, method="json_schema", strict=True
)
async def accuracy_evaluator(run, example):
"""Evaluate factual accuracy for LangSmith upload."""
expected = example["outputs"].get('expected_response', '')
agent_output = run["outputs"].get('expected_response', '')
prompt = f"""Expected: {expected}
Agent Output: {agent_output}
Evaluate accuracy:"""
# ainvoke returns an AccuracyGrade dict thanks to with_structured_output above.
grade = await judge.ainvoke([{"role": "user", "content": prompt}])
return {
"accuracy": 1 if grade["is_accurate"] else 0, # boolean grade mapped to 1/0 score
"comment": f"{grade['reasoning']} (confidence: {grade['confidence']})"
}def exact_match_evaluator(run, example):
"""Check if output exactly matches expected."""
# Normalize whitespace and case before comparing, so formatting differences don't fail the match.
output = run["outputs"].get("expected_response", "").strip().lower()
expected = example["outputs"].get("expected_response", "").strip().lower()
match = output == expected
return {
"exact_match": 1 if match else 0, # 1/0 score rather than bool
"comment": f"Match: {match}"
}def trajectory_evaluator(run, example):
"""Evaluate tool call sequence."""
trajectory = run["outputs"].get("expected_trajectory", [])
expected = example["outputs"].get("expected_trajectory", [])
# Exact sequence match
exact = trajectory == expected
# All required tools used (order-agnostic)
all_tools = set(expected).issubset(set(trajectory))
# Efficiency: count extra steps
# NOTE(review): goes negative when the agent took fewer steps than expected.
extra_steps = len(trajectory) - len(expected)
return {
"trajectory_match": 1 if exact else 0, # only the exact ordered sequence scores 1
"comment": f"Exact: {exact}, All tools: {all_tools}, Extra: {extra_steps}"
}def single_step_evaluator(run, example):
"""Evaluate single node output."""
output = run["outputs"].get("output", {})
expected = example["outputs"].get("expected_output", {})
node_name = run["outputs"].get("node_name", "")
# For classification nodes
# Classification nodes compare only the label, case-insensitively.
if "classification" in node_name:
classification = output.get("classification", "")
expected_class = expected.get("classification", "")
match = classification.lower() == expected_class.lower()
return {
"classification_correct": 1 if match else 0,
"comment": f"Output: {classification}, Expected: {expected_class}"
}
# For other nodes
# Non-classification nodes require deep equality of the whole output dict.
match = output == expected
return {
"output_match": 1 if match else 0,
"comment": f"Match: {match}"
}from langsmith import Client
client = Client()
# Define your agent function
def run_agent(inputs: dict) -> dict:
    """Your agent invocation logic.

    Returns the agent's answer under the "expected_response" key, which is
    the key the example evaluators read from run["outputs"].
    """
    # your_agent is a placeholder — replace with your own agent object.
    result = your_agent.invoke(inputs)
    return {"expected_response": result}
# Run evaluation
# aevaluate runs run_agent over every example in the named dataset and applies each evaluator.
results = await client.aevaluate(
run_agent,
data="Skills: Final Response", # Dataset name
evaluators=[
exact_match_evaluator,
accuracy_evaluator,
trajectory_evaluator
],
experiment_prefix="skills-eval-v1", # Groups runs under one experiment in the LangSmith UI
max_concurrency=4 # Parallel examples; lower this if you hit rate limits
)
Helper scripts live in skills/langsmith-evaluator/scripts/. Uploaded evaluators use the (run, example) signature: run["outputs"] holds the agent's outputs and example["outputs"] holds the expected outputs.
# my_project/evaluators/custom_evals.py
def my_custom_evaluator(run, example):
    """Your custom evaluation logic.

    Args:
        run: Dict with run["outputs"] - agent outputs
        example: Dict with example["outputs"] - expected outputs

    Returns:
        Dict with metric_name as key, score as value, optional comment
    """
    # Extract relevant data
    agent_output = run["outputs"].get("expected_trajectory", [])
    expected = example["outputs"].get("expected_trajectory", [])
    # Your custom logic here: an exact trajectory match scores 1, anything else 0.
    match = agent_output == expected
    return {
        "my_metric": 1 if match else 0,
        "comment": "Custom reasoning here"
    }
# List existing evaluators
python upload_evaluators.py list
# Upload evaluator
python upload_evaluators.py upload my_evaluators.py \
--name "Trajectory Match" \
--function trajectory_match \
--dataset "Skills: Trajectory" \
--replace
# Delete evaluator (will prompt for confirmation)
python upload_evaluators.py delete "Trajectory Match"
# Skip confirmation prompts (use with caution)
python upload_evaluators.py delete "Trajectory Match" --yes
python upload_evaluators.py upload my_evaluators.py \
--name "Trajectory Match" \
--function trajectory_match \
--replace --yes
Available flags: --name, --function, --dataset, --project, --sample-rate, --replace, --yes.
# 1. Create evaluators file
cat > evaluators.py <<'EOF'
def exact_match(run, example):
    """Check if output exactly matches expected.

    Args:
        run: Dict with run["outputs"] - agent outputs
        example: Dict with example["outputs"] - expected outputs

    Returns:
        Dict with an integer "exact_match" score (1/0) and a comment.
    """
    # Normalize whitespace and case so formatting differences don't fail the match.
    output = run["outputs"].get("expected_response", "").strip().lower()
    expected = example["outputs"].get("expected_response", "").strip().lower()
    match = output == expected
    return {
        "exact_match": 1 if match else 0,
        "comment": f"Match: {match}"
    }
EOF
# 2. Upload to LangSmith
python upload_evaluators.py upload evaluators.py \
--name "Exact Match" \
--function exact_match \
--dataset "Skills: Final Response" \
--replace
# 3. Evaluator runs automatically on new dataset runs