# langsmith-code-eval

Create code-based evaluators for LangSmith-traced agents with step-by-step collaborative guidance through inspection, evaluation logic, and testing.

## Installation

```
npx skill4agent add langchain-ai/lca-skills langsmith-code-eval
```

## Inspect the trace structure

Before writing an evaluator, find out where the data you need actually lives in a trace. Run the bundled script against your tracing project:

```
python scripts/inspect_trace.py PROJECT_NAME
```

The script recommends an extraction strategy, e.g. whether tool calls appear in `run.outputs["messages"]` or only on `run.child_runs`. You can also call it programmatically:

```python
from inspect_trace import inspect_trace_structure

structure = inspect_trace_structure("project-name")
if "extract_from_messages" in structure["recommendations"]:
    # Tool calls are in run.outputs["messages"]
    ...
```
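If you prefer to eyeball the structure yourself, an equivalent manual check with the LangSmith SDK looks like this (a sketch; `project-name` is a placeholder and `LANGSMITH_API_KEY` must be set):

```python
from langsmith import Client

client = Client()  # reads LANGSMITH_API_KEY from the environment

# Peek at one root run; its output keys hint at where tool calls live.
run = next(client.list_runs(project_name="project-name", is_root=True, limit=1))
print(run.name, list((run.outputs or {}).keys()))
```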
## Write the evaluator

Create `eval_[name].py`, replacing `[name]` with the behavior you are checking:

```python
from langsmith.schemas import Run, Example

def evaluate_[name](run: Run, example: Example) -> dict:
    """Evaluate [specific behavior]."""
    # Extract data (based on Step 4)
    messages = run.outputs.get("messages", [])
    category = example.metadata.get("category") if example.metadata else None

    # Evaluation logic (based on Step 5)
    score = ...  # set to 1 (pass) or 0 (fail)

    return {
        "key": "evaluator_name",
        "score": score,  # 1 = pass, 0 = fail
        "comment": "Specific feedback explaining the score",
    }
```
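As a concrete illustration, here is a hypothetical filled-in evaluator that passes when the trace ends with a non-empty assistant message (the key name and the check are illustrative, not part of the skill):

```python
from langsmith.schemas import Run, Example

def evaluate_final_answer_nonempty(run: Run, example: Example) -> dict:
    """Pass if the trace ends with a non-empty assistant message."""
    messages = (run.outputs or {}).get("messages", [])
    final = messages[-1] if messages else {}
    answered = isinstance(final, dict) and bool(str(final.get("content") or "").strip())
    return {
        "key": "final_answer_nonempty",
        "score": 1 if answered else 0,
        "comment": "Final message has content" if answered else "Final message empty or missing",
    }
```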
## Common extraction patterns

### Tool calls in messages

If inspection recommended `extract_from_messages`, assistant messages carry tool calls in the OpenAI format:

```python
import json

for msg in messages:
    if msg.get("role") == "assistant" and msg.get("tool_calls"):
        for tc in msg["tool_calls"]:
            tool_name = tc["function"]["name"]
            args = json.loads(tc["function"]["arguments"])
```
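To reuse the pattern across evaluators, it can be wrapped in a small helper (a sketch; the defensive `JSONDecodeError` handling is an addition, assuming the same OpenAI message format):

```python
import json

def extract_tool_calls(messages: list) -> list:
    """Collect (tool_name, args) pairs from assistant messages."""
    calls = []
    for msg in messages:
        if msg.get("role") == "assistant" and msg.get("tool_calls"):
            for tc in msg["tool_calls"]:
                name = tc["function"]["name"]
                try:
                    args = json.loads(tc["function"]["arguments"])
                except (TypeError, json.JSONDecodeError):
                    args = {}  # malformed or missing arguments
                calls.append((name, args))
    return calls
```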
### Category-dependent scoring

Dataset examples can carry a `category` in their metadata; score against the behavior expected for each category (here `made_db_call` comes from the tool-call extraction above):

```python
category = example.metadata.get("category", "unknown")
if category == "stock":
    score = 1 if made_db_call else 0      # stock questions should query the DB
elif category == "weather":
    score = 1 if not made_db_call else 0  # weather questions should not
```
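Putting extraction and scoring together, a full evaluator for this routing check might look like the following sketch (the tool name `query_database` and the key `tool_routing` are assumptions for illustration):

```python
from langsmith.schemas import Run, Example

def evaluate_tool_routing(run: Run, example: Example) -> dict:
    """Hypothetical: stock questions should call the DB tool, weather ones should not."""
    messages = (run.outputs or {}).get("messages", [])
    made_db_call = any(
        tc["function"]["name"] == "query_database"  # assumed tool name
        for msg in messages
        if msg.get("role") == "assistant" and msg.get("tool_calls")
        for tc in msg["tool_calls"]
    )
    category = example.metadata.get("category", "unknown") if example.metadata else "unknown"
    if category == "stock":
        score = 1 if made_db_call else 0
    elif category == "weather":
        score = 1 if not made_db_call else 0
    else:
        score = 0
    return {
        "key": "tool_routing",
        "score": score,
        "comment": f"category={category}, made_db_call={made_db_call}",
    }
```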
## Run the experiment

Create `run_experiment_with_eval.py`:

```python
import asyncio
from langsmith import aevaluate
from [agent_module] import [agent_function]
from eval_[name] import evaluate_[name]
from dotenv import load_dotenv
load_dotenv()
async def agent_wrapper(inputs: dict) -> dict:
    result = await [agent_function](inputs["question"])
    return result

async def main():
    results = await aevaluate(
        agent_wrapper,
        data="DATASET_NAME",
        evaluators=[evaluate_[name]],
        experiment_prefix="eval-test",
        max_concurrency=5,
    )
    print(f"Results: {results}")

if __name__ == "__main__":
    asyncio.run(main())
```

Run it:

```
uv run python run_experiment_with_eval.py
```
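If your agent is synchronous, the same flow works with the sync `evaluate` entry point (a minimal self-contained sketch; `my_agent`, the evaluator, and `DATASET_NAME` are placeholders, and the dataset must already exist in LangSmith):

```python
from langsmith import evaluate
from langsmith.schemas import Run, Example

def my_agent(inputs: dict) -> dict:
    # Placeholder synchronous agent; replace with your own.
    return {"messages": [{"role": "assistant", "content": "hello"}]}

def evaluate_nonempty(run: Run, example: Example) -> dict:
    messages = (run.outputs or {}).get("messages", [])
    ok = bool(messages) and bool(messages[-1].get("content"))
    return {"key": "nonempty", "score": 1 if ok else 0}

results = evaluate(
    my_agent,
    data="DATASET_NAME",
    evaluators=[evaluate_nonempty],
    experiment_prefix="eval-test-sync",
)
```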
## Troubleshooting

### Tool calls missing from the run outputs

Most chat agents return their full message history, so check `run.outputs["messages"]` first:

```python
messages = run.outputs.get("messages", [])
for msg in messages:
    if msg.get("role") == "assistant" and msg.get("tool_calls"):
        # Tool calls are here
        ...
```
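If the messages list is empty or absent, dumping one run's raw outputs usually reveals what the agent actually returned (a sketch using the LangSmith SDK; `project-name` is a placeholder):

```python
import json
from langsmith import Client

client = Client()
run = next(client.list_runs(project_name="project-name", is_root=True, limit=1))
# default=str handles datetimes and UUIDs in the payload
print(json.dumps(run.outputs, indent=2, default=str))
```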
### Tool calls only on child runs

If the outputs don't include messages, walk the run tree instead:

```python
def traverse_runs(run):
    if run.name == "tool_name":
        # Found it
        ...
    if hasattr(run, "child_runs") and run.child_runs:
        for child in run.child_runs:
            traverse_runs(child)
```
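A variant that returns the matches instead of just visiting them (a sketch; run objects are assumed to expose `name` and `child_runs` as above):

```python
def find_runs_by_name(run, name: str, found=None) -> list:
    """Recursively collect runs whose name matches, e.g. a tool name."""
    found = [] if found is None else found
    if run.name == name:
        found.append(run)
    for child in getattr(run, "child_runs", None) or []:
        find_runs_by_name(child, name, found)
    return found
```

Note that `child_runs` is only populated when the run is fetched with children loaded, e.g. `client.read_run(run_id, load_child_runs=True)`.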
### `example.metadata` may be `None`

Guard metadata lookups so examples without metadata don't raise:

```python
category = example.metadata.get("category") if example.metadata else None
```

## Example files

- `inspect_trace.py`
- `agent_v4.py`
- `eval_tool_call_schema.py`
- `run_experiment_with_code_eval.py`