# LangSmith Dataset Generation Skill
Use this skill for ANY question about creating test or evaluation datasets for LangChain agents. Covers generating datasets from traces (final_response, single_step, trajectory, RAG types), uploading to LangSmith, and managing evaluation data.
npx skill4agent add jackjin1997/clawforge langsmith-dataset

Environment variables:
LANGSMITH_API_KEY=lsv2_pt_your_api_key_here # Required
LANGSMITH_PROJECT=your-project-name # Optional: default project
LANGSMITH_WORKSPACE_ID=your-workspace-id # Optional: for org-scoped keys

Install dependencies:
pip install langsmith click rich python-dotenv

Scripts: skills/langsmith-dataset/scripts/generate_datasets.py and query_datasets.py

Common flags: --root-run-name <name>, --limit <n>, --last-n-minutes <n>, --output <path>, --upload <name>, --replace, --yes

Typical trace hierarchy:
Depth 0: Root agent (e.g., "LangGraph")
├── Depth 1: Middleware/chains (model, tools, SummarizationMiddleware)
│ ├── Depth 2: Tool calls (sql_db_query, retriever, etc.)
│ └── Depth 2: LLM calls (ChatOpenAI, ChatAnthropic)
└── Depth 3+: Nested subagent calls

Use --root-run-name to filter traces by their root run name (e.g., --root-run-name LangGraph).

# Basic usage
python generate_datasets.py --type final_response \
--project my-project \
--root-run-name LangGraph \
--limit 30 \
--output /tmp/final_response.json
# With custom output fields
python generate_datasets.py --type final_response \
--project my-project \
--output-fields "answer,result" \
--output /tmp/final.json
# Messages only (ignore output dict keys)
python generate_datasets.py --type final_response \
--project my-project \
--messages-only \
  --output /tmp/final.json

Example output:
{
"trace_id": "...",
"inputs": {"query": "What are the top 3 genres?"},
"outputs": {
"expected_response": "The top 3 genres based on the number of tracks are:\n\n1. Rock with 1,297 tracks\n2. Latin with 579 tracks\n3. Metal with 374 tracks"
}
}

Use --output-fields to choose which keys are extracted from the output dict.

# Extract all occurrences (default)
python generate_datasets.py --type single_step \
--project my-project \
--root-run-name LangGraph \
--run-name model \
--output /tmp/single_step.json
# Sample 2 occurrences per trace
python generate_datasets.py --type single_step \
--project my-project \
--root-run-name LangGraph \
--run-name model \
--sample-per-trace 2 \
--output /tmp/single_step_sampled.json
# Target specific tool at depth 2
python generate_datasets.py --type single_step \
--project my-project \
--root-run-name LangGraph \
--run-name sql_db_query \
  --output /tmp/sql_query.json

Example output:
{
"trace_id": "...",
"run_id": "...",
"occurrence": 2,
"inputs": {
"messages": [
{"type": "human", "content": "What are the top 3 genres?"},
{"type": "ai", "content": "", "tool_calls": [...]},
{"type": "tool", "content": "...results..."},
...
]
},
"outputs": {
"expected_output": {
"messages": [
{"type": "ai", "content": "", "tool_calls": [...]}
]
},
"node_name": "model"
}
}

The occurrence field records which match within the trace was captured; --sample-per-trace limits how many are taken per trace. --run-name targets a specific node, such as model or tools.

# Include all tool calls (all depths)
python generate_datasets.py --type trajectory \
--project my-project \
--root-run-name LangGraph \
--limit 30 \
--output /tmp/trajectory_all.json
# Only tool calls up to depth 2
python generate_datasets.py --type trajectory \
--project my-project \
--root-run-name LangGraph \
--depth 2 \
--output /tmp/trajectory_depth2.json
# Only root-level tool calls (depth 0) - usually empty if tools are at depth 2+
python generate_datasets.py --type trajectory \
--project my-project \
--depth 0 \
  --output /tmp/trajectory_root.json

Example output:
{
"trace_id": "...",
"inputs": {"query": "What are the top 3 genres?"},
"outputs": {
"expected_trajectory": [
"sql_db_list_tables",
"sql_db_schema",
"sql_db_query_checker",
"sql_db_query"
]
}
}

The --depth flag bounds how deep tool calls are collected (e.g., --depth 0, --depth 1, --depth 2).

python generate_datasets.py --type rag \
--project my-project \
--limit 30 \
  --output /tmp/rag_ds.csv # Supports .json or .csv

Example CSV output:
question,retrieved_chunks,answer,cited_chunks
"How do I...","Chunk 1\n\nChunk 2","The answer is...","[\"Chunk 1\"]"

# JSON output (default)
python generate_datasets.py --type trajectory --project my-project --output ds.json
# CSV output (use .csv extension)
python generate_datasets.py --type trajectory --project my-project --output ds.csv

# Generate and upload in one command
python generate_datasets.py --type trajectory \
--project my-project \
--root-run-name LangGraph \
--limit 50 \
--output /tmp/trajectory_ds.json \
--upload "Skills: Trajectory"
# Use --replace to overwrite existing dataset
python generate_datasets.py --type final_response \
--project my-project \
--output /tmp/final.json \
--upload "Skills: Final Response" \
  --replace

# List all datasets
python query_datasets.py list-datasets
# Filter by name pattern
python query_datasets.py list-datasets | grep "Skills:"
# View dataset examples
python query_datasets.py show "Skills: Trajectory" --limit 5
# View local file
python query_datasets.py view-file /tmp/trajectory_ds.json --limit 3
# Analyze structure
python query_datasets.py structure /tmp/trajectory_ds.json
# Export from LangSmith to local
python query_datasets.py export "Skills: Final Response" /tmp/exported.json --limit 100

Tips: use --root-run-name to filter by root run, --last-n-minutes 1440 to restrict to recent traces, --sample-per-trace 2 to limit samples per trace, --depth 2 to bound trajectory depth, query_datasets.py view-file to inspect local files, and --replace to overwrite an existing dataset.

# 1. Generate fresh traces (if needed)
python tests/test_agent.py --batch # Your test agent
# 2. Generate all dataset types from LangGraph traces
python generate_datasets.py --type final_response \
--project skills --root-run-name LangGraph --limit 10 \
--output /tmp/final.json --upload "Skills: Final Response" --replace
python generate_datasets.py --type single_step \
--project skills --root-run-name LangGraph --run-name model \
--sample-per-trace 2 --limit 10 \
--output /tmp/model.json --upload "Skills: Single Step (model)" --replace
python generate_datasets.py --type trajectory \
--project skills --root-run-name LangGraph --limit 10 \
--output /tmp/traj.json --upload "Skills: Trajectory (all depths)" --replace
python generate_datasets.py --type trajectory \
--project skills --root-run-name LangGraph --depth 2 --limit 10 \
--output /tmp/traj_d2.json --upload "Skills: Trajectory (depth=2)" --replace
# 3. Review in LangSmith UI
# Visit https://smith.langchain.com → Datasets → Filter for "Skills:"
# 4. Query locally if needed
python query_datasets.py show "Skills: Final Response" --limit 3

Related flags and commands: --root-run-name, --messages-only, --depth (e.g., --depth 2), python query_traces.py trace <id> --show-hierarchy, --sample-per-trace 2, --replace