LLM observability platform for tracing, evaluation, and monitoring. Use when debugging LLM applications, evaluating model outputs against datasets, monitoring production systems, or building systematic testing pipelines for AI applications.
Install the skill and the LangSmith SDK:

```bash
npx skill4agent add orchestra-research/ai-research-skills langsmith-observability
pip install langsmith
```
Set your API key and enable tracing:

```bash
# Set environment variables
export LANGSMITH_API_KEY="your-api-key"
export LANGSMITH_TRACING=true
```
Trace any function by decorating it with `@traceable`:

```python
from langsmith import traceable
from openai import OpenAI

client = OpenAI()

@traceable
def generate_response(prompt: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

# Automatically traced to LangSmith
result = generate_response("What is machine learning?")
```
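`@traceable` also accepts trace configuration. A minimal sketch reusing the `generate_response` above; the wrapper name and the tag/metadata values are illustrative:

```python
from langsmith import traceable

# Hypothetical wrapper around generate_response with explicit trace configuration
@traceable(
    name="generate_response",        # display name in the LangSmith UI
    run_type="llm",
    tags=["prod"],
    metadata={"model": "gpt-4o"},
)
def generate_response_tagged(prompt: str) -> str:
    return generate_response(prompt)

# Tags and metadata can also be supplied per call
generate_response_tagged(
    "What is machine learning?",
    langsmith_extra={"tags": ["experiment-a"], "metadata": {"user_id": "123"}},
)
```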
Alternatively, wrap the OpenAI client so every call is traced without decorating each function:

```python
from langsmith.wrappers import wrap_openai
from openai import OpenAI

# Wrap client for automatic tracing
client = wrap_openai(OpenAI())

# All calls automatically traced
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}]
)
```
Nest runs by calling traceable functions from other traceable functions; `run_type` labels each step:

```python
from langsmith import traceable

@traceable(run_type="chain")
def process_query(query: str) -> str:
    # Parent run
    context = retrieve_context(query)           # Child run
    response = generate_answer(query, context)  # Child run
    return response

@traceable(run_type="retriever")
def retrieve_context(query: str) -> list:
    return vector_store.search(query)  # assumes an existing vector_store

@traceable(run_type="llm")
def generate_answer(query: str, context: list) -> str:
    return llm.invoke(f"Context: {context}\n\nQuestion: {query}")  # assumes an existing llm
```
os.environ["LANGSMITH_PROJECT"] = "my-project"
# Or per-function
@traceable(project_name="my-project")
def my_function():
passfrom langsmith import Client
Query runs and attach feedback with the `Client`:

```python
from langsmith import Client

client = Client()

# List runs
runs = list(client.list_runs(
    project_name="my-project",
    filter='eq(status, "success")',
    limit=100
))

# Get run details
run = client.read_run(run_id="...")

# Create feedback
client.create_feedback(
    run_id="...",
    key="correctness",
    score=0.9,
    comment="Good answer"
)
```
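Listed runs can be aggregated client-side for quick health checks. A sketch assuming each run exposes a `status` field; the project name and limit are illustrative:

```python
from langsmith import Client

client = Client()

runs = list(client.list_runs(project_name="my-project", limit=500))
errored = [r for r in runs if r.status == "error"]
print(f"{len(errored)}/{len(runs)} runs errored")
```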
Create evaluation datasets:

```python
from langsmith import Client

client = Client()

# Create dataset
dataset = client.create_dataset("qa-test-set", description="QA evaluation")

# Add examples
client.create_examples(
    inputs=[
        {"question": "What is Python?"},
        {"question": "What is ML?"}
    ],
    outputs=[
        {"answer": "A programming language"},
        {"answer": "Machine learning"}
    ],
    dataset_id=dataset.id
)
```
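Datasets can also be seeded from existing traces. A sketch that copies inputs and outputs from recent successful runs into a new dataset; the project and dataset names are illustrative:

```python
from langsmith import Client

client = Client()

dataset = client.create_dataset("prod-samples", description="Sampled from production traces")

runs = list(client.list_runs(
    project_name="my-project",
    filter='eq(status, "success")',
    limit=50
))

client.create_examples(
    inputs=[r.inputs for r in runs],
    outputs=[r.outputs for r in runs],
    dataset_id=dataset.id
)
```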
Run an evaluation against a dataset with custom evaluators:

```python
from langsmith import evaluate

def my_model(inputs: dict) -> dict:
    # Your model logic
    return {"answer": generate_answer(inputs["question"])}

def correctness_evaluator(run, example):
    prediction = run.outputs["answer"]
    reference = example.outputs["answer"]
    score = 1.0 if reference.lower() in prediction.lower() else 0.0
    return {"key": "correctness", "score": score}

results = evaluate(
    my_model,
    data="qa-test-set",
    evaluators=[correctness_evaluator],
    experiment_prefix="v1"
)
print(f"Average score: {results.aggregate_metrics['correctness']}")
```
Or use prebuilt LangChain evaluators:

```python
from langsmith.evaluation import LangChainStringEvaluator

# Use LangChain evaluators
results = evaluate(
    my_model,
    data="qa-test-set",
    evaluators=[
        LangChainStringEvaluator("qa"),
        LangChainStringEvaluator("cot_qa")
    ]
)
```
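Experiment-level metrics can be computed over the whole run set, assuming `evaluate` supports the `summary_evaluators` parameter; the `pass_rate` metric here is illustrative:

```python
from langsmith import evaluate

def pass_rate(runs, examples):
    # Fraction of runs that produced a non-empty answer
    scored = [1.0 if (r.outputs or {}).get("answer") else 0.0 for r in runs]
    return {"key": "pass_rate", "score": sum(scored) / len(scored)}

results = evaluate(
    my_model,
    data="qa-test-set",
    evaluators=[correctness_evaluator],
    summary_evaluators=[pass_rate],
)
```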
Set project, tags, and metadata for a whole code path with `tracing_context`:

```python
from langsmith import tracing_context

with tracing_context(
    project_name="experiment-1",
    tags=["production", "v2"],
    metadata={"version": "2.0"}
):
    # All traceable calls inherit context
    result = my_function()
```
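The same context manager can switch tracing off for a code path, assuming the `enabled` flag; useful when iterating locally:

```python
from langsmith import tracing_context

with tracing_context(enabled=False):
    # Nothing in this block is sent to LangSmith
    result = my_function()
```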
Create a run manually with the `trace` context manager:

```python
from langsmith import trace

with trace(
    name="custom_operation",
    run_type="tool",
    inputs={"query": "test"}
) as run:
    result = do_something()
    run.end(outputs={"result": result})
```
if "password" in inputs:
inputs["password"] = "***"
return inputs
@traceable(process_inputs=sanitize_inputs)
def login(username: str, password: str):
return authenticate(username, password)import os
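Outputs can be scrubbed the same way via `process_outputs`. A sketch assuming non-dict return values are recorded under an `"output"` key; the truncation length and the `llm_summarize` helper are hypothetical:

```python
from langsmith import traceable

def truncate_outputs(outputs: dict) -> dict:
    text = outputs.get("output", "")
    if isinstance(text, str) and len(text) > 1000:
        outputs["output"] = text[:1000] + "...[truncated]"
    return outputs

@traceable(process_outputs=truncate_outputs)
def summarize(document: str) -> str:
    return llm_summarize(document)  # hypothetical summarization helper
```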
os.environ["LANGSMITH_TRACING_SAMPLING_RATE"] = "0.1" # 10% samplingfrom langchain_openai import ChatOpenAI
LangChain components are traced automatically when `LANGSMITH_TRACING=true`:

```python
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Tracing enabled automatically with LANGSMITH_TRACING=true
llm = ChatOpenAI(model="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    ("user", "{input}")
])
chain = prompt | llm

# All chain runs traced automatically
response = chain.invoke({"input": "Hello!"})
```
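Per-invocation run names, tags, and metadata go through the standard LangChain `config` argument, reusing the chain above; the values are illustrative:

```python
# Attach a run name, tags, and metadata to a single invocation
response = chain.invoke(
    {"input": "Hello!"},
    config={
        "run_name": "greeting",
        "tags": ["demo"],
        "metadata": {"user_id": "123"},
    },
)
```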
Pull managed prompts from the LangSmith prompt hub:

```python
from langsmith import Client

client = Client()

# Pull prompt from hub
prompt = client.pull_prompt("my-org/qa-prompt")

# Use in application
result = prompt.invoke({"question": "What is AI?"})
```
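Prompts can be published in the other direction as well. A sketch assuming the client's `push_prompt` method; the prompt name and content are illustrative:

```python
from langchain_core.prompts import ChatPromptTemplate
from langsmith import Client

client = Client()

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer concisely."),
    ("user", "{question}"),
])

# Publish (or update) the prompt in your workspace
client.push_prompt("qa-prompt", object=qa_prompt)
```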
Use `AsyncClient` in async applications:

```python
from langsmith import AsyncClient

async def main():
    client = AsyncClient()
    runs = []
    async for run in client.list_runs(project_name="my-project"):
        runs.append(run)
    return runs
```
Record user feedback against traced runs:

```python
from langsmith import Client

client = Client()

# Collect user feedback
def record_feedback(run_id: str, user_rating: int, comment: str = None):
    client.create_feedback(
        run_id=run_id,
        key="user_rating",
        score=user_rating / 5.0,  # Normalize to 0-1
        comment=comment
    )

# In your application
record_feedback(run_id="...", user_rating=4, comment="Helpful response")
```
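Attaching feedback requires the run ID. One common pattern is to pre-generate it and pass it through `langsmith_extra`, reusing the earlier `generate_response` and `record_feedback`:

```python
import uuid

run_id = str(uuid.uuid4())

# Force the traced run to use a known ID
answer = generate_response(
    "What is machine learning?",
    langsmith_extra={"run_id": run_id},
)

# Later, when the user rates the response
record_feedback(run_id=run_id, user_rating=5, comment="Clear explanation")
```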
Write unit-style tests that log results to LangSmith:

```python
from langsmith import test

@test
def test_qa_accuracy():
    result = my_qa_function("What is Python?")
    assert "programming" in result.lower()
```
Gate CI on evaluation results:

```python
from langsmith import evaluate

def run_evaluation():
    results = evaluate(
        my_model,
        data="regression-test-set",
        evaluators=[accuracy_evaluator]
    )
    # Fail CI if accuracy drops
    assert results.aggregate_metrics["accuracy"] >= 0.9, \
        f"Accuracy {results.aggregate_metrics['accuracy']} below threshold"
```
If traces are not appearing, confirm tracing is enabled and the API key is valid:

```python
import os

# Ensure tracing is enabled
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = "your-key"

# Verify connection
from langsmith import Client
client = Client()
print(list(client.list_projects()))  # Should list your projects
```
To reduce tracing overhead in production, keep background batching on (the default) or sample traces:

```python
import os
from langsmith import Client

# Enable background batching (default)
client = Client(auto_batch_tracing=True)

# Or use sampling
os.environ["LANGSMITH_TRACING_SAMPLING_RATE"] = "0.1"
```
To keep traced payloads small, drop large fields from inputs:

```python
from langsmith import traceable

# Hide sensitive/large fields
@traceable(
    process_inputs=lambda x: {k: v for k, v in x.items() if k != "large_field"}
)
def my_function(data):
    pass
```