Loading...
Loading...
LLM and ML model deployment for inference. Use when serving models in production, building AI APIs, or optimizing inference. Covers vLLM (LLM serving), TensorRT-LLM (GPU optimization), Ollama (local), BentoML (ML deployment), Triton (multi-model), LangChain (orchestration), LlamaIndex (RAG), and streaming patterns.
npx skill4agent add ancoleman/ai-design-components model-serving

Self-hosted LLM deployment needed?
├─ Yes, need maximum throughput → vLLM
├─ Yes, need absolute max GPU efficiency → TensorRT-LLM
├─ Yes, local development only → Ollama
└─ No, use managed API (OpenAI, Anthropic) → No serving layer needed

# Install
pip install vllm
# Serve a model (OpenAI-compatible API)
vllm serve meta-llama/Llama-3.1-8B-Instruct \
--dtype auto \
--max-model-len 4096 \
--gpu-memory-utilization 0.9 \
    --port 8000

# Key flags: --dtype, --max-model-len, --gpu-memory-utilization, --tensor-parallel-size

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from openai import OpenAI
import json
app = FastAPI()
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
@app.post("/chat/stream")
async def chat_stream(message: str):
async def generate():
stream = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
messages=[{"role": "user", "content": message}],
stream=True,
max_tokens=512
)
for chunk in stream:
if chunk.choices[0].delta.content:
token = chunk.choices[0].delta.content
yield f"data: {json.dumps({'token': token})}\n\n"
yield f"data: {json.dumps({'done': True})}\n\n"
return StreamingResponse(
generate(),
media_type="text/event-stream",
headers={"Cache-Control": "no-cache"}
)// Integration with ai-chat skill
// POST the user message and append streamed SSE tokens to the UI state.
const sendMessage = async (message: string) => {
  const response = await fetch('/chat/stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ message })
  })
  const reader = response.body!.getReader()
  const decoder = new TextDecoder()
  // BUG FIX: an SSE event (or a multi-byte UTF-8 character) can be split
  // across reads. Carry a buffer between reads and decode with
  // { stream: true }; the original dropped any partial event at a chunk
  // boundary and could corrupt non-ASCII tokens.
  let buffer = ''
  while (true) {
    const { done, value } = await reader.read()
    if (done) break
    buffer += decoder.decode(value, { stream: true })
    const events = buffer.split('\n\n')
    // The last element is a (possibly empty) partial event — keep it.
    buffer = events.pop() ?? ''
    for (const event of events) {
      if (event.startsWith('data: ')) {
        const data = JSON.parse(event.slice(6))
        if (data.token) {
          setResponse(prev => prev + data.token)
        }
      }
    }
  }
}

import bentoml
from bentoml.io import JSON
import numpy as np

@bentoml.service(
    resources={"cpu": "2", "memory": "4Gi"},
    traffic={"timeout": 10}
)
class IrisClassifier:
    """Batched sklearn iris classifier served with BentoML."""

    model_ref = bentoml.models.get("iris_classifier:latest")

    def __init__(self):
        self.model = bentoml.sklearn.load_model(self.model_ref)

    @bentoml.api(batchable=True, max_batch_size=32)
    def classify(self, features: list[dict]) -> list[str]:
        """Return the species name for each feature dict in the batch.

        BUG FIX: the original indexed the label list with the whole
        prediction array (``[...][predictions]``), which raises TypeError;
        each prediction must be mapped to its label individually.
        """
        labels = ['setosa', 'versicolor', 'virginica']
        X = np.array([[f['sepal_length'], f['sepal_width'],
                       f['petal_length'], f['petal_width']] for f in features])
        predictions = self.model.predict(X)
        return [labels[int(p)] for p in predictions]

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Qdrant
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load and chunk documents (512-char chunks with 50-char overlap keeps
# sentence context across chunk boundaries).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Create vector store backed by a local Qdrant instance
embeddings = OpenAIEmbeddings()
vectorstore = Qdrant.from_documents(
    chunks,
    embeddings,
    url="http://localhost:6333",
    collection_name="docs"
)

# Create retrieval chain: top-3 chunks are stuffed into the LLM prompt
llm = ChatOpenAI(model="gpt-4o")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True
)

# Query — FIX: Chain.__call__ is deprecated since LangChain 0.1; use .invoke()
result = qa_chain.invoke({"query": "What is PagedAttention?"})

# Rule of thumb: GPU Memory (GB) = Model Parameters (B) × Precision (bytes) × 1.2

# Enable quantization (AWQ for 4-bit)
vllm serve TheBloke/Llama-3.1-8B-AWQ \
    --quantization awq \
    --gpu-memory-utilization 0.9

# Multi-GPU deployment (tensor parallelism)
vllm serve meta-llama/Llama-3.1-70B-Instruct \
    --tensor-parallel-size 4 \
    --gpu-memory-utilization 0.9

@bentoml.api(
    batchable=True,
    max_batch_size=32,
    max_latency_ms=1000  # Wait max 1s to fill batch
)
def predict(self, inputs: list[np.ndarray]) -> list[float]:
    # BentoML automatically batches requests
    return self.model.predict(np.array(inputs))

# Kubernetes example: examples/k8s-vllm-deployment/ (requests nvidia.com/gpu: 1, /health probe)

services:
  - name: vllm-service
    url: http://vllm-llama-8b:8000
plugins:
  - name: rate-limiting
    config:
      minute: 60  # 60 requests per minute per API key
  - name: key-auth
  - name: prometheus

from prometheus_client import Counter, Histogram
import time

# Request-level metrics exposed to Prometheus scrapes.
requests_total = Counter('llm_requests_total', 'Total requests')
tokens_generated = Counter('llm_tokens_generated', 'Total tokens')
request_duration = Histogram('llm_request_duration_seconds', 'Request duration')

@app.post("/chat")
async def chat(request):
    """Serve one chat request while recording Prometheus metrics.

    BUG FIX: ``time`` was used without being imported.
    """
    requests_total.inc()
    start = time.time()
    response = await generate(request)
    # assumes the response object exposes a .tokens sequence — TODO confirm
    tokens_generated.inc(len(response.tokens))
    request_duration.observe(time.time() - start)
    return response

# Architecture (ai-chat integration):
# Frontend (React) → API Gateway → vLLM Server → GPU Inference
#        ↑                                                   ↓
#        └─────────── SSE Stream (tokens) ───────────────────┘
# See references/streaming-sse.md

# RAG flow:
# User Query → LangChain
#   ├─> Vector DB (Qdrant) for retrieval
#   ├─> Combine context + query
#   └─> LLM (vLLM) for generation
# See references/langchain-orchestration.md and examples/langchain-rag-qdrant/

# Async batch flow:
# Client → API → Message Queue (Celery) → Workers (vLLM) → Results DB

# Benchmark (scripts/benchmark_inference.py):
python scripts/benchmark_inference.py \
    --endpoint http://localhost:8000/v1/chat/completions \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --concurrency 32 \
    --requests 1000

# References: references/vllm.md, references/tgi.md, references/bentoml.md,
#   references/langchain-orchestration.md, references/inference-optimization.md
# Examples: examples/vllm-serving/, examples/ollama-local/, examples/langchain-agents/
# Scripts: scripts/benchmark_inference.py, scripts/validate_model_config.py

# Before (OpenAI)
# Migration: point the existing OpenAI SDK at a self-hosted vLLM server.
from openai import OpenAI
client = OpenAI(api_key="sk-...")
# After (vLLM) — only base_url/api_key change; call sites stay identical.
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="not-needed"  # vLLM ignores the key, but the client requires one
)
# Same API calls work!
response = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
messages=[{"role": "user", "content": "Hello"}]
)

MODEL_ROUTING = {
    "small": "meta-llama/Llama-3.1-8B-Instruct",   # Fast, cheap
    "large": "meta-llama/Llama-3.1-70B-Instruct",  # Accurate, expensive
    "code": "codellama/CodeLlama-34b-Instruct"     # Code-specific
}

@app.post("/chat")
async def chat(message: str, task: str = "small"):
    model = MODEL_ROUTING[task]
    # Route to appropriate vLLM instance

import tiktoken
def estimate_cost(text: str, model: str, price_per_1k: float):
    """Estimate the USD cost of sending *text* to *model*.

    ``price_per_1k`` is the provider's price in USD per 1000 tokens.
    """
    enc = tiktoken.encoding_for_model(model)
    n_tokens = len(enc.encode(text))
    return (n_tokens / 1000) * price_per_1k

# Compare costs
openai_cost = estimate_cost(text, "gpt-4o", 0.005)  # $5 per 1M tokens
self_hosted_cost = 0  # Fixed GPU cost, unlimited tokens

# Key tuning flags: --max-model-len, --gpu-memory-utilization, --quantization awq
# More: scripts/benchmark_inference.py, examples/ollama-local/, examples/vllm-serving/,
#   examples/langchain-rag-qdrant/, examples/k8s-vllm-deployment/