Alert Management
Overview
Design and implement sophisticated alert management systems with PagerDuty integration, escalation policies, alert routing, and incident coordination.
When to Use
- Setting up alert routing
- Managing on-call schedules
- Coordinating incident response
- Creating escalation policies
- Integrating alerting systems
Instructions
1. PagerDuty Client Integration
```javascript
// pagerduty-client.js
const axios = require('axios');

class PagerDutyClient {
  constructor(apiToken) {
    this.apiToken = apiToken;
    this.baseUrl = 'https://api.pagerduty.com';                 // REST API (incidents, services)
    this.eventUrl = 'https://events.pagerduty.com/v2/enqueue';  // Events API v2 (alerts)
    this.client = axios.create({
      baseURL: this.baseUrl,
      headers: {
        'Authorization': `Token token=${apiToken}`,
        'Accept': 'application/vnd.pagerduty+json;version=2'
      }
    });
  }

  // Trigger an alert through the Events API v2 (authenticated by routing key, not the API token)
  async triggerEvent(config) {
    const event = {
      routing_key: config.routingKey,
      event_action: config.eventAction || 'trigger',
      dedup_key: config.dedupKey || `event-${Date.now()}`,
      payload: {
        summary: config.summary,
        timestamp: new Date().toISOString(),
        severity: config.severity || 'error',
        source: config.source || 'Monitoring System',
        component: config.component,
        custom_details: config.customDetails || {}
      }
    };

    try {
      const response = await axios.post(this.eventUrl, event);
      return response.data;
    } catch (error) {
      console.error('Failed to trigger PagerDuty event:', error);
      throw error;
    }
  }

  // Resolve a previously triggered event; the routing key is read from the environment
  async resolveEvent(dedupKey) {
    const event = {
      routing_key: process.env.PAGERDUTY_ROUTING_KEY,
      event_action: 'resolve',
      dedup_key: dedupKey
    };

    try {
      return await axios.post(this.eventUrl, event);
    } catch (error) {
      console.error('Failed to resolve event:', error);
      throw error;
    }
  }

  async getServices() {
    const response = await this.client.get('/services');
    return response.data.services;
  }

  async getEscalationPolicies() {
    const response = await this.client.get('/escalation_policies');
    return response.data.escalation_policies;
  }

  // Create a managed incident via the REST API; requires a 'From' header with a valid user email
  async createIncident(config) {
    const incident = {
      type: 'incident',
      title: config.title,
      service: {
        id: config.serviceId,
        type: 'service_reference'
      },
      escalation_policy: {
        id: config.escalationPolicyId,
        type: 'escalation_policy_reference'
      },
      body: {
        type: 'incident_body',
        details: config.details || ''
      }
    };

    try {
      // The REST API expects the payload wrapped in an "incident" key
      const response = await this.client.post('/incidents', { incident }, {
        headers: { 'From': process.env.PAGERDUTY_EMAIL }
      });
      return response.data.incident;
    } catch (error) {
      console.error('Failed to create incident:', error);
      throw error;
    }
  }

  async acknowledgeIncident(incidentId) {
    try {
      const response = await this.client.put(
        `/incidents/${incidentId}`,
        { incident: { type: 'incident_reference', status: 'acknowledged' } },
        { headers: { 'From': process.env.PAGERDUTY_EMAIL } }
      );
      return response.data.incident;
    } catch (error) {
      console.error('Failed to acknowledge:', error);
      throw error;
    }
  }

  async resolveIncident(incidentId) {
    try {
      const response = await this.client.put(
        `/incidents/${incidentId}`,
        { incident: { type: 'incident_reference', status: 'resolved' } },
        { headers: { 'From': process.env.PAGERDUTY_EMAIL } }
      );
      return response.data.incident;
    } catch (error) {
      console.error('Failed to resolve:', error);
      throw error;
    }
  }
}

module.exports = PagerDutyClient;
```
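A brief usage sketch of the client above. The routing key, service ID, and escalation policy ID are placeholders; substitute your own values.

```javascript
// usage-example.js -- illustrative only; IDs and keys below are placeholders
const PagerDutyClient = require('./pagerduty-client');

const pd = new PagerDutyClient(process.env.PAGERDUTY_API_TOKEN);

async function main() {
  // Trigger an alert through the Events API v2
  await pd.triggerEvent({
    routingKey: process.env.PAGERDUTY_ROUTING_KEY,
    summary: 'High error rate on checkout API',
    severity: 'critical',
    component: 'checkout-api',
    dedupKey: 'checkout-api-error-rate',
    customDetails: { errorRate: 0.12, threshold: 0.05 }
  });

  // Later, when the condition clears, resolve by dedup key
  await pd.resolveEvent('checkout-api-error-rate');

  // Or open a managed incident via the REST API
  const incident = await pd.createIncident({
    title: 'Checkout API error rate above threshold',
    serviceId: 'PXXXXXX',            // placeholder service ID
    escalationPolicyId: 'PYYYYYY',   // placeholder escalation policy ID
    details: 'Error rate exceeded 5% for 10 minutes'
  });
  console.log('Opened incident', incident.id);
}

main().catch(console.error);
```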
2. Alertmanager Configuration
```yaml
# /etc/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: '${SLACK_WEBHOOK_URL}'

templates:
  - '/etc/alertmanager/templates/*.tmpl'

route:
  receiver: 'default'
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 4h
  routes:
    - match:
        severity: critical
      receiver: pagerduty
      continue: true
      group_wait: 0s
    - match:
        severity: warning
      receiver: slack
    - match:
        service: payment-service
      receiver: payment-team
      group_wait: 30s

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: '${PAGERDUTY_SERVICE_KEY}'
        description: '{{ .GroupLabels.alertname }}'
  - name: 'slack'
    slack_configs:
      - channel: '#alerts'
        title: 'Warning: {{ .GroupLabels.alertname }}'
  - name: 'payment-team'
    pagerduty_configs:
      - service_key: '${PAYMENT_PAGERDUTY_KEY}'
    slack_configs:
      - channel: '#payment-alerts'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'service']
```
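To verify that the routing tree above sends critical alerts to PagerDuty and warnings to Slack, you can push a synthetic alert straight into Alertmanager. This is a minimal sketch, assuming Alertmanager is reachable on localhost:9093; the label values are placeholders.

```javascript
// test-routing.js -- pushes a synthetic alert into Alertmanager to exercise the routes above
const axios = require('axios');

async function fireTestAlert() {
  const alert = {
    labels: {
      alertname: 'TestCriticalAlert',   // grouping key used by group_by
      severity: 'critical',             // should match the pagerduty route
      service: 'checkout-api',
      cluster: 'test'
    },
    annotations: {
      summary: 'Synthetic alert to verify routing',
      runbook_url: 'https://example.com/runbooks/test' // placeholder
    },
    startsAt: new Date().toISOString()
  };

  // Alertmanager's v2 API accepts an array of alerts
  await axios.post('http://localhost:9093/api/v2/alerts', [alert]);
  console.log('Test alert sent; check the #alerts channel and PagerDuty');
}

fireTestAlert().catch(console.error);
```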
3. Alert Handler Middleware
```javascript
// alert-handler.js
const axios = require('axios');
const PagerDutyClient = require('./pagerduty-client');

const pdClient = new PagerDutyClient(process.env.PAGERDUTY_API_TOKEN);

class AlertHandler {
  constructor() {
    // In-memory cache used for deduplication (entries are never evicted; a TTL cache
    // would be more appropriate for long-running processes)
    this.alertCache = new Map();
    this.deduplicationWindow = 300000; // 5 minutes
  }

  // Returns true if no alert with this dedup key was sent within the window
  shouldSendAlert(dedupKey) {
    const cacheEntry = this.alertCache.get(dedupKey);
    if (!cacheEntry) return true;
    const timeSinceLastAlert = Date.now() - cacheEntry.timestamp;
    return timeSinceLastAlert >= this.deduplicationWindow;
  }

  recordAlert(dedupKey) {
    this.alertCache.set(dedupKey, { timestamp: Date.now() });
  }

  // Map a numeric measurement against warning/critical thresholds
  determineSeverity(value, thresholds) {
    if (value >= thresholds.critical) return 'critical';
    if (value >= thresholds.warning) return 'warning';
    return 'info';
  }

  async sendAlert(config) {
    // Pass a stable dedupKey to get real deduplication; the fallback below is
    // unique per call and will never be suppressed
    const dedupKey = config.dedupKey || `alert-${config.alertName}-${Date.now()}`;

    try {
      if (!this.shouldSendAlert(dedupKey)) {
        console.log('Alert recently sent, skipping');
        return;
      }

      const event = {
        routingKey: config.routingKey,
        eventAction: config.eventAction || 'trigger',
        dedupKey: dedupKey,
        summary: config.summary,
        severity: config.severity,
        source: config.source || 'Monitoring System',
        component: config.component,
        customDetails: {
          ...config.customDetails,
          alertName: config.alertName,
          timestamp: new Date().toISOString()
        }
      };

      const result = await pdClient.triggerEvent(event);
      this.recordAlert(dedupKey);
      console.log('Alert sent', {
        alertName: config.alertName,
        severity: config.severity
      });
      return result;
    } catch (error) {
      // Fall back to Slack if PagerDuty is unreachable
      console.error('Failed to send alert:', error);
      await this.sendSlackAlert(config);
    }
  }

  async sendSlackAlert(config) {
    const webhookUrl = process.env.SLACK_WEBHOOK_URL;
    const message = {
      color: config.severity === 'critical' ? 'danger' : 'warning',
      title: config.summary,
      text: config.customDetails?.description || '',
      fields: [
        { title: 'Severity', value: config.severity, short: true },
        { title: 'Component', value: config.component, short: true }
      ]
    };

    try {
      await axios.post(webhookUrl, { attachments: [message] });
    } catch (error) {
      console.error('Failed to send Slack alert:', error);
    }
  }

  async resolveAlert(dedupKey) {
    try {
      await pdClient.resolveEvent(dedupKey);
      console.log('Alert resolved');
    } catch (error) {
      console.error('Failed to resolve alert:', error);
    }
  }
}

module.exports = new AlertHandler();
```
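A short sketch of how the handler above might be driven from a periodic metrics check. The threshold values, dedup key, and metric are placeholders chosen for illustration.

```javascript
// check-error-rate.js -- illustrative wiring of the handler above; thresholds are placeholders
const alertHandler = require('./alert-handler');

async function checkErrorRate(currentRate) {
  const thresholds = { warning: 0.05, critical: 0.10 };
  const severity = alertHandler.determineSeverity(currentRate, thresholds);

  if (severity === 'info') {
    // Below the warning threshold: resolve any previously fired alert
    await alertHandler.resolveAlert('checkout-error-rate');
    return;
  }

  await alertHandler.sendAlert({
    routingKey: process.env.PAGERDUTY_ROUTING_KEY,
    alertName: 'CheckoutErrorRate',
    dedupKey: 'checkout-error-rate',   // stable key so repeats deduplicate
    summary: `Checkout error rate at ${(currentRate * 100).toFixed(1)}%`,
    severity,
    component: 'checkout-api',
    customDetails: { currentRate, thresholds }
  });
}

checkErrorRate(0.12).catch(console.error);
```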
4. Alert Routing Engine
```javascript
// alert-router.js
class AlertRouter {
  constructor() {
    this.routes = [];
  }

  // Register a routing rule; higher-priority rules are evaluated first
  addRoute(rule) {
    this.routes.push({
      priority: rule.priority || 0,
      condition: rule.condition,
      handler: rule.handler,
      escalation: rule.escalation
    });
    this.routes.sort((a, b) => b.priority - a.priority);
  }

  // Dispatch an alert to the first matching rule, or fall back to the default handler
  async route(alert) {
    for (const route of this.routes) {
      if (route.condition(alert)) {
        return await route.handler(alert, route.escalation);
      }
    }
    return this.defaultHandler(alert);
  }

  async defaultHandler(alert) {
    console.log('Routing to default handler:', alert.name);
    return { routed: true, handler: 'default' };
  }
}

// Usage
const router = new AlertRouter();

router.addRoute({
  priority: 100,
  condition: (alert) => alert.severity === 'critical' && alert.component === 'database',
  handler: async (alert) => {
    console.log('Routing critical database alert to DBA team');
    return { team: 'dba', escalation: 'immediate' };
  }
});

router.addRoute({
  priority: 90,
  condition: (alert) => alert.component === 'payment-service',
  handler: async (alert) => {
    console.log('Routing to payment team');
    return { team: 'payment', escalation: 'payment-policy' };
  }
});

router.addRoute({
  priority: 10,
  condition: (alert) => alert.severity === 'warning',
  handler: async (alert) => {
    console.log('Routing warning to Slack');
    return { handler: 'slack-only' };
  }
});

module.exports = router;
```
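Routing an incoming alert through the rules above then looks like this. The alert object shape is an assumption that simply matches the fields the route conditions inspect (name, severity, component).

```javascript
// route-alert.js -- feeding an alert through the router defined above
const router = require('./alert-router');

async function handleIncomingAlert() {
  // Shape matches what the route conditions above inspect
  const alert = {
    name: 'DatabaseConnectionPoolExhausted',
    severity: 'critical',
    component: 'database'
  };

  const decision = await router.route(alert);
  // Highest-priority matching rule wins, e.g. { team: 'dba', escalation: 'immediate' }
  console.log('Routing decision:', decision);
}

handleIncomingAlert().catch(console.error);
```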
5. Docker Compose Alert Stack
```yaml
# docker-compose.yml
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    environment:
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
      PAGERDUTY_SERVICE_KEY: ${PAGERDUTY_SERVICE_KEY}
    depends_on:
      - prometheus
  alert-handler:
    build: .
    environment:
      PAGERDUTY_API_TOKEN: ${PAGERDUTY_API_TOKEN}
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
    ports:
      - "3000:3000"
    depends_on:
      - alertmanager
```
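The alert-handler container exposes port 3000, but its receiving endpoint is not shown above. Below is a minimal sketch of such an entry point, assuming Express is a dependency of the service, that Alertmanager is given an additional webhook_configs receiver pointing at http://alert-handler:3000/alerts, and that PAGERDUTY_ROUTING_KEY is also supplied to the container.

```javascript
// server.js -- hypothetical entry point for the alert-handler service above
const express = require('express');
const alertHandler = require('./alert-handler');

const app = express();
app.use(express.json());

// Alertmanager webhook payloads arrive as { status, alerts: [...], ... }
app.post('/alerts', async (req, res) => {
  const { status, alerts = [] } = req.body;

  for (const alert of alerts) {
    // Use Alertmanager's fingerprint (when present) as a stable dedup key
    const dedupKey = alert.fingerprint || alert.labels.alertname;

    if (status === 'resolved' || alert.status === 'resolved') {
      await alertHandler.resolveAlert(dedupKey);
      continue;
    }

    await alertHandler.sendAlert({
      routingKey: process.env.PAGERDUTY_ROUTING_KEY, // assumed to be set on the container
      alertName: alert.labels.alertname,
      dedupKey,
      summary: alert.annotations.summary || alert.labels.alertname,
      severity: alert.labels.severity || 'warning',
      component: alert.labels.service,
      customDetails: alert.labels
    });
  }

  res.status(202).json({ received: alerts.length });
});

app.listen(3000, () => console.log('alert-handler listening on :3000'));
```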
Best Practices
✅ DO
- Set appropriate thresholds
- Implement alert deduplication
- Use clear alert names
- Include runbook links
- Configure escalation properly
- Test alert rules
- Monitor alert quality
- Set repeat intervals
- Track alert metrics
- Document alert meanings
❌ DON'T
- Alert on every anomaly
- Ignore alert fatigue
- Set thresholds arbitrarily
- Skip runbooks
- Alert without action
- Disable alerts in production
- Use vague alert names
- Forget escalation policies
- Re-alert too frequently
Alert Severity Levels
- Critical: Immediate action required, customer impact
- Warning: Investigation needed, potential issues
- Info: Informational, no action required (see the mapping sketch below)
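One way to keep these levels consistent across channels is a single mapping from internal severity to PagerDuty event severity and paging behaviour. The mapping below is a suggestion, not part of the configs above.

```javascript
// severity-map.js -- suggested mapping; adjust to your own escalation rules
const SEVERITY_POLICY = {
  critical: { pagerdutySeverity: 'critical', page: true,  slackChannel: '#alerts' },
  warning:  { pagerdutySeverity: 'warning',  page: false, slackChannel: '#alerts' },
  info:     { pagerdutySeverity: 'info',     page: false, slackChannel: null }
};

// Unknown severities fall back to the informational policy
function policyFor(severity) {
  return SEVERITY_POLICY[severity] || SEVERITY_POLICY.info;
}

module.exports = { policyFor };

// Example: policyFor('critical') => page on-call; policyFor('info') => log only
```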
Key Metrics
- Alert volume
- Resolution time
- False positive rate
- Escalation frequency
- MTTD (Mean Time to Detection)
- MTTR (Mean Time to Resolution), computed in the sketch below
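MTTD and MTTR can be derived from incident timestamps. A minimal sketch, assuming each incident record carries startedAt, detectedAt, and resolvedAt timestamps (the field names are placeholders, not the PagerDuty API schema):

```javascript
// alert-metrics.js -- computes MTTD/MTTR from incident records; field names are illustrative
function meanMinutes(durationsMs) {
  if (durationsMs.length === 0) return 0;
  const total = durationsMs.reduce((sum, ms) => sum + ms, 0);
  return total / durationsMs.length / 60000;
}

function computeAlertMetrics(incidents) {
  // MTTD: time from the problem starting until an alert detected it
  const detection = incidents.map(i => new Date(i.detectedAt) - new Date(i.startedAt));
  // MTTR: time from the problem starting until it was resolved
  const resolution = incidents.map(i => new Date(i.resolvedAt) - new Date(i.startedAt));

  return {
    incidentCount: incidents.length,
    mttdMinutes: meanMinutes(detection),
    mttrMinutes: meanMinutes(resolution)
  };
}

module.exports = { computeAlertMetrics };

// Example:
// computeAlertMetrics([{ startedAt: '2024-01-01T10:00:00Z',
//                        detectedAt: '2024-01-01T10:04:00Z',
//                        resolvedAt: '2024-01-01T10:34:00Z' }])
// => { incidentCount: 1, mttdMinutes: 4, mttrMinutes: 34 }
```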