Alert Management

Overview

Design and implement sophisticated alert management systems with PagerDuty integration, escalation policies, alert routing, and incident coordination.

When to Use

  • Setting up alert routing
  • Managing on-call schedules
  • Coordinating incident response
  • Creating escalation policies
  • Integrating alerting systems

Instructions

1. PagerDuty Client Integration

javascript
// pagerduty-client.js
const axios = require('axios');

class PagerDutyClient {
  constructor(apiToken) {
    this.apiToken = apiToken;
    this.baseUrl = 'https://api.pagerduty.com';
    this.eventUrl = 'https://events.pagerduty.com/v2/enqueue';

    this.client = axios.create({
      baseURL: this.baseUrl,
      headers: {
        'Authorization': `Token token=${apiToken}`,
        'Accept': 'application/vnd.pagerduty+json;version=2'
      }
    });
  }

  async triggerEvent(config) {
    const event = {
      routing_key: config.routingKey,
      event_action: config.eventAction || 'trigger',
      dedup_key: config.dedupKey || `event-${Date.now()}`,
      payload: {
        summary: config.summary,
        timestamp: new Date().toISOString(),
        severity: config.severity || 'error',
        source: config.source || 'Monitoring System',
        component: config.component,
        custom_details: config.customDetails || {}
      }
    };

    try {
      const response = await axios.post(this.eventUrl, event);
      return response.data;
    } catch (error) {
      console.error('Failed to trigger PagerDuty event:', error);
      throw error;
    }
  }

  async resolveEvent(dedupKey, routingKey = process.env.PAGERDUTY_ROUTING_KEY) {
    const event = {
      routing_key: routingKey,
      event_action: 'resolve',
      dedup_key: dedupKey
    };

    try {
      return await axios.post(this.eventUrl, event);
    } catch (error) {
      console.error('Failed to resolve event:', error);
      throw error;
    }
  }

  async getServices() {
    const response = await this.client.get('/services');
    return response.data.services;
  }

  async getEscalationPolicies() {
    const response = await this.client.get('/escalation_policies');
    return response.data.escalation_policies;
  }

  async createIncident(config) {
    const incident = {
      type: 'incident',
      title: config.title,
      service: {
        id: config.serviceId,
        type: 'service_reference'
      },
      escalation_policy: {
        id: config.escalationPolicyId,
        type: 'escalation_policy_reference'
      },
      body: {
        type: 'incident_body',
        details: config.details || ''
      }
    };

    try {
      const response = await this.client.post('/incidents', incident, {
        headers: { 'From': process.env.PAGERDUTY_EMAIL }
      });
      return response.data.incident;
    } catch (error) {
      console.error('Failed to create incident:', error);
      throw error;
    }
  }

  async acknowledgeIncident(incidentId) {
    try {
      const response = await this.client.put(
        `/incidents/${incidentId}`,
        {
          incidents: [{
            id: incidentId,
            type: 'incident_reference',
            status: 'acknowledged'
          }]
        },
        { headers: { 'From': process.env.PAGERDUTY_EMAIL } }
      );
      return response.data.incidents[0];
    } catch (error) {
      console.error('Failed to acknowledge:', error);
      throw error;
    }
  }

  async resolveIncident(incidentId) {
    try {
      const response = await this.client.put(
        `/incidents/${incidentId}`,
        {
          incidents: [{
            id: incidentId,
            type: 'incident_reference',
            status: 'resolved'
          }]
        },
        { headers: { 'From': process.env.PAGERDUTY_EMAIL } }
      );
      return response.data.incidents[0];
    } catch (error) {
      console.error('Failed to resolve:', error);
      throw error;
    }
  }
}

module.exports = PagerDutyClient;
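
A minimal usage sketch for the client above. The environment variable names follow the code; the summary, component, and dedup key are illustrative placeholders:

javascript
// client-usage.js (illustrative sketch)
const PagerDutyClient = require('./pagerduty-client');

const pd = new PagerDutyClient(process.env.PAGERDUTY_API_TOKEN);

async function main() {
  // Trigger an event through the Events API v2
  await pd.triggerEvent({
    routingKey: process.env.PAGERDUTY_ROUTING_KEY,
    summary: 'High error rate on checkout API',
    severity: 'critical',
    component: 'checkout-api',
    dedupKey: 'checkout-error-rate'
  });

  // Later, resolve it using the same dedup key
  await pd.resolveEvent('checkout-error-rate');
}

main().catch(console.error);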

2. Alertmanager Configuration

yaml
# /etc/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: '${SLACK_WEBHOOK_URL}'

templates:
  - '/etc/alertmanager/templates/*.tmpl'

route:
  receiver: 'default'
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 4h

  routes:
    - match:
        severity: critical
      receiver: pagerduty
      continue: true
      group_wait: 0s

    - match:
        severity: warning
      receiver: slack

    - match:
        service: payment-service
      receiver: payment-team
      group_wait: 30s

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: '${PAGERDUTY_SERVICE_KEY}'
        description: '{{ .GroupLabels.alertname }}'

  - name: 'slack'
    slack_configs:
      - channel: '#alerts'
        title: 'Warning: {{ .GroupLabels.alertname }}'

  - name: 'payment-team'
    pagerduty_configs:
      - service_key: '${PAYMENT_PAGERDUTY_KEY}'
    slack_configs:
      - channel: '#payment-alerts'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'service']
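
To exercise the routing tree without waiting for Prometheus to fire a rule, a synthetic alert can be posted to Alertmanager's v2 API. A sketch, assuming Alertmanager is reachable at localhost:9093 as in the Docker Compose stack in step 5 (the label values are illustrative):

javascript
// send-test-alert.js (sketch)
const axios = require('axios');

async function sendTestAlert() {
  // POST /api/v2/alerts takes an array of alerts; labels drive the routing tree
  await axios.post('http://localhost:9093/api/v2/alerts', [{
    labels: {
      alertname: 'RoutingSmokeTest',
      severity: 'critical',   // matches the pagerduty route above
      service: 'payment-service',
      cluster: 'dev'
    },
    annotations: { summary: 'Synthetic alert for routing verification' },
    startsAt: new Date().toISOString()
  }]);
  console.log('Test alert sent');
}

sendTestAlert().catch(console.error);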

3. Alert Handler Middleware

javascript
// alert-handler.js
const PagerDutyClient = require('./pagerduty-client');

const pdClient = new PagerDutyClient(process.env.PAGERDUTY_API_TOKEN);

class AlertHandler {
  constructor() {
    this.alertCache = new Map();
    this.deduplicationWindow = 300000; // 5 minutes
  }

  shouldSendAlert(dedupKey) {
    const cacheEntry = this.alertCache.get(dedupKey);

    if (!cacheEntry) return true;

    const timeSinceLastAlert = Date.now() - cacheEntry.timestamp;
    return timeSinceLastAlert >= this.deduplicationWindow;
  }

  recordAlert(dedupKey) {
    this.alertCache.set(dedupKey, { timestamp: Date.now() });
  }

  determineSeverity(value, thresholds) {
    if (value >= thresholds.critical) return 'critical';
    if (value >= thresholds.warning) return 'warning';
    return 'info';
  }

  async sendAlert(config) {
    // Default to a stable key per alert name so the deduplication window can take effect
    const dedupKey = config.dedupKey || `alert-${config.alertName}`;

    try {
      if (!this.shouldSendAlert(dedupKey)) {
        console.log('Alert recently sent, skipping');
        return;
      }

      const event = {
        routingKey: config.routingKey,
        eventAction: config.eventAction || 'trigger',
        dedupKey: dedupKey,
        summary: config.summary,
        severity: config.severity,
        source: config.source || 'Monitoring System',
        component: config.component,
        customDetails: {
          ...config.customDetails,
          alertName: config.alertName,
          timestamp: new Date().toISOString()
        }
      };

      const result = await pdClient.triggerEvent(event);
      this.recordAlert(dedupKey);

      console.log('Alert sent', {
        alertName: config.alertName,
        severity: config.severity
      });

      return result;
    } catch (error) {
      console.error('Failed to send alert:', error);
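      // Fall back to Slack so the alert is not silently lost when PagerDuty is unreachable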
      await this.sendSlackAlert(config);
    }
  }

  async sendSlackAlert(config) {
    const axios = require('axios');
    const webhookUrl = process.env.SLACK_WEBHOOK_URL;

    const message = {
      color: config.severity === 'critical' ? 'danger' : 'warning',
      title: config.summary,
      text: config.customDetails?.description || '',
      fields: [
        { title: 'Severity', value: config.severity, short: true },
        { title: 'Component', value: config.component, short: true }
      ]
    };

    try {
      await axios.post(webhookUrl, { attachments: [message] });
    } catch (error) {
      console.error('Failed to send Slack alert:', error);
    }
  }

  async resolveAlert(dedupKey) {
    try {
      await pdClient.resolveEvent(dedupKey);
      console.log('Alert resolved');
    } catch (error) {
      console.error('Failed to resolve alert:', error);
    }
  }
}

module.exports = new AlertHandler();
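
The Alertmanager configuration in step 2 notifies PagerDuty and Slack directly. If notifications should instead flow through this handler (for example, via the alert-handler service in the Docker Compose stack in step 5), Alertmanager can call it through a webhook_config receiver. A minimal Express sketch; the /alerts path and port are assumptions, and the request body follows Alertmanager's documented webhook payload:

javascript
// webhook-server.js (sketch; pair with a webhook_config receiver in alertmanager.yml)
const express = require('express');
const alertHandler = require('./alert-handler');

const app = express();
app.use(express.json());

// Alertmanager POSTs { status, alerts: [{ status, labels, annotations, ... }] }
app.post('/alerts', async (req, res) => {
  for (const alert of req.body.alerts || []) {
    const dedupKey = `${alert.labels.alertname}-${alert.labels.service || 'default'}`;

    if (alert.status === 'resolved') {
      await alertHandler.resolveAlert(dedupKey);
    } else {
      await alertHandler.sendAlert({
        routingKey: process.env.PAGERDUTY_ROUTING_KEY,
        alertName: alert.labels.alertname,
        summary: alert.annotations?.summary || alert.labels.alertname,
        severity: alert.labels.severity || 'warning',
        component: alert.labels.service,
        dedupKey,
        customDetails: alert.labels
      });
    }
  }
  res.status(200).end();
});

app.listen(3000, () => console.log('Alert webhook listening on :3000'));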

4. Alert Routing Engine

javascript
// alert-router.js
class AlertRouter {
  constructor() {
    this.routes = [];
  }

  addRoute(rule) {
    this.routes.push({
      priority: rule.priority || 0,
      condition: rule.condition,
      handler: rule.handler,
      escalation: rule.escalation
    });
    this.routes.sort((a, b) => b.priority - a.priority);
  }

  async route(alert) {
    for (const route of this.routes) {
      if (route.condition(alert)) {
        return await route.handler(alert, route.escalation);
      }
    }
    return this.defaultHandler(alert);
  }

  async defaultHandler(alert) {
    console.log('Routing to default handler:', alert.name);
    return { routed: true, handler: 'default' };
  }
}

// Usage
const router = new AlertRouter();

router.addRoute({
  priority: 100,
  condition: (alert) => alert.severity === 'critical' && alert.component === 'database',
  handler: async (alert) => {
    console.log('Routing critical database alert to DBA team');
    return { team: 'dba', escalation: 'immediate' };
  }
});

router.addRoute({
  priority: 90,
  condition: (alert) => alert.component === 'payment-service',
  handler: async (alert) => {
    console.log('Routing to payment team');
    return { team: 'payment', escalation: 'payment-policy' };
  }
});

router.addRoute({
  priority: 10,
  condition: (alert) => alert.severity === 'warning',
  handler: async (alert) => {
    console.log('Routing warning to Slack');
    return { handler: 'slack-only' };
  }
});

module.exports = router;
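
Routing a concrete alert then resolves to the highest-priority matching rule; the alert object shape (name, severity, component) is whatever the ingestion layer produces:

javascript
// routing-example.js (illustrative)
const router = require('./alert-router');

(async () => {
  const decision = await router.route({
    name: 'PostgresReplicationLag',
    severity: 'critical',
    component: 'database'
  });
  console.log(decision); // => { team: 'dba', escalation: 'immediate' }
})();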

5. Docker Compose Alert Stack

yaml
# docker-compose.yml
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    environment:
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
      PAGERDUTY_SERVICE_KEY: ${PAGERDUTY_SERVICE_KEY}
    depends_on:
      - prometheus

  alert-handler:
    build: .
    environment:
      PAGERDUTY_API_TOKEN: ${PAGERDUTY_API_TOKEN}
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
    ports:
      - "3000:3000"
    depends_on:
      - alertmanager
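
The compose file mounts a prometheus.yml that is not shown above. A minimal sketch of the pieces Prometheus needs to evaluate rules and hand alerts to Alertmanager; the rule file path and the rules mount are assumptions:

yaml
# prometheus.yml (sketch)
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Evaluated alerts are pushed here; 'alertmanager' resolves on the compose network
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Alert rule files (path assumed; mount a rules directory alongside this file)
rule_files:
  - /etc/prometheus/rules/*.yml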

Best Practices

✅ DO

  • Set appropriate thresholds
  • Implement alert deduplication
  • Use clear alert names
  • Include runbook links
  • Configure escalation properly
  • Test alert rules
  • Monitor alert quality
  • Set repeat intervals
  • Track alert metrics
  • Document alert meanings

❌ DON'T

  • Alert on every anomaly
  • Ignore alert fatigue
  • Set thresholds arbitrarily
  • Skip runbooks
  • Alert without action
  • Disable alerts in production
  • Use vague alert names
  • Forget escalation policies
  • Re-alert too frequently

Alert Severity Levels

  • Critical: Immediate action required, customer impact
  • Warning: Investigation needed, potential issues
  • Info: Informational, no action required
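
When forwarding alerts to PagerDuty's Events API v2, these levels map onto the severities the API accepts (critical, error, warning, info). A sketch of one such mapping; the fallback to 'error' is a local convention, not a PagerDuty requirement:

javascript
// severity-map.js (sketch)
const PD_SEVERITY = {
  critical: 'critical', // immediate page, customer impact
  warning: 'warning',   // investigate, potential issue
  info: 'info'          // informational only
};

function toPagerDutySeverity(level) {
  return PD_SEVERITY[level] || 'error'; // unknown levels escalate to 'error'
}

module.exports = { toPagerDutySeverity };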

Key Metrics

  • Alert volume
  • Resolution time
  • False positive rate
  • Escalation frequency
  • MTTD (Mean Time to Detection)
  • MTTR (Mean Time to Resolution)
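
MTTD and MTTR can be computed directly from incident timestamps. A sketch, assuming each incident record carries occurredAt, detectedAt, and resolvedAt fields (the record shape and sample data are illustrative):

javascript
// incident-metrics.js (sketch)
function meanMinutes(incidents, fromField, toField) {
  const durations = incidents
    .filter((i) => i[fromField] && i[toField])
    .map((i) => (new Date(i[toField]) - new Date(i[fromField])) / 60000);
  if (durations.length === 0) return null;
  return durations.reduce((sum, d) => sum + d, 0) / durations.length;
}

const incidents = [
  { occurredAt: '2024-01-01T10:00:00Z', detectedAt: '2024-01-01T10:04:00Z', resolvedAt: '2024-01-01T10:34:00Z' },
  { occurredAt: '2024-01-02T02:00:00Z', detectedAt: '2024-01-02T02:10:00Z', resolvedAt: '2024-01-02T03:00:00Z' }
];

console.log('MTTD (min):', meanMinutes(incidents, 'occurredAt', 'detectedAt'));  // 7
console.log('MTTR (min):', meanMinutes(incidents, 'detectedAt', 'resolvedAt')); // 40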