docker-compose-production

Compare original and translation side by side

🇺🇸

Original

English
🇨🇳

Translation

Chinese

Docker Compose Production Deployment

Docker Compose 生产环境部署

Production-ready Docker Compose configurations with security, reliability, and scalability best practices.
具备安全性、可靠性和可扩展性最佳实践的生产就绪型Docker Compose配置。

Production-Ready Base Template

生产就绪基础模板

A comprehensive production template with essential configurations:
yaml
version: '3.8'

services:
  nginx:
    image: nginx:1.25-alpine
    container_name: production-nginx
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx-cache:/var/cache/nginx
      - nginx-logs:/var/log/nginx
    networks:
      - frontend
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M
        reservations:
          cpus: '0.5'
          memory: 256M

  api:
    image: mycompany/api:${API_VERSION:-latest}
    container_name: production-api
    restart: unless-stopped
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"), not "postgres" —
      # there is no service named "postgres" on these networks. "postgres@"
      # selects the database role; the password is supplied to the app via
      # the db_password secret, not embedded in the URL.
      DATABASE_URL: postgresql://postgres@database:5432/production_db
      REDIS_URL: redis://cache:6379
      LOG_LEVEL: ${LOG_LEVEL:-info}
      # Quoted so the value stays a string across YAML parsers.
      PORT: "3000"
    env_file:
      - .env.production
    secrets:
      - db_password
      - jwt_secret
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

  worker:
    image: mycompany/worker:${WORKER_VERSION:-latest}
    # No container_name here: a fixed container name conflicts with
    # "deploy.replicas: 3" — Compose must generate unique per-replica names,
    # and refuses to scale a service whose name is pinned.
    restart: unless-stopped
    networks:
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"); see the api
      # service for the same fix.
      DATABASE_URL: postgresql://postgres@database:5432/production_db
      REDIS_URL: redis://cache:6379
      QUEUE_NAME: ${QUEUE_NAME:-default}
    env_file:
      - .env.production
    secrets:
      - db_password
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M

  database:
    image: postgres:15-alpine
    container_name: production-db
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_DB: production_db
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      POSTGRES_INITDB_ARGS: "-E UTF8 --locale=en_US.UTF-8"
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
      - ./db/init:/docker-entrypoint-initdb.d:ro
      - postgres-logs:/var/log/postgresql
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d production_db"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    command:
      - "postgres"
      - "-c"
      - "max_connections=200"
      - "-c"
      - "shared_buffers=256MB"
      - "-c"
      - "effective_cache_size=1GB"
      - "-c"
      - "maintenance_work_mem=64MB"
      - "-c"
      - "checkpoint_completion_target=0.9"
      - "-c"
      - "wal_buffers=16MB"
      - "-c"
      - "default_statistics_target=100"
      - "-c"
      - "random_page_cost=1.1"
      - "-c"
      - "effective_io_concurrency=200"
      - "-c"
      - "work_mem=1MB"
      - "-c"
      - "min_wal_size=1GB"
      - "-c"
      - "max_wal_size=4GB"
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

  cache:
    image: redis:7-alpine
    container_name: production-cache
    restart: unless-stopped
    networks:
      - backend
    command: >
      redis-server
      --appendonly yes
      --appendfsync everysec
      --maxmemory 512mb
      --maxmemory-policy allkeys-lru
      --requirepass ${REDIS_PASSWORD}
    volumes:
      - redis-data:/data
    healthcheck:
      # The server requires auth (--requirepass above), so the probe must
      # authenticate or it only ever sees NOAUTH errors. PING is also
      # side-effect free, unlike the previous "incr ping" which mutated a
      # key on every probe. ${REDIS_PASSWORD} is interpolated by Compose at
      # config time, exactly as in "command" above.
      test: ["CMD-SHELL", "redis-cli --no-auth-warning -a '${REDIS_PASSWORD}' ping | grep -q PONG"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 20s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 768M
        reservations:
          cpus: '0.5'
          memory: 512M

  backup:
    image: prodrigestivill/postgres-backup-local:15-alpine
    container_name: production-backup
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_HOST: database
      POSTGRES_DB: production_db
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      SCHEDULE: "@daily"
      BACKUP_KEEP_DAYS: 7
      BACKUP_KEEP_WEEKS: 4
      BACKUP_KEEP_MONTHS: 6
      HEALTHCHECK_PORT: 8080
    secrets:
      - db_password
    volumes:
      - ./backups:/backups
    depends_on:
      database:
        condition: service_healthy

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/postgres
  redis-data:
    driver: local
  nginx-cache:
    driver: local
  nginx-logs:
    driver: local
  postgres-logs:
    driver: local

secrets:
  db_password:
    file: ./secrets/db_password.txt
  jwt_secret:
    file: ./secrets/jwt_secret.txt
包含核心配置的全面生产模板:
yaml
version: '3.8'

services:
  nginx:
    image: nginx:1.25-alpine
    container_name: production-nginx
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx-cache:/var/cache/nginx
      - nginx-logs:/var/log/nginx
    networks:
      - frontend
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M
        reservations:
          cpus: '0.5'
          memory: 256M

  api:
    image: mycompany/api:${API_VERSION:-latest}
    container_name: production-api
    restart: unless-stopped
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"), not "postgres" —
      # there is no service named "postgres" on these networks. "postgres@"
      # selects the database role; the password is supplied to the app via
      # the db_password secret, not embedded in the URL.
      DATABASE_URL: postgresql://postgres@database:5432/production_db
      REDIS_URL: redis://cache:6379
      LOG_LEVEL: ${LOG_LEVEL:-info}
      # Quoted so the value stays a string across YAML parsers.
      PORT: "3000"
    env_file:
      - .env.production
    secrets:
      - db_password
      - jwt_secret
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

  worker:
    image: mycompany/worker:${WORKER_VERSION:-latest}
    # No container_name here: a fixed container name conflicts with
    # "deploy.replicas: 3" — Compose must generate unique per-replica names,
    # and refuses to scale a service whose name is pinned.
    restart: unless-stopped
    networks:
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"); see the api
      # service for the same fix.
      DATABASE_URL: postgresql://postgres@database:5432/production_db
      REDIS_URL: redis://cache:6379
      QUEUE_NAME: ${QUEUE_NAME:-default}
    env_file:
      - .env.production
    secrets:
      - db_password
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M

  database:
    image: postgres:15-alpine
    container_name: production-db
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_DB: production_db
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      POSTGRES_INITDB_ARGS: "-E UTF8 --locale=en_US.UTF-8"
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
      - ./db/init:/docker-entrypoint-initdb.d:ro
      - postgres-logs:/var/log/postgresql
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d production_db"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    command:
      - "postgres"
      - "-c"
      - "max_connections=200"
      - "-c"
      - "shared_buffers=256MB"
      - "-c"
      - "effective_cache_size=1GB"
      - "-c"
      - "maintenance_work_mem=64MB"
      - "-c"
      - "checkpoint_completion_target=0.9"
      - "-c"
      - "wal_buffers=16MB"
      - "-c"
      - "default_statistics_target=100"
      - "-c"
      - "random_page_cost=1.1"
      - "-c"
      - "effective_io_concurrency=200"
      - "-c"
      - "work_mem=1MB"
      - "-c"
      - "min_wal_size=1GB"
      - "-c"
      - "max_wal_size=4GB"
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

  cache:
    image: redis:7-alpine
    container_name: production-cache
    restart: unless-stopped
    networks:
      - backend
    command: >
      redis-server
      --appendonly yes
      --appendfsync everysec
      --maxmemory 512mb
      --maxmemory-policy allkeys-lru
      --requirepass ${REDIS_PASSWORD}
    volumes:
      - redis-data:/data
    healthcheck:
      # The server requires auth (--requirepass above), so the probe must
      # authenticate or it only ever sees NOAUTH errors. PING is also
      # side-effect free, unlike the previous "incr ping" which mutated a
      # key on every probe. ${REDIS_PASSWORD} is interpolated by Compose at
      # config time, exactly as in "command" above.
      test: ["CMD-SHELL", "redis-cli --no-auth-warning -a '${REDIS_PASSWORD}' ping | grep -q PONG"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 20s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 768M
        reservations:
          cpus: '0.5'
          memory: 512M

  backup:
    image: prodrigestivill/postgres-backup-local:15-alpine
    container_name: production-backup
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_HOST: database
      POSTGRES_DB: production_db
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      SCHEDULE: "@daily"
      BACKUP_KEEP_DAYS: 7
      BACKUP_KEEP_WEEKS: 4
      BACKUP_KEEP_MONTHS: 6
      HEALTHCHECK_PORT: 8080
    secrets:
      - db_password
    volumes:
      - ./backups:/backups
    depends_on:
      database:
        condition: service_healthy

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/postgres
  redis-data:
    driver: local
  nginx-cache:
    driver: local
  nginx-logs:
    driver: local
  postgres-logs:
    driver: local

secrets:
  db_password:
    file: ./secrets/db_password.txt
  jwt_secret:
    file: ./secrets/jwt_secret.txt

Security Hardening

安全加固

Production security configurations:
yaml
version: '3.8'

services:
  web:
    image: nginx:1.25-alpine
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /var/cache/nginx
      - /var/run
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE
    security_opt:
      - no-new-privileges:true
      - seccomp:./security/seccomp-profile.json
    user: "nginx:nginx"
    networks:
      - frontend
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro

  api:
    image: mycompany/api:${VERSION}
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp
    cap_drop:
      - ALL
    security_opt:
      - no-new-privileges:true
      - seccomp:./security/seccomp-profile.json
    user: "1000:1000"
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
    env_file:
      - .env.production
    secrets:
      - source: db_password
        target: /run/secrets/db_password
        mode: 0400
      - source: api_key
        target: /run/secrets/api_key
        mode: 0400

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp
      - /run/postgresql
    cap_drop:
      - ALL
    cap_add:
      - CHOWN
      - DAC_OVERRIDE
      - FOWNER
      - SETGID
      - SETUID
    security_opt:
      - no-new-privileges:true
    user: "postgres:postgres"
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - source: db_password
        mode: 0400
    volumes:
      - postgres-data:/var/lib/postgresql/data

networks:
  frontend:
    driver: bridge
    driver_opts:
      com.docker.network.bridge.enable_icc: "false"
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
  api_key:
    file: ./secrets/api_key.txt
生产环境安全配置:
yaml
version: '3.8'

services:
  web:
    image: nginx:1.25-alpine
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /var/cache/nginx
      - /var/run
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE
    security_opt:
      - no-new-privileges:true
      - seccomp:./security/seccomp-profile.json
    user: "nginx:nginx"
    networks:
      - frontend
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro

  api:
    image: mycompany/api:${VERSION}
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp
    cap_drop:
      - ALL
    security_opt:
      - no-new-privileges:true
      - seccomp:./security/seccomp-profile.json
    user: "1000:1000"
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
    env_file:
      - .env.production
    secrets:
      - source: db_password
        target: /run/secrets/db_password
        mode: 0400
      - source: api_key
        target: /run/secrets/api_key
        mode: 0400

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp
      - /run/postgresql
    cap_drop:
      - ALL
    cap_add:
      - CHOWN
      - DAC_OVERRIDE
      - FOWNER
      - SETGID
      - SETUID
    security_opt:
      - no-new-privileges:true
    user: "postgres:postgres"
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - source: db_password
        mode: 0400
    volumes:
      - postgres-data:/var/lib/postgresql/data

networks:
  frontend:
    driver: bridge
    driver_opts:
      com.docker.network.bridge.enable_icc: "false"
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
  api_key:
    file: ./secrets/api_key.txt

Resource Limits and Reservations

资源限制与预留

Comprehensive resource management:
yaml
version: '3.8'

services:
  web:
    image: nginx:alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '0.50'
          memory: 256M
          pids: 100
        reservations:
          cpus: '0.25'
          memory: 128M
    ulimits:
      nofile:
        soft: 1024
        hard: 2048
      nproc:
        soft: 64
        hard: 128

  api:
    image: node:18-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
          pids: 200
        reservations:
          cpus: '1.0'
          memory: 1G
    ulimits:
      nofile:
        soft: 4096
        hard: 8192
      nproc:
        soft: 256
        hard: 512

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 4G
          pids: 500
        reservations:
          cpus: '2.0'
          memory: 2G
    ulimits:
      nofile:
        soft: 8192
        hard: 16384
    shm_size: '256mb'
    volumes:
      - postgres-data:/var/lib/postgresql/data

  cache:
    image: redis:7-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    sysctls:
      net.core.somaxconn: 1024
    volumes:
      - redis-data:/data

volumes:
  postgres-data:
  redis-data:
全面资源管理配置:
yaml
version: '3.8'

services:
  web:
    image: nginx:alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '0.50'
          memory: 256M
          pids: 100
        reservations:
          cpus: '0.25'
          memory: 128M
    ulimits:
      nofile:
        soft: 1024
        hard: 2048
      nproc:
        soft: 64
        hard: 128

  api:
    image: node:18-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
          pids: 200
        reservations:
          cpus: '1.0'
          memory: 1G
    ulimits:
      nofile:
        soft: 4096
        hard: 8192
      nproc:
        soft: 256
        hard: 512

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 4G
          pids: 500
        reservations:
          cpus: '2.0'
          memory: 2G
    ulimits:
      nofile:
        soft: 8192
        hard: 16384
    shm_size: '256mb'
    volumes:
      - postgres-data:/var/lib/postgresql/data

  cache:
    image: redis:7-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    sysctls:
      net.core.somaxconn: 1024
    volumes:
      - redis-data:/data

volumes:
  postgres-data:
  redis-data:

High Availability Configuration

高可用配置

Multiple replicas with load balancing:
yaml
version: '3.8'

services:
  loadbalancer:
    image: nginx:alpine
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx-lb.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    networks:
      - frontend
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 10s
      timeout: 5s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M

  api:
    image: mycompany/api:${VERSION}
    restart: unless-stopped
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"), not "postgres" —
      # no service named "postgres" exists on these networks. "postgres@"
      # selects the default database role.
      DATABASE_URL: postgresql://postgres@database:5432/app
      INSTANCE_ID: "{{.Task.Slot}}"
    deploy:
      replicas: 5
      update_config:
        parallelism: 2
        delay: 10s
        order: start-first
        failure_action: rollback
      rollback_config:
        parallelism: 2
        delay: 10s
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 4G

  database-replica:
    image: postgres:15-alpine
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      POSTGRES_PRIMARY_HOST: database
      POSTGRES_PRIMARY_PORT: 5432
    secrets:
      - db_password
    volumes:
      - postgres-replica-data:/var/lib/postgresql/data
      - ./db/replica-setup.sh:/docker-entrypoint-initdb.d/replica-setup.sh:ro
    depends_on:
      database:
        condition: service_healthy
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:
  postgres-replica-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
带负载均衡的多副本配置:
yaml
version: '3.8'

services:
  loadbalancer:
    image: nginx:alpine
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx-lb.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    networks:
      - frontend
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 10s
      timeout: 5s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M

  api:
    image: mycompany/api:${VERSION}
    restart: unless-stopped
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"), not "postgres" —
      # no service named "postgres" exists on these networks. "postgres@"
      # selects the default database role.
      DATABASE_URL: postgresql://postgres@database:5432/app
      INSTANCE_ID: "{{.Task.Slot}}"
    deploy:
      replicas: 5
      update_config:
        parallelism: 2
        delay: 10s
        order: start-first
        failure_action: rollback
      rollback_config:
        parallelism: 2
        delay: 10s
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 4G

  database-replica:
    image: postgres:15-alpine
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      POSTGRES_PRIMARY_HOST: database
      POSTGRES_PRIMARY_PORT: 5432
    secrets:
      - db_password
    volumes:
      - postgres-replica-data:/var/lib/postgresql/data
      - ./db/replica-setup.sh:/docker-entrypoint-initdb.d/replica-setup.sh:ro
    depends_on:
      database:
        condition: service_healthy
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:
  postgres-replica-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt

Monitoring and Observability

监控与可观测性

Production monitoring stack:
yaml
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/alerts:/etc/prometheus/alerts:ro
      - prometheus-data:/prometheus
    networks:
      - monitoring
    ports:
      - "9090:9090"
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 2G

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/grafana_password
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource
      GF_SERVER_ROOT_URL: https://monitoring.example.com
    secrets:
      - grafana_password
    volumes:
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    networks:
      - monitoring
      - frontend
    ports:
      - "3001:3000"
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 512M

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    command:
      - '--path.rootfs=/host'
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /:/host:ro,rslave
    networks:
      - monitoring
    ports:
      - "9100:9100"
    deploy:
      resources:
        limits:
          cpus: '0.2'
          memory: 128M

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    privileged: true
    devices:
      - /dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    networks:
      - monitoring
    ports:
      - "8080:8080"
    deploy:
      resources:
        limits:
          cpus: '0.3'
          memory: 256M

  loki:
    image: grafana/loki:latest
    container_name: loki
    restart: unless-stopped
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    networks:
      - monitoring
    ports:
      - "3100:3100"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G

  promtail:
    image: grafana/promtail:latest
    container_name: promtail
    restart: unless-stopped
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - ./monitoring/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    networks:
      - monitoring
    depends_on:
      - loki
    deploy:
      resources:
        limits:
          cpus: '0.2'
          memory: 256M

networks:
  monitoring:
    driver: bridge
  frontend:
    driver: bridge

volumes:
  prometheus-data:
  grafana-data:
  loki-data:

secrets:
  grafana_password:
    file: ./secrets/grafana_password.txt
生产环境监控栈:
yaml
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/alerts:/etc/prometheus/alerts:ro
      - prometheus-data:/prometheus
    networks:
      - monitoring
    ports:
      - "9090:9090"
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 2G

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/grafana_password
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource
      GF_SERVER_ROOT_URL: https://monitoring.example.com
    secrets:
      - grafana_password
    volumes:
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    networks:
      - monitoring
      - frontend
    ports:
      - "3001:3000"
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 512M

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    command:
      - '--path.rootfs=/host'
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /:/host:ro,rslave
    networks:
      - monitoring
    ports:
      - "9100:9100"
    deploy:
      resources:
        limits:
          cpus: '0.2'
          memory: 128M

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    privileged: true
    devices:
      - /dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    networks:
      - monitoring
    ports:
      - "8080:8080"
    deploy:
      resources:
        limits:
          cpus: '0.3'
          memory: 256M

  loki:
    image: grafana/loki:latest
    container_name: loki
    restart: unless-stopped
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    networks:
      - monitoring
    ports:
      - "3100:3100"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G

  promtail:
    image: grafana/promtail:2.9.8
    container_name: promtail
    restart: unless-stopped
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - ./monitoring/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    networks:
      - monitoring
    depends_on:
      - loki
    deploy:
      resources:
        limits:
          cpus: '0.2'
          memory: 256M

networks:
  monitoring:
    driver: bridge
  frontend:
    driver: bridge

volumes:
  prometheus-data:
  grafana-data:
  loki-data:

secrets:
  grafana_password:
    file: ./secrets/grafana_password.txt

Logging Configuration

日志配置

Centralized logging setup:
yaml
version: '3.8'

services:
  app:
    image: myapp:latest
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
        labels: "app,environment,version"
        tag: "{{.Name}}/{{.ID}}"
    labels:
      app: "myapp"
      environment: "production"
      version: "${VERSION}"

  nginx:
    image: nginx:alpine
    restart: unless-stopped
    logging:
      driver: "syslog"
      options:
        syslog-address: "tcp://logserver:514"
        tag: "nginx"
        syslog-format: "rfc5424micro"

  api:
    image: api:latest
    restart: unless-stopped
    logging:
      driver: "fluentd"
      options:
        fluentd-address: "localhost:24224"
        tag: "docker.{{.Name}}"
        fluentd-async: "true"
        fluentd-retry-wait: "1s"
        fluentd-max-retries: "30"

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "10"
        compress: "true"
    volumes:
      - postgres-data:/var/lib/postgresql/data

volumes:
  postgres-data:
集中式日志设置:
yaml
version: '3.8'

services:
  app:
    image: myapp:latest
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
        labels: "app,environment,version"
        tag: "{{.Name}}/{{.ID}}"
    labels:
      app: "myapp"
      environment: "production"
      version: "${VERSION}"

  nginx:
    image: nginx:alpine
    restart: unless-stopped
    logging:
      driver: "syslog"
      options:
        syslog-address: "tcp://logserver:514"
        tag: "nginx"
        syslog-format: "rfc5424micro"

  api:
    image: api:latest
    restart: unless-stopped
    logging:
      driver: "fluentd"
      options:
        fluentd-address: "localhost:24224"
        tag: "docker.{{.Name}}"
        fluentd-async: "true"
        fluentd-retry-wait: "1s"
        fluentd-max-retries: "30"

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "10"
        compress: "true"
    volumes:
      - postgres-data:/var/lib/postgresql/data

volumes:
  postgres-data:

Environment Configuration Management

环境配置管理

Multi-environment setup:
yaml
version: '3.8'

services:
  app:
    image: myapp:${VERSION:-latest}
    restart: unless-stopped
    environment:
      NODE_ENV: ${NODE_ENV:-production}
      LOG_LEVEL: ${LOG_LEVEL:-info}
      PORT: ${APP_PORT:-3000}
      DATABASE_URL: postgresql://${DB_USER}:${DB_PASSWORD}@database:5432/${DB_NAME}
      REDIS_URL: redis://:${REDIS_PASSWORD}@cache:6379
      JWT_SECRET: ${JWT_SECRET}
      API_TIMEOUT: ${API_TIMEOUT:-30000}
      MAX_CONNECTIONS: ${MAX_CONNECTIONS:-100}
    env_file:
      - .env.${ENVIRONMENT:-production}
      - .env.secrets
    networks:
      - app-network

  database:
    image: postgres:${POSTGRES_VERSION:-15}-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD: ${DB_PASSWORD}
      POSTGRES_INITDB_ARGS: ${POSTGRES_INITDB_ARGS:--E UTF8}
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - app-network

  cache:
    image: redis:${REDIS_VERSION:-7}-alpine
    restart: unless-stopped
    command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory ${REDIS_MAX_MEMORY:-256mb}
    volumes:
      - redis-data:/data
    networks:
      - app-network

networks:
  app-network:
    driver: bridge

volumes:
  postgres-data:
  redis-data:
多环境设置:
yaml
version: '3.8'

services:
  app:
    image: myapp:${VERSION:-latest}
    restart: unless-stopped
    environment:
      NODE_ENV: ${NODE_ENV:-production}
      LOG_LEVEL: ${LOG_LEVEL:-info}
      PORT: ${APP_PORT:-3000}
      DATABASE_URL: postgresql://${DB_USER}:${DB_PASSWORD}@database:5432/${DB_NAME}
      REDIS_URL: redis://:${REDIS_PASSWORD}@cache:6379
      JWT_SECRET: ${JWT_SECRET}
      API_TIMEOUT: ${API_TIMEOUT:-30000}
      MAX_CONNECTIONS: ${MAX_CONNECTIONS:-100}
    env_file:
      - .env.${ENVIRONMENT:-production}
      - .env.secrets
    networks:
      - app-network

  database:
    image: postgres:${POSTGRES_VERSION:-15}-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD: ${DB_PASSWORD}
      POSTGRES_INITDB_ARGS: ${POSTGRES_INITDB_ARGS:--E UTF8}
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - app-network

  cache:
    image: redis:${REDIS_VERSION:-7}-alpine
    restart: unless-stopped
    command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory ${REDIS_MAX_MEMORY:-256mb}
    volumes:
      - redis-data:/data
    networks:
      - app-network

networks:
  app-network:
    driver: bridge

volumes:
  postgres-data:
  redis-data:

Health Checks and Readiness

健康检查与就绪性

Comprehensive health monitoring:
yaml
version: '3.8'

services:
  web:
    image: nginx:alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  api:
    image: node:18-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "node", "healthcheck.js"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d production_db || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    volumes:
      - postgres-data:/var/lib/postgresql/data

  cache:
    image: redis:7-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 20s
    volumes:
      - redis-data:/data

  queue:
    image: rabbitmq:3-management-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "rabbitmq-diagnostics", "ping"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
    volumes:
      - rabbitmq-data:/var/lib/rabbitmq

volumes:
  postgres-data:
  redis-data:
  rabbitmq-data:
全面健康监控配置:
yaml
version: '3.8'

services:
  web:
    image: nginx:alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  api:
    image: node:18-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "node", "healthcheck.js"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d production_db || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    volumes:
      - postgres-data:/var/lib/postgresql/data

  cache:
    image: redis:7-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 20s
    volumes:
      - redis-data:/data

  queue:
    image: rabbitmq:3-management-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "rabbitmq-diagnostics", "ping"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
    volumes:
      - rabbitmq-data:/var/lib/rabbitmq

volumes:
  postgres-data:
  redis-data:
  rabbitmq-data:

Backup and Recovery

备份与恢复

Automated backup configuration:
yaml
version: '3.8'

services:
  database:
    image: postgres:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - backend
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${DB_USER:-postgres} -d ${DB_NAME:-postgres}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  db-backup:
    image: prodrigestivill/postgres-backup-local:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_HOST: database
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      SCHEDULE: "@daily"
      BACKUP_KEEP_DAYS: 7
      BACKUP_KEEP_WEEKS: 4
      BACKUP_KEEP_MONTHS: 6
      BACKUP_DIR: /backups
      HEALTHCHECK_PORT: 8080
    secrets:
      - db_password
    volumes:
      - ./backups:/backups
      - ./backup-scripts:/scripts:ro
    networks:
      - backend
    depends_on:
      database:
        condition: service_healthy

  volume-backup:
    image: futurice/docker-volume-backup:2.6.0
    restart: unless-stopped
    environment:
      BACKUP_CRON_EXPRESSION: "0 2 * * *"
      BACKUP_FILENAME: "backup-%Y-%m-%d_%H-%M-%S.tar.gz"
      BACKUP_RETENTION_DAYS: 30
      AWS_S3_BUCKET_NAME: ${S3_BACKUP_BUCKET}
      AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
      AWS_SECRET_ACCESS_KEY_FILE: /run/secrets/aws_secret
    secrets:
      - aws_secret
    volumes:
      - postgres-data:/backup/postgres-data:ro
      - redis-data:/backup/redis-data:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./backup-archive:/archive

networks:
  backend:
    driver: bridge

volumes:
  postgres-data:
  redis-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
  aws_secret:
    file: ./secrets/aws_secret.txt
自动化备份配置:
yaml
version: '3.8'

services:
  database:
    image: postgres:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - backend
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${DB_USER:-postgres} -d ${DB_NAME:-postgres}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  db-backup:
    image: prodrigestivill/postgres-backup-local:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_HOST: database
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      SCHEDULE: "@daily"
      BACKUP_KEEP_DAYS: 7
      BACKUP_KEEP_WEEKS: 4
      BACKUP_KEEP_MONTHS: 6
      BACKUP_DIR: /backups
      HEALTHCHECK_PORT: 8080
    secrets:
      - db_password
    volumes:
      - ./backups:/backups
      - ./backup-scripts:/scripts:ro
    networks:
      - backend
    depends_on:
      database:
        condition: service_healthy

  volume-backup:
    image: futurice/docker-volume-backup:2.6.0
    restart: unless-stopped
    environment:
      BACKUP_CRON_EXPRESSION: "0 2 * * *"
      BACKUP_FILENAME: "backup-%Y-%m-%d_%H-%M-%S.tar.gz"
      BACKUP_RETENTION_DAYS: 30
      AWS_S3_BUCKET_NAME: ${S3_BACKUP_BUCKET}
      AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
      AWS_SECRET_ACCESS_KEY_FILE: /run/secrets/aws_secret
    secrets:
      - aws_secret
    volumes:
      - postgres-data:/backup/postgres-data:ro
      - redis-data:/backup/redis-data:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./backup-archive:/archive

networks:
  backend:
    driver: bridge

volumes:
  postgres-data:
  redis-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
  aws_secret:
    file: ./secrets/aws_secret.txt

When to Use This Skill

何时使用本方案

Use docker-compose-production when you need to:
  • Deploy Docker Compose applications to production environments
  • Implement security hardening and best practices
  • Configure resource limits and reservations
  • Set up health checks and readiness probes
  • Implement high availability with multiple replicas
  • Configure production-grade logging and monitoring
  • Set up automated backups and disaster recovery
  • Manage secrets and sensitive configuration
  • Implement zero-downtime deployments
  • Configure multi-environment deployment strategies
  • Set up container orchestration for production workloads
  • Optimize performance and resource utilization
当你需要以下场景时,使用docker-compose-production:
  • 将Docker Compose应用部署到生产环境
  • 实施安全加固及最佳实践
  • 配置资源限制与预留
  • 设置健康检查和就绪探针
  • 实现多副本高可用
  • 配置生产级日志与监控
  • 设置自动化备份与灾难恢复
  • 管理密钥和敏感配置
  • 实现零停机部署
  • 配置多环境部署策略
  • 为生产工作负载设置容器编排
  • 优化性能和资源利用率

Best Practices

最佳实践

  1. Always Use Version Pinning: Pin specific image versions instead of using
    latest
    to ensure reproducible deployments.
  2. Implement Health Checks: Configure health checks for all services to enable automatic recovery and proper dependency management.
  3. Set Resource Limits: Always define CPU and memory limits to prevent resource exhaustion and ensure predictable performance.
  4. Use Secrets Management: Never store secrets in environment variables or compose files; use Docker secrets or external secret managers.
  5. Configure Restart Policies: Use
    restart: unless-stopped
    for production services to ensure automatic recovery from failures.
  6. Implement Proper Logging: Configure structured logging with rotation and retention policies to manage disk space.
  7. Use Read-Only Filesystems: Set
    read_only: true
    where possible and use tmpfs for temporary data to improve security.
  8. Drop Unnecessary Capabilities: Use
    cap_drop: ALL
    and only add required capabilities to follow the principle of least privilege.
  9. Enable Monitoring: Deploy monitoring and observability tools to track application health and performance metrics.
  10. Implement Automated Backups: Configure regular automated backups with retention policies and test recovery procedures.
  11. Use Internal Networks: Mark backend networks as internal to prevent direct external access to databases and caches.
  12. Configure Update Strategies: Define update and rollback configurations for zero-downtime deployments.
  13. Implement Resource Reservations: Set resource reservations to guarantee minimum resources for critical services.
  14. Use Multi-Stage Dependencies: Configure
    depends_on
    with health check conditions to ensure proper startup order.
  15. Document Configuration: Maintain comprehensive documentation of your production configuration and deployment procedures.
  1. 始终使用版本固定:固定具体的镜像版本,而非使用
    latest
    标签,确保部署的可重复性。
  2. 实施健康检查:为所有服务配置健康检查,实现自动恢复和正确的依赖管理。
  3. 设置资源限制:始终定义CPU和内存限制,防止资源耗尽,确保性能可预测。
  4. 使用密钥管理:切勿将密钥存储在环境变量或Compose文件中;使用Docker Secrets或外部密钥管理器。
  5. 配置重启策略:生产服务使用
    restart: unless-stopped
    ,确保故障时自动恢复。
  6. 实施正确的日志配置:配置结构化日志及轮转和保留策略,管理磁盘空间。
  7. 使用只读文件系统:尽可能设置
    read_only: true
    ,并使用tmpfs存储临时数据,提升安全性。
  8. 移除不必要的权限:使用
    cap_drop: ALL
    ,仅添加必要的权限,遵循最小权限原则。
  9. 启用监控:部署监控和可观测性工具,跟踪应用健康状况和性能指标。
  10. 实施自动化备份:配置定期自动化备份及保留策略,并测试恢复流程。
  11. 使用内部网络:将后端网络标记为内部网络,防止数据库和缓存直接暴露到外部。
  12. 配置更新策略:定义更新和回滚配置,实现零停机部署。
  13. 实施资源预留:设置资源预留,为关键服务保证最低资源。
  14. 使用多阶段依赖:配置
    depends_on
    并结合健康检查条件,确保正确的启动顺序。
  15. 文档化配置:维护生产配置和部署流程的全面文档。

Common Pitfalls

常见陷阱

  1. Using Latest Tags: Using
    latest
    or unversioned images can cause unexpected behavior when images are updated; always pin versions.
  2. Ignoring Resource Limits: Not setting resource limits can allow one service to consume all available resources and crash others.
  3. Missing Health Checks: Without health checks, Docker cannot determine if services are actually ready or need to be restarted.
  4. Storing Secrets in Plain Text: Committing secrets to version control or storing them in environment variables exposes sensitive data.
  5. Not Testing Backups: Creating backups without regularly testing restoration procedures leads to data loss during actual incidents.
  6. Exposing Unnecessary Ports: Publishing all service ports to the host increases attack surface; only expose what's needed.
  7. Running as Root: Not specifying a non-root user leaves containers vulnerable to privilege escalation attacks.
  8. Ignoring Log Rotation: Without log rotation, logs can fill up disk space and crash services or hosts.
  9. Missing Monitoring: Deploying without monitoring makes it impossible to detect and diagnose issues before they impact users.
  10. Not Using Networks: Running all services on the default network prevents proper segmentation and increases security risk.
  11. Forgetting Readiness Checks: Starting dependent services before dependencies are ready causes connection failures and restarts.
  12. Hardcoding Configuration: Embedding environment-specific values in the compose file makes it difficult to deploy to multiple environments.
  13. Neglecting Security Updates: Not regularly updating base images leaves services vulnerable to known security issues.
  14. Insufficient Start Period: Setting health check start periods too short causes false positives during slow application startup.
  15. Not Planning for Scale: Designing services without considering horizontal scaling makes it difficult to handle increased load.
  1. 使用Latest标签:使用
    latest
    或无版本镜像会在镜像更新时导致意外行为;始终固定版本。
  2. 忽略资源限制:不设置资源限制会导致单个服务消耗所有可用资源,导致其他服务崩溃。
  3. 缺少健康检查:没有健康检查,Docker无法判断服务是否真正就绪或需要重启。
  4. 明文存储密钥:将密钥提交到版本控制或存储在环境变量中会暴露敏感数据。
  5. 不测试备份:仅创建备份而不定期测试恢复流程,会在实际事故中导致数据丢失。
  6. 暴露不必要的端口:将所有服务端口发布到主机增加攻击面;仅暴露必要的端口。
  7. 以Root用户运行:不指定非Root用户会使容器面临权限提升攻击的风险。
  8. 忽略日志轮转:没有日志轮转,日志会占满磁盘空间,导致服务或主机崩溃。
  9. 缺少监控:部署时不配置监控,无法在影响用户前检测和诊断问题。
  10. 不使用网络:所有服务运行在默认网络上会导致无法正确分段,增加安全风险。
  11. 忘记就绪检查:在依赖项就绪前启动依赖服务会导致连接失败和重启。
  12. 硬编码配置:在Compose文件中嵌入环境特定值会导致难以部署到多环境。
  13. 忽略安全更新:不定期更新基础镜像会使服务面临已知安全漏洞的风险。
  14. 启动周期不足:健康检查启动周期设置过短会在应用启动缓慢时导致误报。
  15. 未规划扩展:设计服务时不考虑水平扩展会导致难以处理增长的负载。

Resources

资源

Official Documentation

官方文档

Deployment Guides

部署指南

Tools and Images

工具与镜像

Monitoring

监控