docker-compose-production

Compare original and translation side by side

🇺🇸

Original

English
🇨🇳

Translation

Chinese

Docker Compose Production Deployment

Docker Compose 生产环境部署

Production-ready Docker Compose configurations with security, reliability, and scalability best practices.
具备安全性、可靠性和可扩展性最佳实践的生产就绪型Docker Compose配置。

Production-Ready Base Template

生产就绪基础模板

A comprehensive production template with essential configurations:
yaml
version: '3.8'

services:
  nginx:
    image: nginx:1.25-alpine
    container_name: production-nginx
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx-cache:/var/cache/nginx
      - nginx-logs:/var/log/nginx
    networks:
      - frontend
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M
        reservations:
          cpus: '0.5'
          memory: 256M

  api:
    image: mycompany/api:${API_VERSION:-latest}
    container_name: production-api
    restart: unless-stopped
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"), not "postgres" —
      # there is no service named "postgres" on these networks. "postgres@"
      # selects the database role; the password is supplied to the app via
      # the db_password secret, not embedded in the URL.
      DATABASE_URL: postgresql://postgres@database:5432/production_db
      REDIS_URL: redis://cache:6379
      LOG_LEVEL: ${LOG_LEVEL:-info}
      # Quoted so the value stays a string across YAML parsers.
      PORT: "3000"
    env_file:
      - .env.production
    secrets:
      - db_password
      - jwt_secret
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

  worker:
    image: mycompany/worker:${WORKER_VERSION:-latest}
    # No container_name here: a fixed container name conflicts with
    # "deploy.replicas: 3" — Compose must generate unique per-replica names,
    # and refuses to scale a service whose name is pinned.
    restart: unless-stopped
    networks:
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"); see the api
      # service for the same fix.
      DATABASE_URL: postgresql://postgres@database:5432/production_db
      REDIS_URL: redis://cache:6379
      QUEUE_NAME: ${QUEUE_NAME:-default}
    env_file:
      - .env.production
    secrets:
      - db_password
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M

  database:
    image: postgres:15-alpine
    container_name: production-db
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_DB: production_db
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      POSTGRES_INITDB_ARGS: "-E UTF8 --locale=en_US.UTF-8"
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
      - ./db/init:/docker-entrypoint-initdb.d:ro
      - postgres-logs:/var/log/postgresql
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d production_db"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    command:
      - "postgres"
      - "-c"
      - "max_connections=200"
      - "-c"
      - "shared_buffers=256MB"
      - "-c"
      - "effective_cache_size=1GB"
      - "-c"
      - "maintenance_work_mem=64MB"
      - "-c"
      - "checkpoint_completion_target=0.9"
      - "-c"
      - "wal_buffers=16MB"
      - "-c"
      - "default_statistics_target=100"
      - "-c"
      - "random_page_cost=1.1"
      - "-c"
      - "effective_io_concurrency=200"
      - "-c"
      - "work_mem=1MB"
      - "-c"
      - "min_wal_size=1GB"
      - "-c"
      - "max_wal_size=4GB"
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

  cache:
    image: redis:7-alpine
    container_name: production-cache
    restart: unless-stopped
    networks:
      - backend
    command: >
      redis-server
      --appendonly yes
      --appendfsync everysec
      --maxmemory 512mb
      --maxmemory-policy allkeys-lru
      --requirepass ${REDIS_PASSWORD}
    volumes:
      - redis-data:/data
    healthcheck:
      # The server requires auth (--requirepass above), so the probe must
      # authenticate or it only ever sees NOAUTH errors. PING is also
      # side-effect free, unlike the previous "incr ping" which mutated a
      # key on every probe. ${REDIS_PASSWORD} is interpolated by Compose at
      # config time, exactly as in "command" above.
      test: ["CMD-SHELL", "redis-cli --no-auth-warning -a '${REDIS_PASSWORD}' ping | grep -q PONG"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 20s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 768M
        reservations:
          cpus: '0.5'
          memory: 512M

  backup:
    image: prodrigestivill/postgres-backup-local:15-alpine
    container_name: production-backup
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_HOST: database
      POSTGRES_DB: production_db
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      SCHEDULE: "@daily"
      BACKUP_KEEP_DAYS: 7
      BACKUP_KEEP_WEEKS: 4
      BACKUP_KEEP_MONTHS: 6
      HEALTHCHECK_PORT: 8080
    secrets:
      - db_password
    volumes:
      - ./backups:/backups
    depends_on:
      database:
        condition: service_healthy

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/postgres
  redis-data:
    driver: local
  nginx-cache:
    driver: local
  nginx-logs:
    driver: local
  postgres-logs:
    driver: local

secrets:
  db_password:
    file: ./secrets/db_password.txt
  jwt_secret:
    file: ./secrets/jwt_secret.txt
包含核心配置的全面生产模板:
yaml
version: '3.8'

services:
  nginx:
    image: nginx:1.25-alpine
    container_name: production-nginx
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx-cache:/var/cache/nginx
      - nginx-logs:/var/log/nginx
    networks:
      - frontend
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M
        reservations:
          cpus: '0.5'
          memory: 256M

  api:
    image: mycompany/api:${API_VERSION:-latest}
    container_name: production-api
    restart: unless-stopped
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"), not "postgres" —
      # there is no service named "postgres" on these networks. "postgres@"
      # selects the database role; the password is supplied to the app via
      # the db_password secret, not embedded in the URL.
      DATABASE_URL: postgresql://postgres@database:5432/production_db
      REDIS_URL: redis://cache:6379
      LOG_LEVEL: ${LOG_LEVEL:-info}
      # Quoted so the value stays a string across YAML parsers.
      PORT: "3000"
    env_file:
      - .env.production
    secrets:
      - db_password
      - jwt_secret
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

  worker:
    image: mycompany/worker:${WORKER_VERSION:-latest}
    # No container_name here: a fixed container name conflicts with
    # "deploy.replicas: 3" — Compose must generate unique per-replica names,
    # and refuses to scale a service whose name is pinned.
    restart: unless-stopped
    networks:
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"); see the api
      # service for the same fix.
      DATABASE_URL: postgresql://postgres@database:5432/production_db
      REDIS_URL: redis://cache:6379
      QUEUE_NAME: ${QUEUE_NAME:-default}
    env_file:
      - .env.production
    secrets:
      - db_password
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M

  database:
    image: postgres:15-alpine
    container_name: production-db
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_DB: production_db
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      POSTGRES_INITDB_ARGS: "-E UTF8 --locale=en_US.UTF-8"
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
      - ./db/init:/docker-entrypoint-initdb.d:ro
      - postgres-logs:/var/log/postgresql
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d production_db"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    command:
      - "postgres"
      - "-c"
      - "max_connections=200"
      - "-c"
      - "shared_buffers=256MB"
      - "-c"
      - "effective_cache_size=1GB"
      - "-c"
      - "maintenance_work_mem=64MB"
      - "-c"
      - "checkpoint_completion_target=0.9"
      - "-c"
      - "wal_buffers=16MB"
      - "-c"
      - "default_statistics_target=100"
      - "-c"
      - "random_page_cost=1.1"
      - "-c"
      - "effective_io_concurrency=200"
      - "-c"
      - "work_mem=1MB"
      - "-c"
      - "min_wal_size=1GB"
      - "-c"
      - "max_wal_size=4GB"
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

  cache:
    image: redis:7-alpine
    container_name: production-cache
    restart: unless-stopped
    networks:
      - backend
    command: >
      redis-server
      --appendonly yes
      --appendfsync everysec
      --maxmemory 512mb
      --maxmemory-policy allkeys-lru
      --requirepass ${REDIS_PASSWORD}
    volumes:
      - redis-data:/data
    healthcheck:
      # The server requires auth (--requirepass above), so the probe must
      # authenticate or it only ever sees NOAUTH errors. PING is also
      # side-effect free, unlike the previous "incr ping" which mutated a
      # key on every probe. ${REDIS_PASSWORD} is interpolated by Compose at
      # config time, exactly as in "command" above.
      test: ["CMD-SHELL", "redis-cli --no-auth-warning -a '${REDIS_PASSWORD}' ping | grep -q PONG"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 20s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 768M
        reservations:
          cpus: '0.5'
          memory: 512M

  backup:
    image: prodrigestivill/postgres-backup-local:15-alpine
    container_name: production-backup
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_HOST: database
      POSTGRES_DB: production_db
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      SCHEDULE: "@daily"
      BACKUP_KEEP_DAYS: 7
      BACKUP_KEEP_WEEKS: 4
      BACKUP_KEEP_MONTHS: 6
      HEALTHCHECK_PORT: 8080
    secrets:
      - db_password
    volumes:
      - ./backups:/backups
    depends_on:
      database:
        condition: service_healthy

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/postgres
  redis-data:
    driver: local
  nginx-cache:
    driver: local
  nginx-logs:
    driver: local
  postgres-logs:
    driver: local

secrets:
  db_password:
    file: ./secrets/db_password.txt
  jwt_secret:
    file: ./secrets/jwt_secret.txt

Security Hardening

安全加固

Production security configurations:
yaml
version: '3.8'

services:
  web:
    image: nginx:1.25-alpine
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /var/cache/nginx
      - /var/run
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE
    security_opt:
      - no-new-privileges:true
      - seccomp:./security/seccomp-profile.json
    user: "nginx:nginx"
    networks:
      - frontend
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro

  api:
    image: mycompany/api:${VERSION}
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp
    cap_drop:
      - ALL
    security_opt:
      - no-new-privileges:true
      - seccomp:./security/seccomp-profile.json
    user: "1000:1000"
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
    env_file:
      - .env.production
    secrets:
      - source: db_password
        target: /run/secrets/db_password
        mode: 0400
      - source: api_key
        target: /run/secrets/api_key
        mode: 0400

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp
      - /run/postgresql
    cap_drop:
      - ALL
    cap_add:
      - CHOWN
      - DAC_OVERRIDE
      - FOWNER
      - SETGID
      - SETUID
    security_opt:
      - no-new-privileges:true
    user: "postgres:postgres"
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - source: db_password
        mode: 0400
    volumes:
      - postgres-data:/var/lib/postgresql/data

networks:
  frontend:
    driver: bridge
    driver_opts:
      com.docker.network.bridge.enable_icc: "false"
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
  api_key:
    file: ./secrets/api_key.txt
生产环境安全配置:
yaml
version: '3.8'

services:
  web:
    image: nginx:1.25-alpine
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /var/cache/nginx
      - /var/run
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE
    security_opt:
      - no-new-privileges:true
      - seccomp:./security/seccomp-profile.json
    user: "nginx:nginx"
    networks:
      - frontend
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro

  api:
    image: mycompany/api:${VERSION}
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp
    cap_drop:
      - ALL
    security_opt:
      - no-new-privileges:true
      - seccomp:./security/seccomp-profile.json
    user: "1000:1000"
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
    env_file:
      - .env.production
    secrets:
      - source: db_password
        target: /run/secrets/db_password
        mode: 0400
      - source: api_key
        target: /run/secrets/api_key
        mode: 0400

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp
      - /run/postgresql
    cap_drop:
      - ALL
    cap_add:
      - CHOWN
      - DAC_OVERRIDE
      - FOWNER
      - SETGID
      - SETUID
    security_opt:
      - no-new-privileges:true
    user: "postgres:postgres"
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - source: db_password
        mode: 0400
    volumes:
      - postgres-data:/var/lib/postgresql/data

networks:
  frontend:
    driver: bridge
    driver_opts:
      com.docker.network.bridge.enable_icc: "false"
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
  api_key:
    file: ./secrets/api_key.txt

Resource Limits and Reservations

资源限制与预留

Comprehensive resource management:
yaml
version: '3.8'

services:
  web:
    image: nginx:alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '0.50'
          memory: 256M
          pids: 100
        reservations:
          cpus: '0.25'
          memory: 128M
    ulimits:
      nofile:
        soft: 1024
        hard: 2048
      nproc:
        soft: 64
        hard: 128

  api:
    image: node:18-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
          pids: 200
        reservations:
          cpus: '1.0'
          memory: 1G
    ulimits:
      nofile:
        soft: 4096
        hard: 8192
      nproc:
        soft: 256
        hard: 512

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 4G
          pids: 500
        reservations:
          cpus: '2.0'
          memory: 2G
    ulimits:
      nofile:
        soft: 8192
        hard: 16384
    shm_size: '256mb'
    volumes:
      - postgres-data:/var/lib/postgresql/data

  cache:
    image: redis:7-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    sysctls:
      net.core.somaxconn: 1024
    volumes:
      - redis-data:/data

volumes:
  postgres-data:
  redis-data:
全面资源管理配置:
yaml
version: '3.8'

services:
  web:
    image: nginx:alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '0.50'
          memory: 256M
          pids: 100
        reservations:
          cpus: '0.25'
          memory: 128M
    ulimits:
      nofile:
        soft: 1024
        hard: 2048
      nproc:
        soft: 64
        hard: 128

  api:
    image: node:18-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
          pids: 200
        reservations:
          cpus: '1.0'
          memory: 1G
    ulimits:
      nofile:
        soft: 4096
        hard: 8192
      nproc:
        soft: 256
        hard: 512

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 4G
          pids: 500
        reservations:
          cpus: '2.0'
          memory: 2G
    ulimits:
      nofile:
        soft: 8192
        hard: 16384
    shm_size: '256mb'
    volumes:
      - postgres-data:/var/lib/postgresql/data

  cache:
    image: redis:7-alpine
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    sysctls:
      net.core.somaxconn: 1024
    volumes:
      - redis-data:/data

volumes:
  postgres-data:
  redis-data:

High Availability Configuration

高可用配置

Multiple replicas with load balancing:
yaml
version: '3.8'

services:
  loadbalancer:
    image: nginx:alpine
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx-lb.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    networks:
      - frontend
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 10s
      timeout: 5s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M

  api:
    image: mycompany/api:${VERSION}
    restart: unless-stopped
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"), not "postgres" —
      # no service named "postgres" exists on these networks. "postgres@"
      # selects the default database role.
      DATABASE_URL: postgresql://postgres@database:5432/app
      INSTANCE_ID: "{{.Task.Slot}}"
    deploy:
      replicas: 5
      update_config:
        parallelism: 2
        delay: 10s
        order: start-first
        failure_action: rollback
      rollback_config:
        parallelism: 2
        delay: 10s
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 4G

  database-replica:
    image: postgres:15-alpine
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      POSTGRES_PRIMARY_HOST: database
      POSTGRES_PRIMARY_PORT: 5432
    secrets:
      - db_password
    volumes:
      - postgres-replica-data:/var/lib/postgresql/data
      - ./db/replica-setup.sh:/docker-entrypoint-initdb.d/replica-setup.sh:ro
    depends_on:
      database:
        condition: service_healthy
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:
  postgres-replica-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
带负载均衡的多副本配置:
yaml
version: '3.8'

services:
  loadbalancer:
    image: nginx:alpine
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx-lb.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    networks:
      - frontend
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 10s
      timeout: 5s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M

  api:
    image: mycompany/api:${VERSION}
    restart: unless-stopped
    networks:
      - frontend
      - backend
    environment:
      NODE_ENV: production
      # Host must be the Compose service name ("database"), not "postgres" —
      # no service named "postgres" exists on these networks. "postgres@"
      # selects the default database role.
      DATABASE_URL: postgresql://postgres@database:5432/app
      INSTANCE_ID: "{{.Task.Slot}}"
    deploy:
      replicas: 5
      update_config:
        parallelism: 2
        delay: 10s
        order: start-first
        failure_action: rollback
      rollback_config:
        parallelism: 2
        delay: 10s
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 4G

  database-replica:
    image: postgres:15-alpine
    restart: unless-stopped
    networks:
      - backend
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      POSTGRES_PRIMARY_HOST: database
      POSTGRES_PRIMARY_PORT: 5432
    secrets:
      - db_password
    volumes:
      - postgres-replica-data:/var/lib/postgresql/data
      - ./db/replica-setup.sh:/docker-entrypoint-initdb.d/replica-setup.sh:ro
    depends_on:
      database:
        condition: service_healthy
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

volumes:
  postgres-data:
  postgres-replica-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt

Monitoring and Observability

监控与可观测性

Production monitoring stack:
yaml
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/alerts:/etc/prometheus/alerts:ro
      - prometheus-data:/prometheus
    networks:
      - monitoring
    ports:
      - "9090:9090"
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 2G

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/grafana_password
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource
      GF_SERVER_ROOT_URL: https://monitoring.example.com
    secrets:
      - grafana_password
    volumes:
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    networks:
      - monitoring
      - frontend
    ports:
      - "3001:3000"
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 512M

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    command:
      - '--path.rootfs=/host'
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /:/host:ro,rslave
    networks:
      - monitoring
    ports:
      - "9100:9100"
    deploy:
      resources:
        limits:
          cpus: '0.2'
          memory: 128M

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    privileged: true
    devices:
      - /dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    networks:
      - monitoring
    ports:
      - "8080:8080"
    deploy:
      resources:
        limits:
          cpus: '0.3'
          memory: 256M

  loki:
    image: grafana/loki:latest
    container_name: loki
    restart: unless-stopped
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    networks:
      - monitoring
    ports:
      - "3100:3100"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G

  promtail:
    image: grafana/promtail:latest
    container_name: promtail
    restart: unless-stopped
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - ./monitoring/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    networks:
      - monitoring
    depends_on:
      - loki
    deploy:
      resources:
        limits:
          cpus: '0.2'
          memory: 256M

networks:
  monitoring:
    driver: bridge
  frontend:
    driver: bridge

volumes:
  prometheus-data:
  grafana-data:
  loki-data:

secrets:
  grafana_password:
    file: ./secrets/grafana_password.txt
生产环境监控栈:
yaml
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/alerts:/etc/prometheus/alerts:ro
      - prometheus-data:/prometheus
    networks:
      - monitoring
    ports:
      - "9090:9090"
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 2G

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/grafana_password
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource
      GF_SERVER_ROOT_URL: https://monitoring.example.com
    secrets:
      - grafana_password
    volumes:
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    networks:
      - monitoring
      - frontend
    ports:
      - "3001:3000"
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 512M

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    command:
      - '--path.rootfs=/host'
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /:/host:ro,rslave
    networks:
      - monitoring
    ports:
      - "9100:9100"
    deploy:
      resources:
        limits:
          cpus: '0.2'
          memory: 128M

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    privileged: true
    devices:
      - /dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    networks:
      - monitoring
    ports:
      - "8080:8080"
    deploy:
      resources:
        limits:
          cpus: '0.3'
          memory: 256M

  loki:
    image: grafana/loki:latest
    container_name: loki
    restart: unless-stopped
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    networks:
      - monitoring
    ports:
      - "3100:3100"
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G

  promtail:
    image: grafana/promtail:2.9.8
    container_name: promtail
    restart: unless-stopped
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - ./monitoring/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    networks:
      - monitoring
    depends_on:
      - loki
    deploy:
      resources:
        limits:
          cpus: '0.2'
          memory: 256M

networks:
  monitoring:
    driver: bridge
  frontend:
    driver: bridge

volumes:
  prometheus-data:
  grafana-data:
  loki-data:

secrets:
  grafana_password:
    file: ./secrets/grafana_password.txt

Logging Configuration

日志配置

Centralized logging setup:
yaml
version: '3.8'

services:
  app:
    image: myapp:latest
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
        labels: "app,environment,version"
        tag: "{{.Name}}/{{.ID}}"
    labels:
      app: "myapp"
      environment: "production"
      version: "${VERSION}"

  nginx:
    image: nginx:alpine
    restart: unless-stopped
    logging:
      driver: "syslog"
      options:
        syslog-address: "tcp://logserver:514"
        tag: "nginx"
        syslog-format: "rfc5424micro"

  api:
    image: api:latest
    restart: unless-stopped
    logging:
      driver: "fluentd"
      options:
        fluentd-address: "localhost:24224"
        tag: "docker.{{.Name}}"
        fluentd-async: "true"
        fluentd-retry-wait: "1s"
        fluentd-max-retries: "30"

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "10"
        compress: "true"
    volumes:
      - postgres-data:/var/lib/postgresql/data

volumes:
  postgres-data:
集中式日志设置:
yaml
version: '3.8'

services:
  app:
    image: myapp:latest
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "5"
        labels: "app,environment,version"
        tag: "{{.Name}}/{{.ID}}"
    labels:
      app: "myapp"
      environment: "production"
      version: "${VERSION}"

  nginx:
    image: nginx:alpine
    restart: unless-stopped
    logging:
      driver: "syslog"
      options:
        syslog-address: "tcp://logserver:514"
        tag: "nginx"
        syslog-format: "rfc5424micro"

  api:
    image: api:latest
    restart: unless-stopped
    logging:
      driver: "fluentd"
      options:
        fluentd-address: "localhost:24224"
        tag: "docker.{{.Name}}"
        fluentd-async: "true"
        fluentd-retry-wait: "1s"
        fluentd-max-retries: "30"

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "10"
        compress: "true"
    volumes:
      - postgres-data:/var/lib/postgresql/data

volumes:
  postgres-data:

Environment Configuration Management

环境配置管理

Multi-environment setup:
yaml
version: '3.8'

services:
  app:
    image: myapp:${VERSION:-latest}
    restart: unless-stopped
    environment:
      NODE_ENV: ${NODE_ENV:-production}
      LOG_LEVEL: ${LOG_LEVEL:-info}
      PORT: ${APP_PORT:-3000}
      DATABASE_URL: postgresql://${DB_USER}:${DB_PASSWORD}@database:5432/${DB_NAME}
      REDIS_URL: redis://:${REDIS_PASSWORD}@cache:6379
      JWT_SECRET: ${JWT_SECRET}
      API_TIMEOUT: ${API_TIMEOUT:-30000}
      MAX_CONNECTIONS: ${MAX_CONNECTIONS:-100}
    env_file:
      - .env.${ENVIRONMENT:-production}
      - .env.secrets
    networks:
      - app-network

  database:
    image: postgres:${POSTGRES_VERSION:-15}-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD: ${DB_PASSWORD}
      POSTGRES_INITDB_ARGS: ${POSTGRES_INITDB_ARGS:--E UTF8}
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - app-network

  cache:
    image: redis:${REDIS_VERSION:-7}-alpine
    restart: unless-stopped
    command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory ${REDIS_MAX_MEMORY:-256mb}
    volumes:
      - redis-data:/data
    networks:
      - app-network

networks:
  app-network:
    driver: bridge

volumes:
  postgres-data:
  redis-data:
多环境设置:
yaml
version: '3.8'

services:
  app:
    image: myapp:${VERSION:-latest}
    restart: unless-stopped
    environment:
      NODE_ENV: ${NODE_ENV:-production}
      LOG_LEVEL: ${LOG_LEVEL:-info}
      PORT: ${APP_PORT:-3000}
      DATABASE_URL: postgresql://${DB_USER}:${DB_PASSWORD}@database:5432/${DB_NAME}
      REDIS_URL: redis://:${REDIS_PASSWORD}@cache:6379
      JWT_SECRET: ${JWT_SECRET}
      API_TIMEOUT: ${API_TIMEOUT:-30000}
      MAX_CONNECTIONS: ${MAX_CONNECTIONS:-100}
    env_file:
      - .env.${ENVIRONMENT:-production}
      - .env.secrets
    networks:
      - app-network

  database:
    image: postgres:${POSTGRES_VERSION:-15}-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD: ${DB_PASSWORD}
      POSTGRES_INITDB_ARGS: ${POSTGRES_INITDB_ARGS:--E UTF8}
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - app-network

  cache:
    image: redis:${REDIS_VERSION:-7}-alpine
    restart: unless-stopped
    command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory ${REDIS_MAX_MEMORY:-256mb}
    volumes:
      - redis-data:/data
    networks:
      - app-network

networks:
  app-network:
    driver: bridge

volumes:
  postgres-data:
  redis-data:

Health Checks and Readiness

健康检查与就绪性

Comprehensive health monitoring:
yaml
version: '3.8'

services:
  web:
    image: nginx:alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  api:
    image: node:18-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "node", "healthcheck.js"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d production_db || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    volumes:
      - postgres-data:/var/lib/postgresql/data

  cache:
    image: redis:7-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 20s
    volumes:
      - redis-data:/data

  queue:
    image: rabbitmq:3-management-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "rabbitmq-diagnostics", "ping"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
    volumes:
      - rabbitmq-data:/var/lib/rabbitmq

volumes:
  postgres-data:
  redis-data:
  rabbitmq-data:
全面健康监控配置:
yaml
version: '3.8'

services:
  web:
    image: nginx:alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  api:
    image: node:18-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "node", "healthcheck.js"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy

  database:
    image: postgres:15-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d production_db || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    volumes:
      - postgres-data:/var/lib/postgresql/data

  cache:
    image: redis:7-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 20s
    volumes:
      - redis-data:/data

  queue:
    image: rabbitmq:3-management-alpine
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "rabbitmq-diagnostics", "ping"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
    volumes:
      - rabbitmq-data:/var/lib/rabbitmq

volumes:
  postgres-data:
  redis-data:
  rabbitmq-data:

Backup and Recovery

备份与恢复

Automated backup configuration:
yaml
version: '3.8'

services:
  database:
    image: postgres:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - backend
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${DB_USER:-postgres} -d ${DB_NAME:-postgres}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  db-backup:
    image: prodrigestivill/postgres-backup-local:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_HOST: database
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      SCHEDULE: "@daily"
      BACKUP_KEEP_DAYS: 7
      BACKUP_KEEP_WEEKS: 4
      BACKUP_KEEP_MONTHS: 6
      BACKUP_DIR: /backups
      HEALTHCHECK_PORT: 8080
    secrets:
      - db_password
    volumes:
      - ./backups:/backups
      - ./backup-scripts:/scripts:ro
    networks:
      - backend
    depends_on:
      database:
        condition: service_healthy

  volume-backup:
    image: futurice/docker-volume-backup:2.6.0
    restart: unless-stopped
    environment:
      BACKUP_CRON_EXPRESSION: "0 2 * * *"
      BACKUP_FILENAME: "backup-%Y-%m-%d_%H-%M-%S.tar.gz"
      BACKUP_RETENTION_DAYS: 30
      AWS_S3_BUCKET_NAME: ${S3_BACKUP_BUCKET}
      AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
      AWS_SECRET_ACCESS_KEY_FILE: /run/secrets/aws_secret
    secrets:
      - aws_secret
    volumes:
      - postgres-data:/backup/postgres-data:ro
      - redis-data:/backup/redis-data:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./backup-archive:/archive

networks:
  backend:
    driver: bridge

volumes:
  postgres-data:
  redis-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
  aws_secret:
    file: ./secrets/aws_secret.txt
自动化备份配置:
yaml
version: '3.8'

services:
  database:
    image: postgres:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - backend
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${DB_USER:-postgres} -d ${DB_NAME:-postgres}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  db-backup:
    image: prodrigestivill/postgres-backup-local:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_HOST: database
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
      SCHEDULE: "@daily"
      BACKUP_KEEP_DAYS: 7
      BACKUP_KEEP_WEEKS: 4
      BACKUP_KEEP_MONTHS: 6
      BACKUP_DIR: /backups
      HEALTHCHECK_PORT: 8080
    secrets:
      - db_password
    volumes:
      - ./backups:/backups
      - ./backup-scripts:/scripts:ro
    networks:
      - backend
    depends_on:
      database:
        condition: service_healthy

  volume-backup:
    image: futurice/docker-volume-backup:2.6.0
    restart: unless-stopped
    environment:
      BACKUP_CRON_EXPRESSION: "0 2 * * *"
      BACKUP_FILENAME: "backup-%Y-%m-%d_%H-%M-%S.tar.gz"
      BACKUP_RETENTION_DAYS: 30
      AWS_S3_BUCKET_NAME: ${S3_BACKUP_BUCKET}
      AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
      AWS_SECRET_ACCESS_KEY_FILE: /run/secrets/aws_secret
    secrets:
      - aws_secret
    volumes:
      - postgres-data:/backup/postgres-data:ro
      - redis-data:/backup/redis-data:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./backup-archive:/archive

networks:
  backend:
    driver: bridge

volumes:
  postgres-data:
  redis-data:

secrets:
  db_password:
    file: ./secrets/db_password.txt
  aws_secret:
    file: ./secrets/aws_secret.txt

When to Use This Skill

何时使用本方案

Use docker-compose-production when you need to:
  • Deploy Docker Compose applications to production environments
  • Implement security hardening and best practices
  • Configure resource limits and reservations
  • Set up health checks and readiness probes
  • Implement high availability with multiple replicas
  • Configure production-grade logging and monitoring
  • Set up automated backups and disaster recovery
  • Manage secrets and sensitive configuration
  • Implement zero-downtime deployments
  • Configure multi-environment deployment strategies
  • Set up container orchestration for production workloads
  • Optimize performance and resource utilization
当你需要以下场景时,使用docker-compose-production:
  • 将Docker Compose应用部署到生产环境
  • 实施安全加固及最佳实践
  • 配置资源限制与预留
  • 设置健康检查和就绪探针
  • 实现多副本高可用
  • 配置生产级日志与监控
  • 设置自动化备份与灾难恢复
  • 管理密钥和敏感配置
  • 实现零停机部署
  • 配置多环境部署策略
  • 为生产工作负载设置容器编排
  • 优化性能和资源利用率

Best Practices

最佳实践

  1. Always Use Version Pinning: Pin specific image versions instead of using
    latest
    to ensure reproducible deployments.
  2. Implement Health Checks: Configure health checks for all services to enable automatic recovery and proper dependency management.
  3. Set Resource Limits: Always define CPU and memory limits to prevent resource exhaustion and ensure predictable performance.
  4. Use Secrets Management: Never store secrets in environment variables or compose files; use Docker secrets or external secret managers.
  5. Configure Restart Policies: Use
    restart: unless-stopped
    for production services to ensure automatic recovery from failures.
  6. Implement Proper Logging: Configure structured logging with rotation and retention policies to manage disk space.
  7. Use Read-Only Filesystems: Set
    read_only: true
    where possible and use tmpfs for temporary data to improve security.
  8. Drop Unnecessary Capabilities: Use
    cap_drop: ALL
    and only add required capabilities to follow the principle of least privilege.
  9. Enable Monitoring: Deploy monitoring and observability tools to track application health and performance metrics.
  10. Implement Automated Backups: Configure regular automated backups with retention policies and test recovery procedures.
  11. Use Internal Networks: Mark backend networks as internal to prevent direct external access to databases and caches.
  12. Configure Update Strategies: Define update and rollback configurations for zero-downtime deployments.
  13. Implement Resource Reservations: Set resource reservations to guarantee minimum resources for critical services.
  14. Use Multi-Stage Dependencies: Configure
    depends_on
    with health check conditions to ensure proper startup order.
  15. Document Configuration: Maintain comprehensive documentation of your production configuration and deployment procedures.
  1. 始终使用版本固定:固定具体的镜像版本,而非使用
    latest
    标签,确保部署的可重复性。
  2. 实施健康检查:为所有服务配置健康检查,实现自动恢复和正确的依赖管理。
  3. 设置资源限制:始终定义CPU和内存限制,防止资源耗尽,确保性能可预测。
  4. 使用密钥管理:切勿将密钥存储在环境变量或Compose文件中;使用Docker Secrets或外部密钥管理器。
  5. 配置重启策略:生产服务使用
    restart: unless-stopped
    ,确保故障时自动恢复。
  6. 实施正确的日志配置:配置结构化日志及轮转和保留策略,管理磁盘空间。
  7. 使用只读文件系统:尽可能设置
    read_only: true
    ,并使用tmpfs存储临时数据,提升安全性。
  8. 移除不必要的权限:使用
    cap_drop: ALL
    ,仅添加必要的权限,遵循最小权限原则。
  9. 启用监控:部署监控和可观测性工具,跟踪应用健康状况和性能指标。
  10. 实施自动化备份:配置定期自动化备份及保留策略,并测试恢复流程。
  11. 使用内部网络:将后端网络标记为内部网络,防止数据库和缓存直接暴露到外部。
  12. 配置更新策略:定义更新和回滚配置,实现零停机部署。
  13. 实施资源预留:设置资源预留,为关键服务保证最低资源。
  14. 使用多阶段依赖:配置
    depends_on
    并结合健康检查条件,确保正确的启动顺序。
  15. 文档化配置:维护生产配置和部署流程的全面文档。

Common Pitfalls

常见陷阱

  1. Using Latest Tags: Using
    latest
    or unversioned images can cause unexpected behavior when images are updated; always pin versions.
  2. Ignoring Resource Limits: Not setting resource limits can allow one service to consume all available resources and crash others.
  3. Missing Health Checks: Without health checks, Docker cannot determine if services are actually ready or need to be restarted.
  4. Storing Secrets in Plain Text: Committing secrets to version control or storing them in environment variables exposes sensitive data.
  5. Not Testing Backups: Creating backups without regularly testing restoration procedures leads to data loss during actual incidents.
  6. Exposing Unnecessary Ports: Publishing all service ports to the host increases attack surface; only expose what's needed.
  7. Running as Root: Not specifying a non-root user leaves containers vulnerable to privilege escalation attacks.
  8. Ignoring Log Rotation: Without log rotation, logs can fill up disk space and crash services or hosts.
  9. Missing Monitoring: Deploying without monitoring makes it impossible to detect and diagnose issues before they impact users.
  10. Not Using Networks: Running all services on the default network prevents proper segmentation and increases security risk.
  11. Forgetting Readiness Checks: Starting dependent services before dependencies are ready causes connection failures and restarts.
  12. Hardcoding Configuration: Embedding environment-specific values in the compose file makes it difficult to deploy to multiple environments.
  13. Neglecting Security Updates: Not regularly updating base images leaves services vulnerable to known security issues.
  14. Insufficient Start Period: Setting health check start periods too short causes false positives during slow application startup.
  15. Not Planning for Scale: Designing services without considering horizontal scaling makes it difficult to handle increased load.
  1. 使用Latest标签:使用
    latest
    或无版本镜像会在镜像更新时导致意外行为;始终固定版本。
  2. 忽略资源限制:不设置资源限制会导致单个服务消耗所有可用资源,导致其他服务崩溃。
  3. 缺少健康检查:没有健康检查,Docker无法判断服务是否真正就绪或需要重启。
  4. 明文存储密钥:将密钥提交到版本控制或存储在环境变量中会暴露敏感数据。
  5. 不测试备份:仅创建备份而不定期测试恢复流程,会在实际事故中导致数据丢失。
  6. 暴露不必要的端口:将所有服务端口发布到主机增加攻击面;仅暴露必要的端口。
  7. 以Root用户运行:不指定非Root用户会使容器面临权限提升攻击的风险。
  8. 忽略日志轮转:没有日志轮转,日志会占满磁盘空间,导致服务或主机崩溃。
  9. 缺少监控:部署时不配置监控,无法在影响用户前检测和诊断问题。
  10. 不使用网络:所有服务运行在默认网络上会导致无法正确分段,增加安全风险。
  11. 忘记就绪检查:在依赖项就绪前启动依赖服务会导致连接失败和重启。
  12. 硬编码配置:在Compose文件中嵌入环境特定值会导致难以部署到多环境。
  13. 忽略安全更新:不定期更新基础镜像会使服务面临已知安全漏洞的风险。
  14. 启动周期不足:健康检查启动周期设置过短会在应用启动缓慢时导致误报。
  15. 未规划扩展:设计服务时不考虑水平扩展会导致难以处理增长的负载。

Resources

资源

Official Documentation

官方文档

Deployment Guides

部署指南

Tools and Images

工具与镜像

Monitoring

监控