Elite Talos Linux expert specializing in immutable Kubernetes OS, secure cluster deployment, machine configurations, talosctl CLI operations, upgrades, and production-grade security hardening. Expert in Talos 1.6+, secure boot, disk encryption, and zero-trust infrastructure. Use when deploying Talos clusters, configuring machine configs, troubleshooting node issues, or implementing security best practices.
npx skill4agent add martinholovsky/claude-skills-generator talos-os-expert

#!/bin/bash
# tests/validate-config.sh
set -e
# Test 1: Validate machine config schema
echo "Testing: Machine config validation..."
talosctl validate --config controlplane.yaml --mode metal
talosctl validate --config worker.yaml --mode metal
# Test 2: Verify required fields exist
echo "Testing: Required fields..."
yq '.machine.install.disk' controlplane.yaml | grep -q '/dev/'
yq '.cluster.network.podSubnets' controlplane.yaml | grep -q '10.244'
# Test 3: Security requirements
echo "Testing: Security configuration..."
yq '.machine.systemDiskEncryption.state.provider' controlplane.yaml | grep -q 'luks2'
echo "All validation tests passed!"# controlplane.yaml - Minimum viable configuration
machine:
  type: controlplane
  install:
    disk: /dev/sda
    image: ghcr.io/siderolabs/installer:v1.6.0
  network:
    hostname: cp-01
    interfaces:
      - interface: eth0
        dhcp: true
  systemDiskEncryption:
    state:
      provider: luks2
      keys:
        - slot: 0
          tpm: {}
cluster:
  network:
    podSubnets:
      - 10.244.0.0/16
    serviceSubnets:
      - 10.96.0.0/12

#!/bin/bash
# tests/health-check.sh
set -e
NODES="10.0.1.10,10.0.1.11,10.0.1.12"
# Test cluster health
echo "Testing: Cluster health..."
talosctl -n $NODES health --wait-timeout=5m
# Test etcd health
echo "Testing: etcd cluster..."
talosctl -n 10.0.1.10 etcd members
talosctl -n 10.0.1.10 etcd status
# Test Kubernetes components
echo "Testing: Kubernetes nodes..."
test "$(kubectl get nodes --no-headers | awk '$2 == "Ready"' | wc -l)" -eq 3
# Test all pods running
echo "Testing: System pods..."
kubectl get pods -n kube-system --no-headers | grep -v "Running\|Completed" && exit 1 || true
echo "All health checks passed!"#!/bin/bash
# tests/security-compliance.sh
set -e
NODE="10.0.1.10"
# Test disk encryption
echo "Testing: Disk encryption enabled..."
talosctl -n $NODE get disks -o yaml | grep -q 'encrypted: true'
# Test services are minimal
echo "Testing: Minimal services running..."
SERVICES=$(talosctl -n $NODE services | grep -c "Running")
if [ "$SERVICES" -gt 10 ]; then
echo "ERROR: Too many services running ($SERVICES)"
exit 1
fi
# Test no unauthorized mounts
echo "Testing: Mount points..."
talosctl -n $NODE mounts | grep -v '/dev/\|/sys/\|/proc/' | grep -q 'rw' && exit 1 || true
echo "All security compliance tests passed!"#!/bin/bash
# tests/full-verification.sh
# Run all test suites
./tests/validate-config.sh
./tests/health-check.sh
./tests/security-compliance.sh
# Verify etcd snapshot capability
echo "Testing: etcd snapshot..."
talosctl -n 10.0.1.10 etcd snapshot ./etcd-backup-test.snapshot
rm ./etcd-backup-test.snapshot
# Verify upgrade capability (dry-run)
echo "Testing: Upgrade dry-run..."
talosctl -n 10.0.1.10 upgrade --dry-run \
--image ghcr.io/siderolabs/installer:v1.6.1
echo "Full verification complete - ready for production!"talosctl gen configtalosctl logstalosctl healthtalosctl dmesgtalosctl interfacestalosctl routestalosctl etcd memberstalosctl etcd status# Generate cluster configuration with 3 control plane nodes
# Generate cluster configuration with 3 control plane nodes
talosctl gen config talos-prod-cluster https://10.0.1.10:6443 \
--with-secrets secrets.yaml \
--config-patch-control-plane @control-plane-patch.yaml \
--config-patch-worker @worker-patch.yaml
# Apply configuration to first control plane node
talosctl apply-config --insecure \
--nodes 10.0.1.10 \
--file controlplane.yaml
# Bootstrap etcd on first control plane
talosctl bootstrap --nodes 10.0.1.10 \
--endpoints 10.0.1.10 \
--talosconfig=./talosconfig
# Apply to additional control plane nodes
talosctl apply-config --insecure --nodes 10.0.1.11 --file controlplane.yaml
talosctl apply-config --insecure --nodes 10.0.1.12 --file controlplane.yaml
# Verify etcd cluster health
talosctl -n 10.0.1.10,10.0.1.11,10.0.1.12 etcd members
# Apply to worker nodes
for node in 10.0.1.20 10.0.1.21 10.0.1.22; do
talosctl apply-config --insecure --nodes $node --file worker.yaml
done
# Bootstrap Kubernetes and retrieve kubeconfig
talosctl kubeconfig --nodes 10.0.1.10 --force
# Verify cluster
kubectl get nodes
kubectl get pods -A
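After bootstrap it helps to pin default endpoints and nodes in the local talosconfig so later commands do not need -n and -e every time. A small sketch using the standard talosctl config subcommands:

# Set default endpoints and nodes for subsequent talosctl calls
talosctl config endpoint 10.0.1.10 10.0.1.11 10.0.1.12
talosctl config node 10.0.1.10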
# control-plane-patch.yaml
machine:
  network:
    hostname: cp-01
    interfaces:
      - interface: eth0
        dhcp: false
        addresses:
          - 10.0.1.10/24
        routes:
          - network: 0.0.0.0/0
            gateway: 10.0.1.1
        vip:
          ip: 10.0.1.100 # Virtual IP for control plane HA
      - interface: eth1
        dhcp: false
        addresses:
          - 192.168.1.10/24 # Management network
    nameservers:
      - 8.8.8.8
      - 1.1.1.1
    timeServers:
      - time.cloudflare.com
  install:
    disk: /dev/sda
    image: ghcr.io/siderolabs/installer:v1.6.0
    wipe: false
  kubelet:
    extraArgs:
      feature-gates: GracefulNodeShutdown=true
      rotate-server-certificates: true
    nodeIP:
      validSubnets:
        - 10.0.1.0/24 # Force kubelet to use cluster network
  files:
    - content: |
        [plugins."io.containerd.grpc.v1.cri"]
          enable_unprivileged_ports = true
      path: /etc/cri/conf.d/20-customization.part
      op: create
cluster:
  network:
    cni:
      name: none # Will install Cilium manually
    dnsDomain: cluster.local
    podSubnets:
      - 10.244.0.0/16
    serviceSubnets:
      - 10.96.0.0/12
  apiServer:
    certSANs:
      - 10.0.1.100
      - cp.talos.example.com
    extraArgs:
      audit-log-path: /var/log/kube-apiserver-audit.log
      audit-policy-file: /etc/kubernetes/audit-policy.yaml
      feature-gates: ServerSideApply=true
  controllerManager:
    extraArgs:
      bind-address: 0.0.0.0
  scheduler:
    extraArgs:
      bind-address: 0.0.0.0
  etcd:
    extraArgs:
      listen-metrics-urls: http://0.0.0.0:2381

# Merge patch with base config
talosctl gen config talos-prod https://10.0.1.100:6443 \
--config-patch-control-plane @control-plane-patch.yaml \
--output-types controlplane -o controlplane.yaml
# Apply to node
talosctl apply-config --nodes 10.0.1.10 --file controlplane.yaml
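The patch above sets the CNI to none, so a CNI must be installed once the API server is reachable. A minimal Cilium sketch via Helm; the chart value shown (ipam.mode) is illustrative and should be checked against the Cilium and Talos documentation for your versions:

# Install Cilium as the CNI (cluster was generated with cni: none)
helm repo add cilium https://helm.cilium.io/
helm repo update
helm install cilium cilium/cilium \
  --namespace kube-system \
  --set ipam.mode=kubernetes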
# Check current version
talosctl -n 10.0.1.10 version
# Plan upgrade (check what will change)
talosctl -n 10.0.1.10 upgrade --dry-run \
--image ghcr.io/siderolabs/installer:v1.6.1
# Upgrade control plane nodes one at a time
for node in 10.0.1.10 10.0.1.11 10.0.1.12; do
echo "Upgrading control plane node $node"
# Upgrade with preserve=true (keeps ephemeral data)
talosctl -n $node upgrade \
--image ghcr.io/siderolabs/installer:v1.6.1 \
--preserve=true \
--wait
# Wait for node to be ready
kubectl wait --for=condition=Ready node/$node --timeout=10m
# Verify etcd health
talosctl -n $node etcd members
# Brief pause before next node
sleep 30
done
# Upgrade worker nodes (can be done in parallel batches)
talosctl -n 10.0.1.20,10.0.1.21,10.0.1.22 upgrade \
--image ghcr.io/siderolabs/installer:v1.6.1 \
--preserve=true
# Verify cluster health
kubectl get nodes
talosctl -n 10.0.1.10 health --wait-timeout=10m
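If an upgraded node misbehaves, Talos keeps the previous boot entry. A hedged sketch of reverting a single node; verify the rollback behavior for your release before relying on it:

# Roll a node back to the previously installed Talos version
talosctl -n 10.0.1.11 rollback
# Re-check health after the node reboots into the old image
talosctl -n 10.0.1.11 health --wait-timeout=10m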
# disk-encryption-patch.yaml
machine:
  install:
    disk: /dev/sda
    wipe: true
    diskSelector:
      size: '>= 100GB'
      model: 'Samsung SSD*'
  systemDiskEncryption:
    state:
      provider: luks2
      keys:
        - slot: 0
          tpm: {} # Use TPM 2.0 for key sealing
      options:
        - no_read_workqueue
        - no_write_workqueue
    ephemeral:
      provider: luks2
      keys:
        - slot: 0
          tpm: {}
      cipher: aes-xts-plain64
      keySize: 512
      options:
        - no_read_workqueue
        - no_write_workqueue

# For non-TPM environments, use a static key:
# machine:
#   systemDiskEncryption:
#     state:
#       provider: luks2
#       keys:
#         - slot: 0
#           static:
#             passphrase: "your-secure-passphrase-from-vault"

# Generate config with encryption patch
talosctl gen config encrypted-cluster https://10.0.1.100:6443 \
--config-patch-control-plane @disk-encryption-patch.yaml \
--with-secrets secrets.yaml
# WARNING: This will wipe the disk during installation
talosctl apply-config --insecure --nodes 10.0.1.10 --file controlplane.yaml
# Verify encryption is active
talosctl -n 10.0.1.10 get encryptionconfig
talosctl -n 10.0.1.10 disks

# Generate configs for multiple clusters
talosctl gen config prod-us-east https://prod-us-east.example.com:6443 \
--with-secrets secrets-prod-us-east.yaml \
--output-types talosconfig \
-o talosconfig-prod-us-east
talosctl gen config prod-eu-west https://prod-eu-west.example.com:6443 \
--with-secrets secrets-prod-eu-west.yaml \
--output-types talosconfig \
-o talosconfig-prod-eu-west
# Merge contexts into single config
talosctl config merge talosconfig-prod-us-east
talosctl config merge talosconfig-prod-eu-west
# List available contexts
talosctl config contexts
# Switch between clusters
talosctl config context prod-us-east
talosctl -n 10.0.1.10 version
talosctl config context prod-eu-west
talosctl -n 10.10.1.10 version
# Use specific context without switching
talosctl --context prod-us-east -n 10.0.1.10 get members

# Check node health comprehensively
talosctl -n 10.0.1.10 health --server=false
# View system logs
talosctl -n 10.0.1.10 dmesg --tail
talosctl -n 10.0.1.10 logs kubelet
talosctl -n 10.0.1.10 logs etcd
talosctl -n 10.0.1.10 logs containerd
# Check service status
talosctl -n 10.0.1.10 services
talosctl -n 10.0.1.10 service kubelet status
talosctl -n 10.0.1.10 service etcd status
# Network diagnostics
talosctl -n 10.0.1.10 interfaces
talosctl -n 10.0.1.10 routes
talosctl -n 10.0.1.10 netstat --tcp --listening
# Disk and mount information
talosctl -n 10.0.1.10 disks
talosctl -n 10.0.1.10 mounts
# etcd diagnostics
talosctl -n 10.0.1.10 etcd members
talosctl -n 10.0.1.10 etcd status
talosctl -n 10.0.1.10 etcd alarm list
# Get machine configuration currently applied
talosctl -n 10.0.1.10 get machineconfig -o yaml
# Reset node (DESTRUCTIVE - use with caution)
# talosctl -n 10.0.1.10 reset --graceful --reboot
# Force reboot if node is unresponsive
# talosctl -n 10.0.1.10 reboot --mode=force
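When interactive debugging is not enough, a support bundle collects logs and resource state from a node in a single archive. A sketch; the exact flags and output file name may differ between talosctl versions:

# Collect a support bundle (writes an archive such as support.zip)
talosctl -n 10.0.1.10 support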
# .github/workflows/talos-apply.yml
name: Apply Talos Machine Configs
on:
  push:
    branches: [main]
    paths:
      - 'talos/clusters/**/*.yaml'
  pull_request:
    paths:
      - 'talos/clusters/**/*.yaml'
jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install talosctl
        run: |
          curl -sL https://talos.dev/install | sh
      - name: Validate machine configs
        run: |
          talosctl validate --config talos/clusters/prod/controlplane.yaml --mode metal
          talosctl validate --config talos/clusters/prod/worker.yaml --mode metal
  apply-staging:
    needs: validate
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment: staging
    steps:
      - uses: actions/checkout@v4
      - name: Configure talosctl
        run: |
          echo "${{ secrets.TALOS_CONFIG_STAGING }}" > /tmp/talosconfig
          # Persist TALOSCONFIG for the following steps
          echo "TALOSCONFIG=/tmp/talosconfig" >> "$GITHUB_ENV"
      - name: Apply control plane config
        run: |
          talosctl apply-config \
            --nodes 10.0.1.10,10.0.1.11,10.0.1.12 \
            --file talos/clusters/staging/controlplane.yaml \
            --mode=reboot
      - name: Wait for nodes
        run: |
          sleep 60
          talosctl -n 10.0.1.10 health --wait-timeout=10m
  apply-production:
    needs: apply-staging
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment: production
    steps:
      - uses: actions/checkout@v4
      - name: Apply production configs
        run: |
          # Apply to control plane with rolling update
          for node in 10.1.1.10 10.1.1.11 10.1.1.12; do
            talosctl apply-config --nodes $node \
              --file talos/clusters/prod/controlplane.yaml \
              --mode=reboot
            sleep 120 # Wait between control plane nodes
          done

# ✅ GOOD: Pinned installer image and registry mirrors
machine:
  install:
    disk: /dev/sda
    image: ghcr.io/siderolabs/installer:v1.6.0 # Use specific version, not latest
    wipe: false # Preserve data on upgrades
  # Pre-pull system extension images through local mirrors
  registries:
    mirrors:
      docker.io:
        endpoints:
          - https://registry-mirror.example.com # Local mirror
      ghcr.io:
        endpoints:
          - https://ghcr-mirror.example.com
    config:
      registry-mirror.example.com:
        tls:
          insecureSkipVerify: false # Always verify TLS

# ❌ BAD: Unpinned image and unnecessary wipe
machine:
  install:
    disk: /dev/sda
    image: ghcr.io/siderolabs/installer:latest # Don't use latest
    wipe: true # Unnecessary data loss on every change
  # No registry mirrors - slow pulls from internet

# ✅ GOOD: etcd quotas and kubelet resource reservations
cluster:
  etcd:
    extraArgs:
      quota-backend-bytes: "8589934592"  # 8GB quota
      auto-compaction-retention: "1000"  # Keep 1000 revisions
      snapshot-count: "10000"            # Snapshot every 10k txns
      heartbeat-interval: "100"          # 100ms heartbeat
      election-timeout: "1000"           # 1s election timeout
      max-snapshots: "5"                 # Keep 5 snapshots
      max-wals: "5"                      # Keep 5 WAL files
machine:
  kubelet:
    extraArgs:
      kube-reserved: cpu=200m,memory=512Mi
      system-reserved: cpu=200m,memory=512Mi
      eviction-hard: memory.available<500Mi,nodefs.available<10%
      image-gc-high-threshold: "85"
      image-gc-low-threshold: "80"
      max-pods: "110"
# ❌ BAD: No quotas or reservations
cluster:
  etcd: {} # No quotas - can fill disk
machine:
  kubelet: {} # No reservations - system can OOM

# ✅ GOOD: Kernel and sysctl tuning for high-throughput clusters
machine:
  sysctls:
    # Network performance
    net.core.somaxconn: "32768"
    net.core.netdev_max_backlog: "16384"
    net.ipv4.tcp_max_syn_backlog: "8192"
    net.ipv4.tcp_slow_start_after_idle: "0"
    net.ipv4.tcp_tw_reuse: "1"
    # Memory management
    vm.swappiness: "0"        # Disable swap
    vm.overcommit_memory: "1" # Allow overcommit
    vm.panic_on_oom: "0"      # Don't panic on OOM
    # File descriptors
    fs.file-max: "2097152"
    fs.inotify.max_user_watches: "1048576"
    fs.inotify.max_user_instances: "8192"
    # Conntrack for high connection counts
    net.netfilter.nf_conntrack_max: "1048576"
    net.nf_conntrack_max: "1048576"
  # Required kernel modules for container networking
  kernel:
    modules:
      - name: br_netfilter
      - name: overlay
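Applied sysctls and kernel modules can be confirmed from a running node with talosctl read; a small sketch assuming the values above were applied:

# Verify a tuned sysctl took effect on the node
talosctl -n 10.0.1.10 read /proc/sys/net/core/somaxconn
# Confirm required kernel modules are loaded
talosctl -n 10.0.1.10 read /proc/modules | grep -E 'br_netfilter|overlay'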
# ❌ BAD: Default sysctls and missing kernel modules
machine:
  sysctls: {} # Default limits may cause connection drops
  # Missing required kernel modules

# ✅ GOOD: Disk selection and encryption tuned for performance
machine:
  install:
    disk: /dev/sda
    diskSelector:
      size: '>= 120GB'
      type: ssd         # Prefer SSD for etcd
      model: 'Samsung*' # Target specific hardware
  # Encryption with performance options
  systemDiskEncryption:
    state:
      provider: luks2
      keys:
        - slot: 0
          tpm: {}
      options:
        - no_read_workqueue  # Improve read performance
        - no_write_workqueue # Improve write performance
    ephemeral:
      provider: luks2
      keys:
        - slot: 0
          tpm: {}
      cipher: aes-xts-plain64
      keySize: 256 # Balance security/performance
      options:
        - no_read_workqueue
        - no_write_workqueue
  # Configure disks for data workloads
  disks:
    - device: /dev/sdb
      partitions:
        - mountpoint: /var/lib/longhorn
          size: 0 # Use all remaining space

# ❌ BAD: No disk selector, destructive wipe, slow defaults
machine:
  install:
    disk: /dev/sda # No selector - might use slow HDD
    wipe: true     # Data loss risk
  systemDiskEncryption:
    state:
      provider: luks2
      cipher: aes-xts-plain64
      keySize: 512 # Slower than necessary
      # Missing performance options

# ✅ GOOD: Static addressing, jumbo frames, and IPVS kube-proxy
machine:
  network:
    interfaces:
      - interface: eth0
        dhcp: false
        addresses:
          - 10.0.1.10/24
        mtu: 9000 # Jumbo frames for cluster traffic
        routes:
          - network: 0.0.0.0/0
            gateway: 10.0.1.1
            metric: 100
    # Use performant DNS
    nameservers:
      - 10.0.1.1 # Local DNS resolver
      - 1.1.1.1  # Cloudflare as backup
cluster:
  network:
    cni:
      name: none # Install optimized CNI separately
    podSubnets:
      - 10.244.0.0/16
    serviceSubnets:
      - 10.96.0.0/12
  proxy:
    mode: ipvs # Better performance than iptables
    extraArgs:
      ipvs-scheduler: lc # Least connections

# ❌ BAD: DHCP addressing and default kube-proxy mode
machine:
  network:
    interfaces:
      - interface: eth0
        dhcp: true # Less predictable
        # No MTU optimization
cluster:
  proxy:
    mode: iptables # Slower for large clusters

#!/bin/bash
# tests/talos-config-tests.sh
# Validate all machine configs
validate_configs() {
for config in controlplane.yaml worker.yaml; do
echo "Validating $config..."
talosctl validate --config $config --mode metal || exit 1
done
}
# Test config generation is reproducible
test_reproducibility() {
talosctl gen config test-cluster https://10.0.1.100:6443 \
--with-secrets secrets.yaml \
--output-dir /tmp/gen1
talosctl gen config test-cluster https://10.0.1.100:6443 \
--with-secrets secrets.yaml \
--output-dir /tmp/gen2
# Configs should be identical (except timestamps)
diff <(yq 'del(.machine.time)' /tmp/gen1/controlplane.yaml) \
<(yq 'del(.machine.time)' /tmp/gen2/controlplane.yaml)
}
# Test secrets are properly encrypted
test_secrets_encryption() {
# Verify secrets file doesn't contain plaintext
if grep -q "BEGIN RSA PRIVATE KEY" secrets.yaml; then
echo "ERROR: Unencrypted secrets detected!"
exit 1
fi
}

#!/bin/bash
# tests/cluster-health-tests.sh
# Test all nodes are ready
test_nodes_ready() {
local expected_nodes=$1
local ready_nodes=$(kubectl get nodes --no-headers | awk '$2 == "Ready"' | wc -l)
if [ "$ready_nodes" -ne "$expected_nodes" ]; then
echo "ERROR: Expected $expected_nodes nodes, got $ready_nodes"
kubectl get nodes
exit 1
fi
}
# Test etcd cluster health
test_etcd_health() {
local nodes=$1
# Check all members present
local members=$(talosctl -n $nodes etcd members | grep -c "started")
if [ "$members" -ne 3 ]; then
echo "ERROR: Expected 3 etcd members, got $members"
exit 1
fi
# Check no alarms
local alarms=$(talosctl -n $nodes etcd alarm list 2>&1)
if [[ "$alarms" != *"no alarms"* ]]; then
echo "ERROR: etcd alarms detected: $alarms"
exit 1
fi
}
# Test critical system pods
test_system_pods() {
local failing=$(kubectl get pods -n kube-system --no-headers | \
grep -v "Running\|Completed" | wc -l)
if [ "$failing" -gt 0 ]; then
echo "ERROR: $failing system pods not running"
kubectl get pods -n kube-system | grep -v "Running\|Completed"
exit 1
fi
}

#!/bin/bash
# tests/upgrade-tests.sh
# Test upgrade dry-run
test_upgrade_dry_run() {
local node=$1
local new_image=$2
echo "Testing upgrade dry-run to $new_image..."
talosctl -n $node upgrade --dry-run --image $new_image || exit 1
}
# Test rollback capability
test_rollback_preparation() {
local node=$1
# Ensure we have previous image info
local current=$(talosctl -n $node version --short | grep "Tag:" | awk '{print $2}')
echo "Current version: $current"
# Verify etcd snapshot exists
talosctl -n $node etcd snapshot /tmp/pre-upgrade-backup.snapshot || exit 1
echo "Backup created successfully"
}
# Full upgrade test (for staging)
test_full_upgrade() {
local node=$1
local new_image=$2
# 1. Create backup
talosctl -n $node etcd snapshot /tmp/upgrade-backup.snapshot
# 2. Perform upgrade
talosctl -n $node upgrade --image $new_image --preserve=true --wait
# 3. Wait for node ready
kubectl wait --for=condition=Ready node/$node --timeout=10m
# 4. Verify health
talosctl -n $node health --wait-timeout=5m
}

#!/bin/bash
# tests/security-tests.sh
# Test disk encryption
test_disk_encryption() {
local node=$1
local encrypted=$(talosctl -n $node get disks -o yaml | grep -c 'encrypted: true')
if [ "$encrypted" -lt 1 ]; then
echo "ERROR: Disk encryption not enabled on $node"
exit 1
fi
}
# Test minimal services
test_minimal_services() {
local node=$1
local max_services=10
local running=$(talosctl -n $node services | grep -c "Running")
if [ "$running" -gt "$max_services" ]; then
echo "ERROR: Too many services ($running > $max_services) on $node"
talosctl -n $node services
exit 1
fi
}
# Test API access restrictions
test_api_access() {
local node=$1
# Should not be accessible from public internet
# This test assumes you're running from inside the network
timeout 5 talosctl -n $node version > /dev/null || {
echo "ERROR: Cannot access Talos API on $node"
exit 1
}
}
# Run all security tests
run_security_suite() {
local nodes="10.0.1.10 10.0.1.11 10.0.1.12"
for node in $nodes; do
echo "Running security tests on $node..."
test_disk_encryption $node
test_minimal_services $node
test_api_access $node
done
echo "All security tests passed!"
}

# Restrict Talos API access with certificates
machine:
  certSANs:
    - talos-api.example.com
  features:
    rbac: true # Enable RBAC for Talos API (v1.6+)

# Only authorized talosconfig files can access cluster
# Rotate certificates regularly
talosctl config add prod-cluster \
  --ca /path/to/ca.crt \
  --crt /path/to/admin.crt \
  --key /path/to/admin.key

machine:
  systemDiskEncryption:
    # Encrypt state partition (etcd, machine config)
    state:
      provider: luks2
      keys:
        - slot: 0
          tpm: {} # TPM 2.0 sealed key
        - slot: 1
          static:
            passphrase: "recovery-key-from-vault" # Fallback
    # Encrypt ephemeral partition (container images, logs)
    ephemeral:
      provider: luks2
      keys:
        - slot: 0
          tpm: {}

machine:
  install:
    disk: /dev/sda
  features:
    apidCheckExtKeyUsage: true
  # Custom secure boot certificates
  secureboot:
    enrollKeys:
      - /path/to/PK.auth
      - /path/to/KEK.auth
      - /path/to/db.auth

# Verify secure boot status from the kernel log
talosctl dmesg | grep secureboot

cluster:
  secretboxEncryptionSecret: "base64-encoded-32-byte-key"
  # Or use external KMS
  apiServer:
    extraArgs:
      encryption-provider-config: /etc/kubernetes/encryption-config.yaml
    extraVolumes:
      - name: encryption-config
        hostPath: /var/lib/kubernetes/encryption-config.yaml
        mountPath: /etc/kubernetes/encryption-config.yaml
        readonly: true
machine:
  files:
    - path: /var/lib/kubernetes/encryption-config.yaml
      permissions: 0600
      content: |
        apiVersion: apiserver.config.k8s.io/v1
        kind: EncryptionConfiguration
        resources:
          - resources:
              - secrets
            providers:
              - aescbc:
                  keys:
                    - name: key1
                      secret: <base64-encoded-secret>
              - identity: {}

cluster:
  network:
    cni:
      name: custom
      urls:
        - https://raw.githubusercontent.com/cilium/cilium/v1.14/install/kubernetes/quick-install.yaml
    # Pod and service network isolation
    podSubnets:
      - 10.244.0.0/16
    serviceSubnets:
      - 10.96.0.0/12
machine:
  network:
    # Separate management and cluster networks
    interfaces:
      - interface: eth0
        addresses:
          - 10.0.1.10/24 # Cluster network
      - interface: eth1
        addresses:
          - 192.168.1.10/24 # Management network (Talos API)

# ❌ BAD: Running bootstrap on multiple control plane nodes
talosctl bootstrap --nodes 10.0.1.10
talosctl bootstrap --nodes 10.0.1.11 # This will create a split-brain!
# ✅ GOOD: Bootstrap only once on first control plane
talosctl bootstrap --nodes 10.0.1.10
# Other nodes join automatically via machine config

# ❌ BAD: Not saving secrets during generation
talosctl gen config my-cluster https://10.0.1.100:6443
# ✅ GOOD: Always save secrets for future operations
talosctl gen config my-cluster https://10.0.1.100:6443 \
--with-secrets secrets.yaml
# Store secrets.yaml in encrypted vault (age, SOPS, Vault)
age -r <public-key> -o secrets.yaml.age secrets.yaml
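The encrypted bundle then has to be decrypted whenever configs are regenerated. A sketch using an age identity file (the key path is illustrative):

# Decrypt the secrets bundle before regenerating machine configs
age -d -i ~/.config/age/keys.txt -o secrets.yaml secrets.yaml.age
talosctl gen config my-cluster https://10.0.1.100:6443 --with-secrets secrets.yaml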
# ❌ BAD: Upgrading all control plane nodes at once
talosctl -n 10.0.1.10,10.0.1.11,10.0.1.12 upgrade --image ghcr.io/siderolabs/installer:v1.6.1
# ✅ GOOD: Sequential upgrade with validation
for node in 10.0.1.10 10.0.1.11 10.0.1.12; do
talosctl -n $node upgrade --image ghcr.io/siderolabs/installer:v1.6.1 --wait
kubectl wait --for=condition=Ready node/$node --timeout=10m
sleep 30
done

# ❌ RISKY: Using staged mode without a plan
talosctl apply-config --nodes 10.0.1.10 --file config.yaml --mode=staged
# ✅ BETTER: Understand mode implications
# - auto (default): Applies immediately, reboots if needed
# - no-reboot: Applies without reboot (use for config changes that don't require reboot)
# - reboot: Always reboots to apply changes
# - staged: Applies on next reboot (use for planned maintenance windows)
talosctl apply-config --nodes 10.0.1.10 --file config.yaml --mode=no-reboot
# Then manually reboot when ready
talosctl -n 10.0.1.10 reboot

# ❌ BAD: Applying config without validation
talosctl apply-config --nodes 10.0.1.10 --file config.yaml
# ✅ GOOD: Validate first
talosctl validate --config config.yaml --mode metal
# Check what will change
talosctl -n 10.0.1.10 get machineconfig -o yaml > current-config.yaml
diff current-config.yaml config.yaml
# Then apply
talosctl apply-config --nodes 10.0.1.10 --file config.yaml

# ❌ BAD: Using small root disk without etcd quota
machine:
  install:
    disk: /dev/sda # Only 32GB disk

# ✅ GOOD: Proper disk sizing and etcd quota
machine:
  install:
    disk: /dev/sda # Minimum 120GB recommended
  kubelet:
    extraArgs:
      eviction-hard: nodefs.available<10%,nodefs.inodesFree<5%
cluster:
  etcd:
    extraArgs:
      quota-backend-bytes: "8589934592" # 8GB quota
      auto-compaction-retention: "1000"
      snapshot-count: "10000"

# ❌ DANGEROUS: Talos API accessible from anywhere
machine:
  network:
    interfaces:
      - interface: eth0
        addresses:
          - 203.0.113.10/24 # Public IP
        # Talos API (50000) now exposed to internet!

# ✅ GOOD: Separate networks for management and cluster
machine:
  network:
    interfaces:
      - interface: eth0
        addresses:
          - 10.0.1.10/24 # Private cluster network
      - interface: eth1
        addresses:
          - 192.168.1.10/24 # Management network (firewalled)

# ❌ BAD: Upgrading production directly
talosctl -n prod-node upgrade --image ghcr.io/siderolabs/installer:v1.7.0
# ✅ GOOD: Test upgrade path
# 1. Upgrade staging environment
talosctl --context staging -n staging-node upgrade --image ghcr.io/siderolabs/installer:v1.7.0
# 2. Verify staging cluster health
kubectl --context staging get nodes
kubectl --context staging get pods -A
# 3. Run integration tests
# 4. Document any issues or manual steps required
# 5. Only then upgrade production with documented procedure

# Run complete verification suite
./tests/validate-config.sh
./tests/health-check.sh
./tests/security-compliance.sh
# Verify cluster state
talosctl -n <nodes> health --wait-timeout=5m
talosctl -n <nodes> etcd members
kubectl get nodes
kubectl get pods -A
# Create production backup
talosctl -n <control-plane> etcd snapshot ./pre-production-backup.snapshot
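The snapshot above is only useful if the restore path has been rehearsed. A hedged sketch of re-bootstrapping etcd from it after losing the control plane; verify the --recover-from flag against the Talos disaster-recovery documentation for your version:

# Re-bootstrap etcd from a snapshot on a fresh control plane node
talosctl -n 10.0.1.10 bootstrap --recover-from ./pre-production-backup.snapshot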