# Kubernetes Troubleshooting
Systematic debugging workflows for Kubernetes issues including pod failures, resource problems, and networking. Use when debugging CrashLoopBackOff, OOMKilled, ImagePullBackOff, pod not starting, k8s issues, or any Kubernetes troubleshooting.
npx skill4agent add nik-kale/sre-skills kubernetes-troubleshooting

# Cluster overview
kubectl get nodes
kubectl get pods -A | grep -v Running
# Specific namespace
kubectl get pods -n <namespace>
kubectl get events -n <namespace> --sort-by='.lastTimestamp' | tail -20
# Resource usage
kubectl top nodes
kubectl top pods -n <namespace>

kubectl get pod <pod-name> -n <namespace> -o wide
kubectl describe pod <pod-name> -n <namespace>

| Symptom | Likely Cause | Go To Section |
|---|---|---|
| Pending | Scheduling issue | Scheduling Issues |
| CrashLoopBackOff | Application crash | CrashLoopBackOff |
| ImagePullBackOff | Image/registry issue | Image Pull Issues |
| OOMKilled | Memory exhaustion | OOMKilled |
| Running but not Ready | Health check failing | Readiness Issues |
| Error | Container error | Container Errors |
kubectl describe pod <pod-name> -n <namespace> | grep -A 10 Events

| Event Message | Cause | Fix |
|---|---|---|
| Insufficient cpu/memory | Not enough resources | Add nodes or reduce requests |
| node(s) had taints | Node taints | Add tolerations or remove taints |
| no nodes available | No matching nodes | Check node selector/affinity |
| persistentvolumeclaim not found | PVC missing | Create the PVC |
# Check resource requests vs available
kubectl describe nodes | grep -A 5 "Allocated resources"
# Check pending pod requests
kubectl get pod <pod> -o yaml | grep -A 10 resources

# Check container logs (current)
kubectl logs <pod-name> -n <namespace>
# Check previous container logs
kubectl logs <pod-name> -n <namespace> --previous
# Check exit code
kubectl describe pod <pod-name> -n <namespace> | grep -A 3 "Last State"

| Exit Code | Meaning | Common Cause |
|---|---|---|
| 0 | Success | Process completed (might be wrong for long-running) |
| 1 | Application error | Check application logs |
| 137 | SIGKILL (OOM) | Memory limit exceeded |
| 139 | SIGSEGV | Segmentation fault |
| 143 | SIGTERM | Graceful termination |
kubectl describe pod <pod-name> -n <namespace> | grep -A 5 Events

| Error | Cause | Fix |
|---|---|---|
| repository does not exist | Wrong image name | Fix image name/tag |
| unauthorized | Auth failure | Check imagePullSecrets |
| manifest unknown | Tag doesn't exist | Verify tag exists |
| connection refused | Registry unreachable | Check network/firewall |
# Create image pull secret
kubectl create secret docker-registry regcred \
--docker-server=<registry> \
--docker-username=<user> \
--docker-password=<password> \
-n <namespace>
# Reference in pod spec
spec:
imagePullSecrets:
  - name: regcred

kubectl describe pod <pod-name> -n <namespace> | grep -i oom
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 5 lastState

resources:
limits:
memory: '512Mi' # Increase this
requests:
    memory: '256Mi'

kubectl top pod <pod-name> -n <namespace> --containers

# Check readiness probe
kubectl describe pod <pod-name> -n <namespace> | grep -A 10 Readiness
# Check probe endpoint manually
kubectl exec <pod-name> -n <namespace> -- wget -qO- localhost:<port>/health

readinessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 10 # Give app time to start
periodSeconds: 5
timeoutSeconds: 3 # Increase if needed
  failureThreshold: 3

# Get detailed container status
kubectl get pod <pod-name> -n <namespace> -o jsonpath='{.status.containerStatuses[*]}'
# Check init containers
kubectl logs <pod-name> -n <namespace> -c <init-container-name>

# Check service endpoints
kubectl get endpoints <service-name> -n <namespace>
# Check service selector matches pod labels
kubectl get svc <service-name> -n <namespace> -o yaml | grep selector -A 5
kubectl get pods -n <namespace> --show-labels
# Test connectivity from another pod
kubectl run debug --rm -it --image=busybox -- wget -qO- <service>:<port>

# Check DNS resolution from pod
kubectl exec <pod> -n <namespace> -- nslookup <service-name>
kubectl exec <pod> -n <namespace> -- nslookup <service-name>.<namespace>.svc.cluster.local
# Check CoreDNS is running
kubectl get pods -n kube-system -l k8s-app=kube-dns

# Check node conditions
kubectl describe nodes | grep -A 5 Conditions
# Check node resource usage
kubectl top nodes
# Find resource-heavy pods
kubectl top pods -A --sort-by=memory | head -20

# Check PVC status
kubectl get pvc -n <namespace>
# Check PV status
kubectl get pv
# Describe for events
kubectl describe pvc <pvc-name> -n <namespace>

# Pod debugging
kubectl logs <pod> -n <ns> # Current logs
kubectl logs <pod> -n <ns> --previous # Previous container logs
kubectl logs <pod> -n <ns> -c <container> # Specific container
kubectl logs <pod> -n <ns> --tail=100 -f # Follow logs
# Interactive debugging
kubectl exec -it <pod> -n <ns> -- /bin/sh # Shell into container
kubectl exec <pod> -n <ns> -- env # Check environment
kubectl exec <pod> -n <ns> -- cat /etc/hosts # Check DNS
# Resource inspection
kubectl get pod <pod> -n <ns> -o yaml # Full pod spec
kubectl describe pod <pod> -n <ns> # Events and status
kubectl get events -n <ns> --sort-by='.lastTimestamp'
# Cluster-wide
kubectl get pods -A | grep -v Running # Non-running pods
kubectl top pods -A --sort-by=cpu # CPU usage
kubectl top pods -A --sort-by=memory # Memory usage