Loading...
Loading...
Grafana Cloud infrastructure monitoring — Kubernetes monitoring, cloud provider integrations (AWS, Azure, GCP), host and container monitoring, infrastructure dashboards, and collector setup. Use when setting up Kubernetes monitoring, connecting cloud provider metrics, configuring node exporter or cAdvisor, setting up infrastructure dashboards, or using the k8s-monitoring Helm chart.
# Install this skill for the agent.
npx skill4agent add grafana/skills infrastructure

# Register the Grafana Helm chart repository and refresh the local index.
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update

# values.yaml
# Helm values for the grafana/k8s-monitoring chart.
# NOTE(review): the nesting below was reconstructed from a flattened listing;
# confirm key paths (in particular basicAuth.password.*) against the values
# reference of the chart version you install.
cluster:
  name: production-us-east

# Grafana Cloud endpoints. Every password is read from the same Kubernetes
# secret (grafana-cloud-secret, key "api-key").
externalServices:
  prometheus:
    host: https://prometheus-prod-xx.grafana.net
    basicAuth:
      username: "123456"
      password:
        secretName: grafana-cloud-secret
        secretKey: api-key
  loki:
    host: https://logs-prod-xx.grafana.net
    basicAuth:
      username: "234567"
      password:
        secretName: grafana-cloud-secret
        secretKey: api-key
  tempo:
    host: https://tempo-prod-xx.grafana.net:443
    basicAuth:
      username: "345678"
      password:
        secretName: grafana-cloud-secret
        secretKey: api-key

metrics:
  enabled: true
  cost:
    enabled: true  # Kubernetes cost monitoring
  podMonitors:
    enabled: true
  serviceMonitors:
    enabled: true
  kube-state-metrics:
    enabled: true
  node-exporter:
    enabled: true
  cadvisor:
    enabled: true

logs:
  pod_logs:
    enabled: true
  cluster_events:
    enabled: true

traces:
  enabled: true

profiles:
  enabled: false

# Receivers: gRPC on 4317, HTTP on 4318.
receivers:
  grpc:
    enabled: true
    port: 4317
  http:
    enabled: true
    port: 4318

# Shell: the namespace must exist before a secret can be created in it
# (helm's --create-namespace only takes effect later, at install time).
kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -

# Quote the placeholder so the shell does not parse < and > as redirections.
kubectl create secret generic grafana-cloud-secret \
  --from-literal=api-key='<your-api-key>' \
  -n monitoring

helm install k8s-monitoring grafana/k8s-monitoring \
  -n monitoring --create-namespace \
  -f values.yaml

# CPU usage by pod
sum by (pod) (
  rate(container_cpu_usage_seconds_total{namespace="$namespace", container!=""}[5m])
)

# Memory usage by pod
sum by (pod) (
  container_memory_working_set_bytes{namespace="$namespace", container!=""}
)

# Node CPU pressure (1 minus the average idle-mode rate, per instance)
1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))

# Pod restarts (over the last hour)
increase(kube_pod_container_status_restarts_total[1h])

# Deployment readiness (ready replicas / desired replicas)
kube_deployment_status_replicas_ready / kube_deployment_spec_replicas

# PVC usage (used bytes / capacity bytes)
kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes

// Alloy config for AWS CloudWatch scraping
// Scrape the CloudWatch exporter at cloudwatch-exporter:9106 and forward the
// samples to the prometheus.remote_write "cloud" component, which is not
// defined in this listing.
prometheus.scrape "cloudwatch" {
  targets    = [{"__address__" = "cloudwatch-exporter:9106"}]
  forward_to = [prometheus.remote_write.cloud.receiver]
}

# provisioning/datasources/cloudwatch.yaml
# Grafana provisioning file for the CloudWatch data source.
apiVersion: 1
datasources:
  - name: CloudWatch
    type: cloudwatch
    jsonData:
      defaultRegion: us-east-1
      authType: default  # uses EC2 instance role / ECS task role
      # Or explicit access keys: switch authType and uncomment secureJsonData.
      # authType: keys
    # Only needed with "authType: keys"; leave commented out for role-based
    # auth. Avoid committing real keys to version control.
    # secureJsonData:
    #   accessKey: AKIAIOSFODNN7EXAMPLE
    #   secretKey: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY

# provisioning/datasources/azure.yaml
# Grafana provisioning file for the Azure Monitor data source
# (app-registration credentials: tenant, client ID, client secret).
# NOTE(review): confirm the accepted cloudName values for your Grafana version.
apiVersion: 1
datasources:
  - name: Azure Monitor
    type: grafana-azure-monitor-datasource
    jsonData:
      cloudName: AzureCloud
      tenantId: your-tenant-id
      clientId: your-client-id
    secureJsonData:
      clientSecret: your-client-secret

# provisioning/datasources/google.yaml
# Grafana provisioning file for the Google Cloud Monitoring data source.
apiVersion: 1
datasources:
  - name: Google Cloud Monitoring
    type: stackdriver
    jsonData:
      authenticationType: gce  # uses GCE metadata server
      # Or JWT: switch authenticationType and uncomment secureJsonData.
      # authenticationType: jwt
    # Only needed with "authenticationType: jwt"; leave commented out for GCE.
    # NOTE(review): Grafana's provisioning docs also list clientEmail,
    # tokenUri and defaultProject under jsonData for JWT, with privateKey
    # holding the PEM key from the service-account file — confirm before use.
    # secureJsonData:
    #   privateKey: |
    #     { "type": "service_account", ... }

// Alloy config for Linux host metrics
// Host metrics exporter, reading the filesystem rooted at "/".
prometheus.exporter.unix "host" {
  rootfs_path       = "/"
  enable_collectors = [
    "cpu",
    "diskstats",
    "filesystem",
    "loadavg",
    "meminfo",
    "netdev",
    "stat",
    "time",
    "uname",
  ]
}

// Scrape the exporter's targets every 60s and forward the samples to the
// prometheus.remote_write "cloud" component.
prometheus.scrape "node" {
  scrape_interval = "60s"
  targets         = prometheus.exporter.unix.host.targets
  forward_to      = [prometheus.remote_write.cloud.receiver]
}

// cAdvisor metrics via Alloy
// Scrape cAdvisor on localhost:8080 (/metrics) and forward the samples to the
// prometheus.remote_write "cloud" component.
prometheus.scrape "cadvisor" {
  targets      = [{"__address__" = "localhost:8080"}]
  metrics_path = "/metrics"
  forward_to   = [prometheus.remote_write.cloud.receiver]
}

// Docker container logs
// Discover containers through the local Docker socket ...
discovery.docker "containers" {
  host = "unix:///var/run/docker.sock"
}

// ... and ship the logs of the discovered containers to loki.write "cloud".
loki.source.docker "containers" {
  host       = "unix:///var/run/docker.sock"
  targets    = discovery.docker.containers.targets
  forward_to = [loki.write.cloud.receiver]
}

# Common infrastructure alert rules
# Prometheus-style alerting rule group for Kubernetes infrastructure.
groups:
  - name: kubernetes-alerts
    rules:
      # Fires when a container's restart counter has grown over the last 15m
      # (per-second rate scaled back up to the 15-minute window).
      - alert: PodCrashLooping
        expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash looping"

      # Fires when less than 10% of a node's memory is available.
      - alert: NodeMemoryPressure
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} low memory (<10% free)"

      # Fires when less than 10% of a volume's capacity is still available.
      - alert: PersistentVolumeAlmostFull
        expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} almost full"