Loading...
Loading...
Guide for implementing Grafana Mimir - a horizontally scalable, highly available, multi-tenant TSDB for long-term storage of Prometheus metrics. Use when configuring Mimir on Kubernetes, setting up Azure/S3/GCS storage backends, troubleshooting authentication issues, or optimizing performance.
npx skill4agent add julianobarbosa/claude-code-skills mimir

Tenant ID header: `X-Scope-OrgID`

| Component | Purpose |
|---|---|
| Distributor | Validates requests, routes incoming metrics to ingesters via hash ring |
| Ingester | Stores time-series data in memory, flushes to object storage |
| Querier | Executes PromQL queries from ingesters and store-gateways |
| Query Frontend | Caches query results, optimizes and splits queries |
| Query Scheduler | Manages per-tenant query queues for fairness |
| Store-Gateway | Provides access to historical metric blocks in object storage |
| Compactor | Consolidates and optimizes stored metric data blocks |
| Ruler | Evaluates recording and alerting rules (optional) |
| Alertmanager | Handles alert routing and deduplication (optional) |
Prometheus/OTel → Distributor → Ingester → Object Storage
↓
Hash Ring
(routes by series)

Query → Query Frontend → Query Scheduler → Querier
↓
Ingesters (recent)
↓
Store-Gateway (historical)

-target=all

# Using mimir-distributed Helm chart
distributor:
replicas: 3
ingester:
replicas: 3
zoneAwareReplication:
enabled: true
querier:
replicas: 3
query_frontend:
replicas: 2
query_scheduler:
replicas: 2
store_gateway:
replicas: 3
compactor:
replicas: 1

helm repo add grafana https://grafana.github.io/helm-charts
helm repo update

helm install mimir grafana/mimir-distributed \
--namespace monitoring \
--values values.yaml

| File | Purpose |
|---|---|
| `values.yaml` | Non-production testing with MinIO |
| `small.yaml` | ~1 million series (single replicas, not HA) |
| `large.yaml` | Production (~10 million series) |
# Deployment mode
mimir:
structuredConfig:
multitenancy_enabled: true
# Storage configuration
mimir:
structuredConfig:
common:
storage:
backend: azure # or s3, gcs
azure:
account_name: ${AZURE_STORAGE_ACCOUNT}
account_key: ${AZURE_STORAGE_KEY}
endpoint_suffix: blob.core.windows.net
blocks_storage:
azure:
container_name: mimir-blocks
alertmanager_storage:
azure:
container_name: mimir-alertmanager
ruler_storage:
azure:
container_name: mimir-ruler
# Distributor
distributor:
replicas: 3
resources:
requests:
cpu: 1
memory: 2Gi
limits:
memory: 4Gi
# Ingester
ingester:
replicas: 3
zoneAwareReplication:
enabled: true
persistentVolume:
enabled: true
size: 50Gi
resources:
requests:
cpu: 2
memory: 8Gi
limits:
memory: 16Gi
# Querier
querier:
replicas: 3
resources:
requests:
cpu: 1
memory: 2Gi
limits:
memory: 8Gi
# Query Frontend
query_frontend:
replicas: 2
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
memory: 2Gi
# Query Scheduler
query_scheduler:
replicas: 2
# Store Gateway
store_gateway:
replicas: 3
persistentVolume:
enabled: true
size: 20Gi
resources:
requests:
cpu: 500m
memory: 2Gi
limits:
memory: 8Gi
# Compactor
compactor:
replicas: 1
persistentVolume:
enabled: true
size: 50Gi
resources:
requests:
cpu: 1
memory: 4Gi
limits:
memory: 8Gi
# Gateway for external access
gateway:
enabledNonEnterprise: true
replicas: 2
# Monitoring
metaMonitoring:
serviceMonitor:
enabled: true

mimir:
structuredConfig:
common:
storage:
backend: azure
azure:
account_name: <storage-account-name>
# Option 1: Account Key (via environment variable)
account_key: ${AZURE_STORAGE_KEY}
# Option 2: User-Assigned Managed Identity
# user_assigned_id: <identity-client-id>
endpoint_suffix: blob.core.windows.net
blocks_storage:
azure:
container_name: mimir-blocks
alertmanager_storage:
azure:
container_name: mimir-alertmanager
ruler_storage:
azure:
container_name: mimir-ruler

mimir:
structuredConfig:
common:
storage:
backend: s3
s3:
endpoint: s3.us-east-1.amazonaws.com
region: us-east-1
access_key_id: ${AWS_ACCESS_KEY_ID}
secret_access_key: ${AWS_SECRET_ACCESS_KEY}
blocks_storage:
s3:
bucket_name: mimir-blocks
alertmanager_storage:
s3:
bucket_name: mimir-alertmanager
ruler_storage:
s3:
bucket_name: mimir-ruler

mimir:
structuredConfig:
common:
storage:
backend: gcs
gcs:
service_account: ${GCS_SERVICE_ACCOUNT_JSON}
blocks_storage:
gcs:
bucket_name: mimir-blocks
alertmanager_storage:
gcs:
bucket_name: mimir-alertmanager
ruler_storage:
gcs:
bucket_name: mimir-ruler

mimir:
structuredConfig:
limits:
# Ingestion limits
ingestion_rate: 25000 # Samples/sec per tenant
ingestion_burst_size: 50000 # Burst size
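max_global_series_per_metric: 10000
# Mimir only supports the global series limits; the per-ingester max_series_per_user / max_series_per_metric options from Cortex are not valid
max_global_series_per_user: 1000000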
max_label_names_per_series: 30
max_label_name_length: 1024
max_label_value_length: 2048
# Query limits
max_fetched_series_per_query: 100000
max_fetched_chunks_per_query: 2000000
max_query_lookback: 0 # No limit
max_query_parallelism: 32
# Retention
compactor_blocks_retention_period: 365d # 1 year
# Out-of-order samples
out_of_order_time_window: 5m

# runtime-config.yaml
overrides:
tenant1:
ingestion_rate: 50000
max_global_series_per_user: 2000000
compactor_blocks_retention_period: 730d # 2 years
tenant2:
ingestion_rate: 75000
max_global_series_per_user: 5000000

mimir:
structuredConfig:
runtime_config:
file: /etc/mimir/runtime-config.yaml
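period: 10s

mimir:
  structuredConfig:
    limits:
      # HA deduplication is switched on per tenant via limits; the cluster/replica label names live here, not under ha_tracker
      accept_ha_samples: true
      ha_cluster_label: cluster
      ha_replica_label: __replica__
    distributor:
      ha_tracker:
        enable_ha_tracker: true
        kvstore:
          store: memberlist
    memberlist:
      join_members:
        - mimir-gossip-ring.monitoring.svc.cluster.local:7946

global: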
external_labels:
cluster: prom-team1
__replica__: replica1
remote_write:
- url: http://mimir-gateway:8080/api/v1/push
headers:
X-Scope-OrgID: my-tenant

ingester:
zoneAwareReplication:
enabled: true
zones:
- name: zone-a
nodeSelector:
topology.kubernetes.io/zone: us-east-1a
- name: zone-b
nodeSelector:
topology.kubernetes.io/zone: us-east-1b
- name: zone-c
nodeSelector:
topology.kubernetes.io/zone: us-east-1c
store_gateway:
zoneAwareReplication:
enabled: true

mimir:
structuredConfig:
limits:
# Write path
ingestion_tenant_shard_size: 3
# Read path
max_queriers_per_tenant: 5
store_gateway_tenant_shard_size: 3

exporters:
otlphttp:
endpoint: http://mimir-gateway:8080/otlp
headers:
X-Scope-OrgID: "my-tenant"
service:
pipelines:
metrics:
receivers: [otlp]
exporters: [otlphttp]

// Go SDK configuration
Aggregation: metric.AggregationBase2ExponentialHistogram{
MaxSize: 160, // Maximum buckets
MaxScale: 20, // Scale factor
}

mimir:
structuredConfig:
multitenancy_enabled: true
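no_auth_tenant: anonymous # Used when multitenancy disabled

curl -H "X-Scope-OrgID: tenant-a" \
"http://mimir:8080/prometheus/api/v1/query?query=up"

Tenant IDs may contain alphanumeric characters and the special characters `!-_.*'()`. The IDs `.`, `..`, and `__mimir_cluster` are not allowed.

# Prometheus remote write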
POST /api/v1/push
# OTLP metrics
POST /otlp/v1/metrics
# InfluxDB line protocol
POST /api/v1/push/influx/write

# Instant query
GET,POST /prometheus/api/v1/query?query=<promql>&time=<timestamp>
# Range query
GET,POST /prometheus/api/v1/query_range?query=<promql>&start=<start>&end=<end>&step=<step>
# Labels
GET,POST /prometheus/api/v1/labels
GET /prometheus/api/v1/label/{name}/values
# Series
GET,POST /prometheus/api/v1/series
# Exemplars
GET,POST /prometheus/api/v1/query_exemplars
# Cardinality
GET,POST /prometheus/api/v1/cardinality/label_names
GET,POST /prometheus/api/v1/cardinality/active_series

# Flush ingester data
GET,POST /ingester/flush
# Prepare shutdown
GET,POST,DELETE /ingester/prepare-shutdown
# Ring status
GET /ingester/ring
GET /distributor/ring
GET /store-gateway/ring
GET /compactor/ring
# Tenant stats
GET /distributor/all_user_stats
GET /api/v1/user_stats
GET /api/v1/user_limits

GET /ready
GET /metrics
GET /config
GET /config?mode=diff
GET /runtime_config

az identity create \
--name mimir-identity \
--resource-group <rg>
IDENTITY_CLIENT_ID=$(az identity show --name mimir-identity --resource-group <rg> --query clientId -o tsv)
IDENTITY_PRINCIPAL_ID=$(az identity show --name mimir-identity --resource-group <rg> --query principalId -o tsv)

az vmss identity assign \
--resource-group <aks-node-rg> \
--name <vmss-name> \
--identities /subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.ManagedIdentity/userAssignedIdentities/mimir-identity

az role assignment create \
--role "Storage Blob Data Contributor" \
--assignee-object-id $IDENTITY_PRINCIPAL_ID \
--scope /subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.Storage/storageAccounts/<storage>

mimir:
structuredConfig:
common:
storage:
azure:
user_assigned_id: <IDENTITY_CLIENT_ID>

az identity federated-credential create \
--name mimir-federated \
--identity-name mimir-identity \
--resource-group <rg> \
--issuer <aks-oidc-issuer-url> \
--subject system:serviceaccount:monitoring:mimir \
--audiences api://AzureADTokenExchange

serviceAccount:
annotations:
azure.workload.identity/client-id: <IDENTITY_CLIENT_ID>
podLabels:
azure.workload.identity/use: "true"

# Create required containers
az storage container create --name mimir-blocks --account-name <storage>
az storage container create --name mimir-alertmanager --account-name <storage>
az storage container create --name mimir-ruler --account-name <storage>

# Verify RBAC assignment
az role assignment list --scope /subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.Storage/storageAccounts/<storage>
# Assign if missing
az role assignment create \
--role "Storage Blob Data Contributor" \
--assignee-object-id <principal-id> \
--scope <storage-scope>
# Restart pod to refresh token
kubectl delete pod -n monitoring <ingester-pod>

ingester:
resources:
limits:
memory: 16Gi # Increase memory

mimir:
structuredConfig:
querier:
timeout: 5m
max_concurrent: 20

mimir:
structuredConfig:
limits:
max_global_series_per_user: 5000000
max_global_series_per_metric: 50000

# Check pod status
kubectl get pods -n monitoring -l app.kubernetes.io/name=mimir
# Check ingester logs
kubectl logs -n monitoring -l app.kubernetes.io/component=ingester --tail=100
# Check distributor logs
kubectl logs -n monitoring -l app.kubernetes.io/component=distributor --tail=100
# Verify readiness
kubectl exec -it <mimir-pod> -n monitoring -- wget -qO- http://localhost:8080/ready
# Check ring status
kubectl port-forward svc/mimir-distributor 8080:8080 -n monitoring
curl http://localhost:8080/distributor/ring
# Check configuration
kubectl exec -it <mimir-pod> -n monitoring -- cat /etc/mimir/mimir.yaml
# Validate configuration before deployment
mimir -modules -config.file <path-to-config-file>

# Ingestion rate per tenant
sum by (user) (rate(cortex_distributor_received_samples_total[5m]))
# Series count per tenant
sum by (user) (cortex_ingester_memory_series)
# Query latency
histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{route=~"(prometheus|api_prom)_api_v1_query.*"}[5m])))
# Compactor status
cortex_compactor_runs_completed_total
cortex_compactor_runs_failed_total
# Store-gateway block sync
cortex_bucket_store_blocks_loaded

mimir:
structuredConfig:
ingester:
push_circuit_breaker:
enabled: true
request_timeout: 2s
failure_threshold_percentage: 10
cooldown_period: 10s
read_circuit_breaker:
enabled: true
request_timeout: 30s