Master Kubernetes observability with Prometheus, Grafana, ELK Stack, and distributed tracing. Build complete monitoring and logging solutions for production workloads.
Observability in Kubernetes rests on three pillars:

| Pillar | What it captures |
|---|---|
| Metrics | Numerical data about system performance |
| Logs | Detailed records of events |
| Traces | Request flow through services |

The metrics pipeline: expose metrics → collect & store → visualize → notify.
The logging pipeline: generate logs → collect & forward → store & index → search & visualize.
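To make the three pillars concrete, here is roughly what each kind of signal looks like on the wire; the values below are purely illustrative:

# A metric: one Prometheus exposition-format sample
http_requests_total{method="GET",endpoint="/api",status="200"} 1027
# A log: one structured JSON line written to stdout
{"timestamp":"2024-01-01T12:00:00Z","level":"error","msg":"payment failed","trace_id":"4bf92f3577b34da6a3ce929d0e0e4736"}
# A trace: W3C context propagated between services as an HTTP header
traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01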
| Stack | Components | Use Case | Strengths |
|---|---|---|---|
| Prometheus Stack | Prometheus + Grafana + Alertmanager | Metrics & monitoring | Native K8s support, powerful queries |
| ELK Stack | Elasticsearch + Logstash + Kibana | Log aggregation | Full-text search, rich visualizations |
| EFK Stack | Elasticsearch + Fluentd + Kibana | K8s logging | Cloud-native, lightweight |
| Loki Stack | Loki + Promtail + Grafana | Lightweight logging | Cost-effective, Prometheus-like |
| Jaeger | Jaeger + OpenTelemetry | Distributed tracing | End-to-end tracing, OpenTracing support |
# Add Prometheus community Helm repo
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
# Install kube-prometheus-stack (includes Prometheus, Grafana, Alertmanager)
helm install prometheus prometheus-community/kube-prometheus-stack \
--namespace monitoring \
--create-namespace \
--set prometheus.prometheusSpec.retention=30d \
--set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi \
--set grafana.adminPassword=admin123 \
--set alertmanager.alertmanagerSpec.storage.volumeClaimTemplate.spec.resources.requests.storage=10Gi
# Verify installation
kubectl get pods -n monitoring
kubectl get svc -n monitoring
# Port-forward to access UIs
kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-alertmanager 9093:9093
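With the port-forwards in place, a quick sanity check against the Prometheus HTTP API confirms the server is healthy and scraping targets (jq is optional but convenient):

# Health and readiness endpoints
curl -s http://localhost:9090/-/healthy
curl -s http://localhost:9090/-/ready
# List active scrape targets and their health
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
# Grafana is at http://localhost:3000 (user admin, password set via grafana.adminPassword above)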
# Application with Prometheus metrics endpoint
apiVersion: apps/v1
kind: Deployment
metadata:
name: sample-app
spec:
replicas: 3
selector:
matchLabels:
app: sample-app
template:
metadata:
labels:
app: sample-app
spec:
containers:
- name: app
image: myapp:latest
ports:
- containerPort: 8080
name: http
- containerPort: 9090
name: metrics
env:
- name: METRICS_PORT
value: "9090"
---
apiVersion: v1
kind: Service
metadata:
name: sample-app
labels:
app: sample-app
spec:
selector:
app: sample-app
ports:
- name: http
port: 8080
targetPort: 8080
- name: metrics
port: 9090
targetPort: 9090
---
# ServiceMonitor to scrape metrics
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: sample-app-monitor
labels:
app: sample-app
prometheus: kube-prometheus
spec:
selector:
matchLabels:
app: sample-app
endpoints:
- port: metrics
interval: 30s
path: /metrics
honorLabels: true
relabelings:
- sourceLabels: [__meta_kubernetes_pod_name]
targetLabel: pod
- sourceLabels: [__meta_kubernetes_pod_node_name]
targetLabel: node
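A common gotcha: kube-prometheus-stack's Prometheus only discovers ServiceMonitors whose labels match its serviceMonitorSelector, which by default is the Helm release label (release: prometheus for the install above). If the sample-app target never appears, either label the ServiceMonitor to match or set serviceMonitorSelectorNilUsesHelmValues=false as shown in the exercise later. For example (release name assumed from the install above):

# Inspect which labels the Prometheus CR selects on
kubectl -n monitoring get prometheus -o jsonpath='{.items[0].spec.serviceMonitorSelector}'
# Label the ServiceMonitor so it is selected
kubectl label servicemonitor sample-app-monitor release=prometheus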
package main
import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)
var (
// Counter metric
requestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "endpoint", "status"},
)
// Gauge metric
activeConnections = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "active_connections",
Help: "Number of active connections",
},
)
// Histogram metric
requestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request latencies in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "endpoint"},
)
// Summary metric
requestSize = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "http_request_size_bytes",
Help: "HTTP request sizes in bytes",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"method"},
)
)
func init() {
// Register metrics
prometheus.MustRegister(requestsTotal)
prometheus.MustRegister(activeConnections)
prometheus.MustRegister(requestDuration)
prometheus.MustRegister(requestSize)
}
func main() {
    // Expose the metrics endpoint on its own port (matches the "metrics"
    // container port and METRICS_PORT in the Deployment above)
    go func() {
        mux := http.NewServeMux()
        mux.Handle("/metrics", promhttp.Handler())
        log.Fatal(http.ListenAndServe(":9090", mux))
    }()

    // Example usage in handlers
    http.HandleFunc("/api", func(w http.ResponseWriter, r *http.Request) {
        timer := prometheus.NewTimer(requestDuration.WithLabelValues(r.Method, "/api"))
        defer timer.ObserveDuration()

        requestsTotal.WithLabelValues(r.Method, "/api", "200").Inc()
        requestSize.WithLabelValues(r.Method).Observe(float64(r.ContentLength))

        w.WriteHeader(http.StatusOK)
        w.Write([]byte("OK"))
    })

    log.Fatal(http.ListenAndServe(":8080", nil))
}
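For a quick local check of the instrumented service (with /metrics listening on :9090 as wired into the Deployment above), hit the handler and then scrape the endpoint; the sample output line is illustrative:

# Generate a request, then inspect the counter
curl -s http://localhost:8080/api
curl -s http://localhost:9090/metrics | grep '^http_requests_total'
# http_requests_total{endpoint="/api",method="GET",status="200"} 1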
| PromQL query | What it shows |
|---|---|
| `rate(container_cpu_usage_seconds_total[5m]) * 100` | CPU usage percentage over 5 minutes |
| `container_memory_working_set_bytes / container_spec_memory_limit_bytes * 100` | Memory usage percentage |
| `sum(rate(http_requests_total[5m])) by (service)` | Requests per second by service |
| `sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))` | 5xx error percentage |
| `histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))` | 95th percentile latency |
| `increase(kube_pod_container_status_restarts_total[1h])` | Container restarts in last hour |
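Any of these expressions can also be run ad hoc against the Prometheus HTTP query API, which is handy for scripting checks (assumes the port-forward on 9090 from earlier):

curl -s -G http://localhost:9090/api/v1/query \
  --data-urlencode 'query=sum(rate(http_requests_total[5m])) by (service)' | jq '.data.result'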
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: app-alerts
namespace: monitoring
labels:
prometheus: kube-prometheus
spec:
groups:
- name: app.rules
interval: 30s
rules:
# High CPU Usage
- alert: HighCPUUsage
expr: |
(sum(rate(container_cpu_usage_seconds_total[5m])) by (pod, namespace) * 100) > 80
for: 5m
labels:
severity: warning
component: pod
annotations:
summary: "High CPU usage on pod {{ $labels.pod }}"
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} CPU usage is above 80% (current value: {{ $value }}%)"
# High Memory Usage
- alert: HighMemoryUsage
expr: |
(container_memory_working_set_bytes / container_spec_memory_limit_bytes) * 100 > 90
for: 5m
labels:
severity: warning
annotations:
summary: "High memory usage on pod {{ $labels.pod }}"
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} memory usage is above 90%"
# Pod Crash Looping
- alert: PodCrashLooping
expr: |
increase(kube_pod_container_status_restarts_total[15m]) > 3
labels:
severity: critical
annotations:
summary: "Pod {{ $labels.pod }} is crash looping"
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value }} times in the last 15 minutes"
# High Error Rate
- alert: HighErrorRate
expr: |
(sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
/
sum(rate(http_requests_total[5m])) by (service)) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate for service {{ $labels.service }}"
description: "Service {{ $labels.service }} has error rate above 5% (current: {{ $value | humanizePercentage }})"
# Disk Space Low
- alert: DiskSpaceLow
expr: |
(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Low disk space on node {{ $labels.instance }}"
description: "Node {{ $labels.instance }} has less than 10% disk space available"
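After applying the PrometheusRule, confirm that the operator loaded it and watch for firing alerts. As with ServiceMonitors, the rule is only picked up if its labels match the Prometheus CR's ruleSelector (commonly the Helm release label):

kubectl -n monitoring get prometheusrule app-alerts
# Rule groups currently loaded by Prometheus
curl -s http://localhost:9090/api/v1/rules | jq '.data.groups[].name'
# Alerts currently pending or firing
curl -s http://localhost:9090/api/v1/alerts | jq '.data.alerts[] | {alert: .labels.alertname, state: .state}'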
{
"dashboard": {
"title": "Kubernetes Application Dashboard",
"panels": [
{
"id": 1,
"title": "CPU Usage",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$pod\"}[5m])) by (pod) * 100",
"legendFormat": "{{pod}}"
}
],
"yaxes": [
{
"format": "percent",
"label": "CPU Usage"
}
]
},
{
"id": 2,
"title": "Memory Usage",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\",pod=~\"$pod\"}) by (pod) / 1024 / 1024",
"legendFormat": "{{pod}}"
}
],
"yaxes": [
{
"format": "decmbytes",
"label": "Memory"
}
]
},
{
"id": 3,
"title": "Request Rate",
"type": "graph",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"expr": "sum(rate(http_requests_total{namespace=\"$namespace\"}[5m])) by (service, method)",
"legendFormat": "{{service}} - {{method}}"
}
]
},
{
"id": 4,
"title": "Error Rate",
"type": "singlestat",
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 8},
"targets": [
{
"expr": "sum(rate(http_requests_total{namespace=\"$namespace\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{namespace=\"$namespace\"}[5m])) * 100"
}
],
"format": "percent",
"thresholds": "1,5",
"colors": ["green", "yellow", "red"]
}
],
"templating": {
"list": [
{
"name": "namespace",
"type": "query",
"query": "label_values(kube_pod_info, namespace)",
"refresh": 1
},
{
"name": "pod",
"type": "query",
"query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)",
"refresh": 1,
"multi": true
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
}
}
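Besides ConfigMap provisioning (shown next), a dashboard JSON like this can be pushed straight through the Grafana HTTP API, which is convenient while iterating. The file name below is just an example; the payload must keep the top-level "dashboard" wrapper used above, and the credentials are the ones set during the Helm install:

curl -s -X POST http://admin:admin123@localhost:3000/api/dashboards/db \
  -H 'Content-Type: application/json' \
  -d @app-dashboard.json   # example file containing the {"dashboard": {...}} document above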
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
k8s-cluster-dashboard.json: |
{
"dashboard": {
"title": "Kubernetes Cluster Overview",
"panels": [...],
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"schemaVersion": 27,
"style": "dark",
"tags": ["kubernetes", "prometheus"],
"templating": {...},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"uid": "k8s-cluster-overview",
"version": 0
}
}
---
# Grafana sidecar configuration to auto-import dashboards
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-provider
namespace: monitoring
data:
provider.yaml: |
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
defaultFolderTitle: "General"
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
namespace: monitoring
data:
datasources.yaml: |
apiVersion: 1
datasources:
# Prometheus
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus-kube-prometheus-prometheus:9090
isDefault: true
editable: true
jsonData:
timeInterval: 30s
queryTimeout: 60s
httpMethod: POST
# Loki for logs
- name: Loki
type: loki
access: proxy
url: http://loki:3100
editable: true
jsonData:
maxLines: 1000
# Elasticsearch
- name: Elasticsearch
type: elasticsearch
access: proxy
url: http://elasticsearch:9200
      database: "[k8s-]YYYY.MM.DD"   # matches the Fluentd logstash_prefix ("k8s") configured below
jsonData:
esVersion: "7.10.0"
timeField: "@timestamp"
interval: Daily
logMessageField: message
logLevelField: level
# Jaeger for tracing
- name: Jaeger
type: jaeger
access: proxy
url: http://jaeger-query:16686
editable: true
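If Grafana came from kube-prometheus-stack, its provisioning sidecar can also auto-load this ConfigMap, but only when it carries the datasource discovery label (by default grafana_datasource: "1", analogous to the grafana_dashboard label used earlier); otherwise mount it under /etc/grafana/provisioning/datasources yourself. For example:

kubectl -n monitoring label configmap grafana-datasources grafana_datasource=1
# Confirm Grafana registered the data sources
curl -s http://admin:admin123@localhost:3000/api/datasources | jq '.[].name'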
# Elasticsearch StatefulSet
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: elasticsearch
namespace: logging
spec:
serviceName: elasticsearch
replicas: 3
selector:
matchLabels:
app: elasticsearch
template:
metadata:
labels:
app: elasticsearch
spec:
initContainers:
- name: init-sysctl
image: busybox
command:
- sysctl
- -w
- vm.max_map_count=262144
securityContext:
privileged: true
containers:
- name: elasticsearch
image: docker.elastic.co/elasticsearch/elasticsearch:7.15.0
ports:
- containerPort: 9200
name: rest
- containerPort: 9300
name: transport
env:
- name: cluster.name
value: k8s-logs
- name: node.name
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: discovery.seed_hosts
value: "elasticsearch-0.elasticsearch,elasticsearch-1.elasticsearch,elasticsearch-2.elasticsearch"
- name: cluster.initial_master_nodes
value: "elasticsearch-0,elasticsearch-1,elasticsearch-2"
- name: ES_JAVA_OPTS
value: "-Xms1g -Xmx1g"
- name: xpack.security.enabled
value: "false"
volumeMounts:
- name: data
mountPath: /usr/share/elasticsearch/data
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 30Gi
---
# Fluentd DaemonSet
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: fluentd
namespace: logging
spec:
selector:
matchLabels:
app: fluentd
template:
metadata:
labels:
app: fluentd
spec:
      serviceAccountName: fluentd
      tolerations:
      - key: node-role.kubernetes.io/control-plane
        effect: NoSchedule
      - key: node-role.kubernetes.io/master
        effect: NoSchedule
containers:
- name: fluentd
image: fluent/fluentd-kubernetes-daemonset:v1-debian-elasticsearch
env:
- name: FLUENT_ELASTICSEARCH_HOST
value: "elasticsearch.logging.svc.cluster.local"
- name: FLUENT_ELASTICSEARCH_PORT
value: "9200"
- name: FLUENT_ELASTICSEARCH_SCHEME
value: "http"
- name: FLUENT_ELASTICSEARCH_USER
value: ""
- name: FLUENT_ELASTICSEARCH_PASSWORD
value: ""
- name: FLUENT_ELASTICSEARCH_LOGSTASH_FORMAT
value: "true"
- name: FLUENT_ELASTICSEARCH_LOGSTASH_PREFIX
value: "k8s"
- name: FLUENT_ELASTICSEARCH_BUFFER_CHUNK_LIMIT_SIZE
value: "2M"
- name: FLUENT_ELASTICSEARCH_BUFFER_QUEUE_LIMIT_LENGTH
value: "32"
- name: FLUENT_ELASTICSEARCH_FLUSH_INTERVAL
value: "5s"
- name: FLUENT_ELASTICSEARCH_MAX_RETRY_WAIT
value: "30"
- name: FLUENT_ELASTICSEARCH_DISABLE_RETRY_LIMIT
value: "true"
volumeMounts:
- name: varlog
mountPath: /var/log
- name: varlibdockercontainers
mountPath: /var/lib/docker/containers
readOnly: true
- name: config
mountPath: /fluentd/etc
volumes:
- name: varlog
hostPath:
path: /var/log
- name: varlibdockercontainers
hostPath:
path: /var/lib/docker/containers
- name: config
configMap:
name: fluentd-config
---
# Kibana Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: kibana
namespace: logging
spec:
replicas: 1
selector:
matchLabels:
app: kibana
template:
metadata:
labels:
app: kibana
spec:
containers:
- name: kibana
image: docker.elastic.co/kibana/kibana:7.15.0
ports:
- containerPort: 5601
env:
- name: ELASTICSEARCH_HOSTS
value: "http://elasticsearch:9200"
- name: SERVER_NAME
value: "kibana"
- name: SERVER_HOST
value: "0.0.0.0"
apiVersion: v1
kind: ConfigMap
metadata:
name: fluentd-config
namespace: logging
data:
  fluent.conf: |
    # Input: tail container logs
    <source>
      @type tail
      path /var/log/containers/*.log
      pos_file /var/log/fluentd-containers.log.pos
      tag kubernetes.*
      read_from_head true
      <parse>
        @type json
        time_format %Y-%m-%dT%H:%M:%S.%NZ
      </parse>
    </source>

    # Filter: enrich records with Kubernetes metadata
    <filter kubernetes.**>
      @type kubernetes_metadata
      @id filter_kube_metadata
      kubernetes_url "#{ENV['FLUENT_FILTER_KUBERNETES_URL'] || 'https://' + ENV['KUBERNETES_SERVICE_HOST'] + ':' + ENV['KUBERNETES_SERVICE_PORT'] + '/api'}"
      verify_ssl "#{ENV['KUBERNETES_VERIFY_SSL'] || true}"
      ca_file "#{ENV['KUBERNETES_CA_FILE']}"
      skip_labels "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_LABELS'] || 'false'}"
      skip_container_metadata "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_CONTAINER_METADATA'] || 'false'}"
      skip_master_url "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_MASTER_URL'] || 'false'}"
      skip_namespace_metadata "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_NAMESPACE_METADATA'] || 'false'}"
    </filter>

    # Filter: parse application logs (JSON first, then a catch-all)
    <filter kubernetes.**>
      @type parser
      key_name log
      reserve_data true
      remove_key_name_field true
      <parse>
        @type multi_format
        <pattern>
          format json
        </pattern>
        <pattern>
          # catch-all regexp; adjust the capture groups to your log format
          format regexp
          expression /^(?<message>.*)$/
        </pattern>
        <pattern>
          format none
        </pattern>
      </parse>
    </filter>

    # Filter: add custom top-level fields from the Kubernetes metadata
    <filter kubernetes.**>
      @type record_transformer
      <record>
        cluster_name ${record["kubernetes"]["cluster_name"]}
        namespace ${record["kubernetes"]["namespace_name"]}
        pod_name ${record["kubernetes"]["pod_name"]}
        container_name ${record["kubernetes"]["container_name"]}
        host ${record["kubernetes"]["host"]}
        labels ${record["kubernetes"]["labels"]}
      </record>
    </filter>

    # Output: ship everything to Elasticsearch
    <match kubernetes.**>
      @type elasticsearch
      @id out_es
      @log_level info
      include_tag_key true
      host "#{ENV['FLUENT_ELASTICSEARCH_HOST']}"
      port "#{ENV['FLUENT_ELASTICSEARCH_PORT']}"
      path "#{ENV['FLUENT_ELASTICSEARCH_PATH']}"
      scheme "#{ENV['FLUENT_ELASTICSEARCH_SCHEME'] || 'http'}"
      ssl_verify "#{ENV['FLUENT_ELASTICSEARCH_SSL_VERIFY'] || 'true'}"
      ssl_version "#{ENV['FLUENT_ELASTICSEARCH_SSL_VERSION'] || 'TLSv1_2'}"
      user "#{ENV['FLUENT_ELASTICSEARCH_USER']}"
      password "#{ENV['FLUENT_ELASTICSEARCH_PASSWORD']}"
      logstash_format true
      logstash_prefix "#{ENV['FLUENT_ELASTICSEARCH_LOGSTASH_PREFIX'] || 'logstash'}"
      logstash_dateformat %Y.%m.%d
      include_timestamp true
      type_name _doc
      tag_key @log_name
      <buffer>
        @type memory
        flush_interval 5s
        chunk_limit_size 2M
        queue_limit_length 32
        retry_max_interval 30
        retry_forever true
      </buffer>
    </match>
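Once Fluentd is shipping logs, check that daily indices are appearing in Elasticsearch and that Kibana is reachable; the k8s- index prefix comes from FLUENT_ELASTICSEARCH_LOGSTASH_PREFIX above:

kubectl -n logging port-forward elasticsearch-0 9200:9200 &
curl -s 'http://localhost:9200/_cat/indices/k8s-*?v'
# Kibana UI (create an index pattern for k8s-*)
kubectl -n logging port-forward deploy/kibana 5601:5601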
# Install Loki Stack with Helm
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
# Install Loki with Promtail
helm install loki grafana/loki-stack \
--namespace logging \
--create-namespace \
--set grafana.enabled=false \
--set prometheus.enabled=false \
--set loki.persistence.enabled=true \
--set loki.persistence.size=10Gi \
--set promtail.enabled=true
# Verify installation
kubectl get pods -n logging
# Configure Grafana data source for Loki
# URL: http://loki.logging.svc.cluster.local:3100
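To verify that Promtail is shipping logs, Loki's HTTP API can be queried directly with LogQL (service name loki and port 3100 as installed by the chart above):

kubectl -n logging port-forward svc/loki 3100:3100 &
# Known label names, then the last few lines for a namespace
curl -s http://localhost:3100/loki/api/v1/labels | jq '.data'
curl -s -G http://localhost:3100/loki/api/v1/query_range \
  --data-urlencode 'query={namespace="monitoring"}' \
  --data-urlencode 'limit=10' | jq '.data.result[].values'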
# Jaeger Operator
kubectl create namespace observability
kubectl create -f https://github.com/jaegertracing/jaeger-operator/releases/download/v1.37.0/jaeger-operator.yaml -n observability
# Jaeger Instance
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
name: jaeger
namespace: observability
spec:
strategy: production
storage:
type: elasticsearch
options:
es:
        server-urls: http://elasticsearch.logging.svc.cluster.local:9200
index-prefix: jaeger
ingress:
enabled: true
agent:
strategy: DaemonSet
collector:
replicas: 2
autoscale: true
maxReplicas: 5
resources:
limits:
cpu: 1
memory: 1Gi
query:
replicas: 2
resources:
limits:
cpu: 500m
memory: 512Mi
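Once the operator reconciles the Jaeger CR, the production-strategy components run as separate Deployments, and the query Service is named <instance>-query (jaeger-query here):

kubectl -n observability get pods,svc
kubectl -n observability port-forward svc/jaeger-query 16686:16686
# Jaeger UI: http://localhost:16686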
package main
import (
"context"
"log"
"net/http"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/jaeger"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.4.0"
"go.opentelemetry.io/otel/trace"
)
func initTracer() func() {
// Create Jaeger exporter
exp, err := jaeger.New(jaeger.WithCollectorEndpoint(
        jaeger.WithEndpoint("http://jaeger-collector.observability.svc.cluster.local:14268/api/traces"),
))
if err != nil {
log.Fatal(err)
}
// Create trace provider
tp := sdktrace.NewTracerProvider(
sdktrace.WithBatcher(exp),
sdktrace.WithResource(resource.NewWithAttributes(
semconv.SchemaURL,
semconv.ServiceNameKey.String("my-service"),
semconv.ServiceVersionKey.String("1.0.0"),
attribute.String("environment", "production"),
)),
)
// Register trace provider
otel.SetTracerProvider(tp)
otel.SetTextMapPropagator(propagation.TraceContext{})
return func() {
if err := tp.Shutdown(context.Background()); err != nil {
log.Printf("Error shutting down tracer provider: %v", err)
}
}
}
func handleRequest(w http.ResponseWriter, r *http.Request) {
// Start a span
ctx := r.Context()
tracer := otel.Tracer("my-service")
ctx, span := tracer.Start(ctx, "handleRequest",
trace.WithAttributes(
attribute.String("http.method", r.Method),
attribute.String("http.url", r.URL.String()),
),
)
defer span.End()
// Simulate some work
processRequest(ctx)
// Add event to span
span.AddEvent("request processed",
trace.WithAttributes(
attribute.String("user", r.Header.Get("User-ID")),
),
)
w.WriteHeader(http.StatusOK)
w.Write([]byte("OK"))
}
func processRequest(ctx context.Context) {
tracer := otel.Tracer("my-service")
_, span := tracer.Start(ctx, "processRequest")
defer span.End()
// Simulate database call
dbCall(ctx)
// Simulate external API call
apiCall(ctx)
}
func dbCall(ctx context.Context) {
tracer := otel.Tracer("my-service")
_, span := tracer.Start(ctx, "database.query",
trace.WithAttributes(
attribute.String("db.system", "postgresql"),
attribute.String("db.statement", "SELECT * FROM users"),
),
)
defer span.End()
// Simulate DB operation
}
func apiCall(ctx context.Context) {
tracer := otel.Tracer("my-service")
_, span := tracer.Start(ctx, "http.client",
trace.WithAttributes(
attribute.String("http.url", "https://api.example.com/data"),
attribute.String("http.method", "GET"),
),
)
defer span.End()
// Simulate API call
}
func main() {
cleanup := initTracer()
defer cleanup()
http.HandleFunc("/", handleRequest)
log.Fatal(http.ListenAndServe(":8080", nil))
}
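This example uses the OpenTelemetry Go SDK together with the (now-deprecated) Jaeger exporter; a module setup along these lines pulls in everything referenced above (module path and versions are illustrative):

go mod init example.com/tracing-demo
go get go.opentelemetry.io/otel \
       go.opentelemetry.io/otel/sdk \
       go.opentelemetry.io/otel/exporters/jaeger
go run .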
# Enable tracing in Istio
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
metadata:
name: istio-control-plane
spec:
meshConfig:
defaultConfig:
proxyStatsMatcher:
inclusionRegexps:
- ".*outlier_detection.*"
- ".*circuit_breakers.*"
- ".*upstream_rq_retry.*"
- ".*upstream_rq_pending.*"
- ".*_cx_.*"
    extensionProviders:
    - name: jaeger
      # Jaeger's collector accepts Zipkin-format spans on 9411 (POSTed to /api/v2/spans)
      zipkin:
        service: jaeger-collector.observability.svc.cluster.local
        port: 9411
defaultProviders:
tracing:
- jaeger
values:
telemetry:
v2:
prometheus:
configOverride:
inboundSidecar:
disable_host_header_fallback: true
outboundSidecar:
disable_host_header_fallback: true
pilot:
traceSampling: 100.0 # 100% sampling for demo, use lower value in production
---
# Telemetry configuration
apiVersion: telemetry.istio.io/v1alpha1
kind: Telemetry
metadata:
name: default-tracing
namespace: istio-system
spec:
tracing:
- providers:
- name: jaeger
randomSamplingPercentage: 1.0 # 1% sampling in production
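To confirm the mesh is actually exporting spans, send some traffic through a sidecar-injected workload and ask the Jaeger query API which services it has seen; the workload name below is an assumption reused from the earlier example, and curl must be available in its image:

# Generate traffic through the mesh (any injected service will do)
kubectl exec deploy/sample-app -- curl -s http://sample-app:8080/api
# Services that have reported spans to Jaeger
kubectl -n observability port-forward svc/jaeger-query 16686:16686 &
curl -s http://localhost:16686/api/services | jq '.data'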
Deploy a full observability stack for a microservices application.
# Complete observability stack deployment
# 1. Namespace setup
kubectl create namespace monitoring
kubectl create namespace logging
kubectl create namespace tracing
# 2. Prometheus Stack (includes Grafana)
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install kube-prometheus prometheus-community/kube-prometheus-stack \
--namespace monitoring \
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
--set grafana.adminPassword=admin123
# 3. EFK Stack
# Elasticsearch
cat <<EOF | kubectl apply -f -
# ... (paste the Elasticsearch StatefulSet, Fluentd DaemonSet, and Kibana Deployment from the EFK section above)
EOF
Implement Service Level Objectives monitoring.
# SLO Monitoring with Prometheus
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: slo-rules
namespace: monitoring
spec:
groups:
- name: slo.rules
interval: 30s
rules:
    # Availability SLI (recorded over several windows for the burn-rate alerts below)
    - record: sli:availability:ratio_rate5m
      expr: |
        sum(rate(http_requests_total{status!~"5.."}[5m])) by (service)
        /
        sum(rate(http_requests_total[5m])) by (service)
    - record: sli:availability:ratio_rate30m
      expr: |
        sum(rate(http_requests_total{status!~"5.."}[30m])) by (service)
        /
        sum(rate(http_requests_total[30m])) by (service)
    - record: sli:availability:ratio_rate1h
      expr: |
        sum(rate(http_requests_total{status!~"5.."}[1h])) by (service)
        /
        sum(rate(http_requests_total[1h])) by (service)
# Latency SLI (P99)
- record: sli:latency:p99_5m
expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le)
)
# Error Budget
- record: error_budget:remaining
expr: |
1 - ((1 - sli:availability:ratio_rate5m) / (1 - 0.999))
# Multi-window alerts
- alert: SLOAvailabilityBreach
expr: |
(
sli:availability:ratio_rate5m < 0.999
AND
sli:availability:ratio_rate30m < 0.999
AND
sli:availability:ratio_rate1h < 0.999
)
for: 5m
labels:
severity: critical
slo: availability
annotations:
summary: "SLO Availability breach for {{ $labels.service }}"
description: "Service {{ $labels.service }} availability is {{ $value | humanizePercentage }}, below 99.9% SLO"
- alert: ErrorBudgetBurnRateHigh
expr: |
(
(1 - sli:availability:ratio_rate5m) * 2880 > 0.01
AND
(1 - sli:availability:ratio_rate1h) * 120 > 0.01
)
labels:
severity: warning
annotations:
summary: "High error budget burn rate for {{ $labels.service }}"
description: "Service {{ $labels.service }} is consuming error budget at {{ $value }}x normal rate"
---
# Grafana Dashboard for SLOs
apiVersion: v1
kind: ConfigMap
metadata:
name: slo-dashboard
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
slo-dashboard.json: |
{
"dashboard": {
"title": "SLO Dashboard",
"panels": [
{
"title": "Availability SLO",
"targets": [{
"expr": "sli:availability:ratio_rate5m * 100"
}],
"thresholds": [{
"value": 99.9,
"color": "green"
}, {
"value": 99.5,
"color": "yellow"
}, {
"value": 99,
"color": "red"
}]
},
{
"title": "Error Budget Remaining",
"targets": [{
"expr": "error_budget:remaining * 100"
}],
"type": "gauge",
"options": {
"minValue": 0,
"maxValue": 100
}
},
{
"title": "P99 Latency",
"targets": [{
"expr": "sli:latency:p99_5m"
}],
"yaxes": [{
"format": "s",
"label": "Latency"
}]
},
{
"title": "Error Rate",
"targets": [{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (service) / sum(rate(http_requests_total[5m])) by (service) * 100"
}],
"yaxes": [{
"format": "percent"
}]
}
]
}
}
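As a sanity check on the arithmetic: a 99.9% availability SLO allows an error ratio of 0.001, which over a 30-day window is about 0.001 × 30 × 24 × 60 ≈ 43 minutes of total unavailability; error_budget:remaining above expresses how much of that allowance is left as a fraction between 1 (untouched) and 0 (exhausted). The recorded series can be inspected directly once the rules are evaluating (assumes the Prometheus port-forward from earlier):

curl -s -G http://localhost:9090/api/v1/query \
  --data-urlencode 'query=error_budget:remaining' | jq '.data.result'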
Design observability for multi-cluster deployments.