feat: migrate observability stack to flux gitops
This commit is contained in:
43
README.md
43
README.md
@@ -174,6 +174,47 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
|
||||
| `SSH_PUBLIC_KEY` | SSH public key content |
|
||||
| `SSH_PRIVATE_KEY` | SSH private key content |
|
||||
|
||||
## GitOps (Flux)
|
||||
|
||||
This repo now includes a Flux GitOps layout for phased migration from imperative Ansible applies to continuous reconciliation.
|
||||
|
||||
### Repository layout
|
||||
|
||||
- `clusters/prod/`: cluster entrypoint and Flux reconciliation objects
|
||||
- `clusters/prod/flux-system/`: `GitRepository` source and top-level `Kustomization` graph
|
||||
- `infrastructure/`: infrastructure addon reconciliation graph
|
||||
- `infrastructure/addons/*`: per-addon manifests (observability + observability-content migrated)
|
||||
- `apps/`: application workload layer (currently scaffolded)
|
||||
|
||||
### Reconciliation graph
|
||||
|
||||
- `infrastructure` (top-level)
|
||||
- `addon-ccm`
|
||||
- `addon-csi` depends on `addon-ccm`
|
||||
- `addon-tailscale-operator`
|
||||
- `addon-observability`
|
||||
- `addon-observability-content` depends on `addon-observability`
|
||||
- `apps` depends on `infrastructure`
|
||||
|
||||
### Bootstrap notes
|
||||
|
||||
1. Install Flux controllers in `flux-system`.
|
||||
2. Create the Flux deploy key/secret named `flux-system` in the `flux-system` namespace.
|
||||
3. Apply `clusters/prod/flux-system/` once to establish source + reconciliation graph.
|
||||
4. Unsuspend addon `Kustomization` objects one-by-one as each addon is migrated from Ansible.
|
||||
|
||||
### Current migration status
|
||||
|
||||
- `addon-observability-content` is now GitOps-managed from `infrastructure/addons/observability-content/`.
|
||||
- `addon-observability` is now GitOps-managed from `infrastructure/addons/observability/` using Flux `HelmRelease` resources for:
|
||||
- `kube-prometheus-stack`
|
||||
- `loki`
|
||||
- `promtail`
|
||||
- Remaining addons stay suspended until migrated.
|
||||
- During transition, avoid applying Grafana content from both Flux and Ansible at the same time.
|
||||
|
||||
Ansible `site.yml` skips the `observability` and `observability-content` roles when `observability_gitops_enabled` is `true`, which is the default. Set it to `false` to have Ansible deploy them instead of Flux.
|
||||
|
||||
## Observability Stack
|
||||
|
||||
The Ansible playbook deploys a lightweight observability stack in the `observability` namespace:
|
||||
@@ -182,7 +223,7 @@ The Ansible playbook deploys a lightweight observability stack in the `observabi
|
||||
- `loki`
|
||||
- `promtail`
|
||||
|
||||
Grafana content is managed as code via ConfigMaps in `ansible/roles/observability-content/`.
|
||||
Grafana content is managed as code via ConfigMaps in `infrastructure/addons/observability-content/` (Flux), migrated from `ansible/roles/observability-content/`.
|
||||
|
||||
Services are kept internal by default, with optional declarative Tailscale exposure when the Tailscale Kubernetes Operator is healthy.
|
||||
|
||||
|
||||
@@ -99,6 +99,7 @@
|
||||
# Imperative fallback for the observability stack. The role only runs when
# GitOps management is switched off (observability_gitops_enabled=false);
# an undefined variable is treated as true, i.e. the role is skipped.
- name: Deploy observability stack
  hosts: control_plane[0]
  become: true

  roles:
    # `when` is not a valid play-level keyword (Ansible rejects the play),
    # so the guard is attached to the role instead.
    - role: observability
      when: not (observability_gitops_enabled | default(true) | bool)
|
||||
@@ -106,6 +107,7 @@
|
||||
# Imperative fallback for Grafana datasources/dashboards. The role only runs
# when GitOps management is switched off (observability_gitops_enabled=false);
# an undefined variable is treated as true, i.e. the role is skipped.
- name: Provision Grafana content
  hosts: control_plane[0]
  become: true

  roles:
    # `when` is not a valid play-level keyword (Ansible rejects the play),
    # so the guard is attached to the role instead.
    - role: observability-content
      when: not (observability_gitops_enabled | default(true) | bool)
|
||||
|
||||
3
apps/kustomization.yaml
Normal file
3
apps/kustomization.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
---
# Kustomize entrypoint for the application layer (./apps).
# No workloads are listed yet, so the build renders nothing.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
|
||||
12
clusters/prod/flux-system/gitrepository-platform.yaml
Normal file
12
clusters/prod/flux-system/gitrepository-platform.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
---
# Flux source: the platform repository, tracked on the main branch over SSH.
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: platform
  namespace: flux-system
spec:
  url: ssh://git@tea.michaelfisher.tech/HomeInfra/HetznerTerra.git
  ref:
    branch: main
  # Credentials come from the Secret named flux-system in this namespace.
  secretRef:
    name: flux-system
  # How often the source is checked for new commits.
  interval: 1m
|
||||
17
clusters/prod/flux-system/kustomization-apps.yaml
Normal file
17
clusters/prod/flux-system/kustomization-apps.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
---
# Flux Kustomization for the application layer.
# Reconciles ./apps from the platform source after `infrastructure`.
# Currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: apps
  namespace: flux-system
spec:
  suspend: true
  dependsOn:
    - name: infrastructure
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./apps
  interval: 10m
  timeout: 5m
  wait: true
  prune: true
|
||||
14
clusters/prod/flux-system/kustomization-infrastructure.yaml
Normal file
14
clusters/prod/flux-system/kustomization-infrastructure.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
---
# Top-level Flux Kustomization for the infrastructure layer.
# Reconciles ./infrastructure from the platform source; it is not suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: infrastructure
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure
  interval: 10m
  timeout: 5m
  wait: true
  prune: true
|
||||
6
clusters/prod/flux-system/kustomization.yaml
Normal file
6
clusters/prod/flux-system/kustomization.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
---
# Kustomize bundle for the Flux bootstrap objects of the prod cluster:
# the Git source plus the two top-level Flux Kustomizations.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - gitrepository-platform.yaml
  - kustomization-infrastructure.yaml
  - kustomization-apps.yaml
|
||||
4
clusters/prod/kustomization.yaml
Normal file
4
clusters/prod/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
---
# Cluster entrypoint for prod: pulls in the flux-system directory.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - flux-system
|
||||
3
infrastructure/addons/ccm/kustomization.yaml
Normal file
3
infrastructure/addons/ccm/kustomization.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
---
# Kustomize entrypoint for the ccm addon.
# The resource list is empty, so the build renders nothing.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
|
||||
3
infrastructure/addons/csi/kustomization.yaml
Normal file
3
infrastructure/addons/csi/kustomization.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
---
# Kustomize entrypoint for the csi addon.
# The resource list is empty, so the build renders nothing.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
|
||||
15
infrastructure/addons/kustomization-ccm.yaml
Normal file
15
infrastructure/addons/kustomization-ccm.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
---
# Flux Kustomization for the ccm addon.
# Points at ./infrastructure/addons/ccm; currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-ccm
  namespace: flux-system
spec:
  suspend: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/ccm
  interval: 10m
  timeout: 5m
  wait: true
  prune: true
|
||||
17
infrastructure/addons/kustomization-csi.yaml
Normal file
17
infrastructure/addons/kustomization-csi.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
---
# Flux Kustomization for the csi addon.
# Points at ./infrastructure/addons/csi and is ordered after addon-ccm;
# currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-csi
  namespace: flux-system
spec:
  suspend: true
  dependsOn:
    - name: addon-ccm
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/csi
  interval: 10m
  timeout: 5m
  wait: true
  prune: true
|
||||
@@ -0,0 +1,17 @@
|
||||
---
# Flux Kustomization for Grafana content (datasources and dashboards).
# Points at ./infrastructure/addons/observability-content and is ordered
# after addon-observability; actively reconciled (not suspended).
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability-content
  namespace: flux-system
spec:
  suspend: false
  dependsOn:
    - name: addon-observability
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability-content
  interval: 10m
  timeout: 5m
  wait: true
  prune: true
|
||||
15
infrastructure/addons/kustomization-observability.yaml
Normal file
15
infrastructure/addons/kustomization-observability.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
---
# Flux Kustomization for the observability stack.
# Points at ./infrastructure/addons/observability; actively reconciled
# (not suspended).
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability
  namespace: flux-system
spec:
  suspend: false
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability
  interval: 10m
  timeout: 5m
  wait: true
  prune: true
|
||||
15
infrastructure/addons/kustomization-tailscale-operator.yaml
Normal file
15
infrastructure/addons/kustomization-tailscale-operator.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
---
# Flux Kustomization for the Tailscale operator addon.
# Points at ./infrastructure/addons/tailscale-operator; currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-tailscale-operator
  namespace: flux-system
spec:
  suspend: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/tailscale-operator
  interval: 10m
  timeout: 5m
  wait: true
  prune: true
|
||||
8
infrastructure/addons/kustomization.yaml
Normal file
8
infrastructure/addons/kustomization.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
---
# Kustomize bundle of the per-addon Flux Kustomization objects.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - kustomization-ccm.yaml
  - kustomization-csi.yaml
  - kustomization-tailscale-operator.yaml
  - kustomization-observability.yaml
  - kustomization-observability-content.yaml
|
||||
@@ -0,0 +1,60 @@
|
||||
---
# Grafana dashboard "K8s Cluster Overview", shipped as a ConfigMap.
# The grafana_dashboard label is what the Grafana sidecar is configured to
# look for in the observability namespace.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-k8s-overview
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  # The panels reference the datasource as ${DS_PROMETHEUS}. For a dashboard
  # loaded from a file (not imported through the UI) that placeholder is only
  # resolved if a dashboard variable with that name exists, so a `datasource`
  # variable is declared under "templating".
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
          "targets": [
            {
              "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {
        "list": [
          {
            "name": "DS_PROMETHEUS",
            "label": "Data source",
            "type": "datasource",
            "query": "prometheus",
            "current": {},
            "hide": 0,
            "includeAll": false,
            "multi": false,
            "options": [],
            "refresh": 1,
            "regex": "",
            "skipUrlSync": false
          }
        ]
      },
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }
|
||||
@@ -0,0 +1,16 @@
|
||||
---
# Grafana datasource provisioning file, shipped as a ConfigMap.
# The grafana_datasource label is what the Grafana sidecar is configured to
# look for in the observability namespace. Declares a single, non-default
# Loki datasource reached through the Grafana backend (access: proxy).
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources-core
  namespace: observability
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: "http://loki.observability.svc.cluster.local:3100"
        isDefault: false
|
||||
@@ -0,0 +1,5 @@
|
||||
---
# Kustomize bundle of the Grafana content ConfigMaps
# (one datasource file, one dashboard).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - grafana-datasources-core-configmap.yaml
  - grafana-dashboard-k8s-overview-configmap.yaml
|
||||
@@ -0,0 +1,63 @@
|
||||
---
# Flux HelmRelease for kube-prometheus-stack (Prometheus + Grafana).
# The object lives in flux-system; the chart is installed into the
# observability namespace.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: observability
  # With targetNamespace set, Flux defaults the Helm release name to
  # "<targetNamespace>-<name>" (observability-kube-prometheus-stack), which
  # prefixes every rendered object. Pin the plain name so Service names stay
  # predictable.
  # NOTE(review): presumably also matches the release name used by the
  # Ansible role being migrated from — confirm before first reconcile.
  releaseName: kube-prometheus-stack
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 68.4.4
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    grafana:
      enabled: true
      persistence:
        enabled: true
        storageClassName: local-path
        size: 5Gi
      service:
        type: ClusterIP
      # Sidecars pick up labelled ConfigMaps from the observability namespace.
      sidecar:
        datasources:
          enabled: true
          label: grafana_datasource
          searchNamespace: observability
        dashboards:
          enabled: true
          label: grafana_dashboard
          searchNamespace: observability
    prometheus:
      service:
        type: ClusterIP
      prometheusSpec:
        retention: 7d
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 10Gi
    # Components switched off for this cluster.
    alertmanager:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeControllerManager:
      enabled: false
    kubeScheduler:
      enabled: false
|
||||
93
infrastructure/addons/observability/helmrelease-loki.yaml
Normal file
93
infrastructure/addons/observability/helmrelease-loki.yaml
Normal file
@@ -0,0 +1,93 @@
|
||||
---
# Flux HelmRelease for Loki in single-binary mode with filesystem storage.
# The object lives in flux-system; the chart is installed into the
# observability namespace.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: loki
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: observability
  # With targetNamespace set, Flux defaults the Helm release name to
  # "<targetNamespace>-<name>" (observability-loki), which would also rename
  # the Service. The Grafana datasource and the promtail client both target
  # http://loki.observability.svc.cluster.local:3100, so pin the release
  # name to keep the Service called "loki".
  releaseName: loki
  chart:
    spec:
      chart: loki
      version: 6.10.0
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    deploymentMode: SingleBinary
    loki:
      auth_enabled: false
      commonConfig:
        replication_factor: 1
      schemaConfig:
        configs:
          - from: "2024-04-01"
            store: tsdb
            object_store: filesystem
            schema: v13
            index:
              prefix: loki_index_
              period: 24h
      storage:
        type: filesystem
      limits_config:
        allow_structured_metadata: true
        volume_enabled: true
        retention_period: 168h
      # retention_period is only enforced when the compactor runs retention;
      # without this the 168h limit above is ignored and the volume fills up.
      compactor:
        retention_enabled: true
        delete_request_store: filesystem
      pattern_ingester:
        enabled: true
      ruler:
        enable_api: true
    singleBinary:
      replicas: 1
      persistence:
        size: 10Gi
        storageClass: local-path
      resources:
        requests:
          cpu: 100m
          memory: 256Mi
        limits:
          cpu: 500m
          memory: 1Gi
    # Zero out the replica counts of the other deployment modes.
    backend:
      replicas: 0
    read:
      replicas: 0
    write:
      replicas: 0
    ingester:
      replicas: 0
    querier:
      replicas: 0
    queryFrontend:
      replicas: 0
    queryScheduler:
      replicas: 0
    distributor:
      replicas: 0
    compactor:
      replicas: 0
    indexGateway:
      replicas: 0
    bloomCompactor:
      replicas: 0
    bloomGateway:
      replicas: 0
    gateway:
      enabled: false
    test:
      enabled: false
    monitoring:
      selfMonitoring:
        enabled: false
    # NOTE(review): chart 6.x reads `lokiCanary` at the top level of values;
    # confirm this matches the nesting in the committed file.
    lokiCanary:
      enabled: false
|
||||
@@ -0,0 +1,27 @@
|
||||
---
# Flux HelmRelease for promtail, shipping logs to the in-cluster Loki.
# The object lives in flux-system; the chart is installed into the
# observability namespace.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: promtail
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: observability
  # With targetNamespace set, Flux defaults the Helm release name to
  # "<targetNamespace>-<name>" (observability-promtail). Pin the plain name
  # so object names stay predictable.
  # NOTE(review): presumably also matches the release name used by the
  # Ansible role being migrated from — confirm before first reconcile.
  releaseName: promtail
  chart:
    spec:
      chart: promtail
      version: 6.16.6
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    config:
      clients:
        # Push endpoint of the Loki Service in the observability namespace.
        - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push
|
||||
@@ -0,0 +1,8 @@
|
||||
---
# Flux Helm source for the Grafana chart repository, refreshed hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: grafana
  namespace: flux-system
spec:
  url: https://grafana.github.io/helm-charts
  interval: 1h
|
||||
@@ -0,0 +1,8 @@
|
||||
---
# Flux Helm source for the prometheus-community chart repository,
# refreshed hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: prometheus-community
  namespace: flux-system
spec:
  url: https://prometheus-community.github.io/helm-charts
  interval: 1h
|
||||
9
infrastructure/addons/observability/kustomization.yaml
Normal file
9
infrastructure/addons/observability/kustomization.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
---
# Kustomize bundle for the observability addon: the namespace, the two Helm
# chart sources, and the three HelmReleases.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-prometheus-community.yaml
  - helmrepository-grafana.yaml
  - helmrelease-kube-prometheus-stack.yaml
  - helmrelease-loki.yaml
  - helmrelease-promtail.yaml
|
||||
4
infrastructure/addons/observability/namespace.yaml
Normal file
4
infrastructure/addons/observability/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
---
# Namespace that hosts the observability stack and its Grafana content.
apiVersion: v1
kind: Namespace
metadata:
  name: observability
|
||||
@@ -0,0 +1,3 @@
|
||||
---
# Kustomize entrypoint with an empty resource list; the build renders nothing.
# NOTE(review): the file path is not visible in this view — presumably the
# tailscale-operator addon directory; confirm.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
|
||||
4
infrastructure/kustomization.yaml
Normal file
4
infrastructure/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
---
# Kustomize entrypoint for the infrastructure layer: pulls in ./addons.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - addons
|
||||
Reference in New Issue
Block a user