feat: manage grafana content as code with fast dashboard workflow
This commit is contained in:
91
.gitea/workflows/dashboards.yml
Normal file
91
.gitea/workflows/dashboards.yml
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
# Fast path for Grafana content: runs the ansible/dashboards.yml playbook
# instead of the full cluster pipeline. Triggered by pushes to main that touch
# the playbook, the observability-content role, or this workflow file, and
# also available for manual dispatch.
name: Deploy Grafana Content

on:
  push:
    branches:
      - main
    paths:
      - "ansible/dashboards.yml"
      - "ansible/roles/observability-content/**"
      - ".gitea/workflows/dashboards.yml"
  workflow_dispatch:

env:
  TF_VERSION: "1.7.0"

jobs:
  dashboards:
    name: Grafana Content
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      # Secrets are passed through the environment rather than spliced into
      # the script text, so quotes, `$` or backticks inside a key cannot
      # change what the shell executes.
      - name: Setup SSH Keys
        env:
          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
          SSH_PUBLIC_KEY: ${{ secrets.SSH_PUBLIC_KEY }}
        run: |
          mkdir -p ~/.ssh
          # Create the key files owner-only from the start; the public key is
          # relaxed to 0644 afterwards.
          umask 077
          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          printf '%s\n' "$SSH_PUBLIC_KEY" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        env:
          S3_ENDPOINT: ${{ secrets.S3_ENDPOINT }}
          S3_BUCKET: ${{ secrets.S3_BUCKET }}
          S3_ACCESS_KEY: ${{ secrets.S3_ACCESS_KEY }}
          S3_SECRET_KEY: ${{ secrets.S3_SECRET_KEY }}
        run: |
          terraform init \
            -backend-config="endpoint=${S3_ENDPOINT}" \
            -backend-config="bucket=${S3_BUCKET}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${S3_ACCESS_KEY}" \
            -backend-config="secret_key=${S3_SECRET_KEY}" \
            -backend-config="skip_requesting_account_id=true"

      # Exports RUNNER_CIDR as an HCL list literal, e.g. ["203.0.113.7/32"],
      # for the firewall step below.
      - name: Detect runner egress IP
        run: |
          RUNNER_IP=$(curl -fsSL https://api.ipify.org)
          echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
          echo "Runner egress IP: ${RUNNER_IP}"

      # Targeted apply of the firewall only, so the runner can reach the
      # nodes over SSH and the Kubernetes API.
      # NOTE(review): this sets allowed_ssh_ips/allowed_api_ips to the runner
      # address alone and nothing in this workflow restores the previous
      # values afterwards - confirm that is intended.
      - name: Open SSH/API for current runner CIDR
        working-directory: terraform
        env:
          # Terraform reads TF_VAR_<name> as the value of variable <name>,
          # which keeps the token off the command line.
          TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
        run: |
          terraform apply \
            -target=hcloud_firewall.cluster \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -var="allowed_ssh_ips=${RUNNER_CIDR}" \
            -var="allowed_api_ips=${RUNNER_CIDR}" \
            -auto-approve

      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py

      - name: Apply dashboards and datasources
        working-directory: ansible
        run: |
          ansible-playbook dashboards.yml \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      # The pipeline into grep needs a shell on the remote side: the
      # `command` module executes its argument directly, so `|` and `grep`
      # would be handed to kubectl as extra arguments.
      - name: Verify Grafana content resources
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl -n observability get configmap | grep grafana"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
|
||||||
16
README.md
16
README.md
@@ -152,6 +152,7 @@ This repository includes Gitea workflows for:
|
|||||||
- **terraform-plan**: Runs on PRs, shows planned changes
|
- **terraform-plan**: Runs on PRs, shows planned changes
|
||||||
- **terraform-apply**: Runs on main branch after merge
|
- **terraform-apply**: Runs on main branch after merge
|
||||||
- **ansible-deploy**: Runs after terraform apply
|
- **ansible-deploy**: Runs after terraform apply
|
||||||
|
- **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
|
||||||
|
|
||||||
### Required Gitea Secrets
|
### Required Gitea Secrets
|
||||||
|
|
||||||
@@ -181,6 +182,8 @@ The Ansible playbook deploys a lightweight observability stack in the `observabi
|
|||||||
- `loki`
|
- `loki`
|
||||||
- `promtail`
|
- `promtail`
|
||||||
|
|
||||||
|
Grafana content is managed as code via ConfigMaps in `ansible/roles/observability-content/`.
|
||||||
|
|
||||||
Services are kept internal by default, with optional declarative Tailscale exposure when the Tailscale Kubernetes Operator is healthy.
|
Services are kept internal by default, with optional declarative Tailscale exposure when the Tailscale Kubernetes Operator is healthy.
|
||||||
|
|
||||||
### Access Grafana and Prometheus
|
### Access Grafana and Prometheus
|
||||||
@@ -228,6 +231,15 @@ kubectl -n tailscale-system logs deployment/operator --tail=100
|
|||||||
|
|
||||||
Common cause: OAuth client missing tag/scopes permissions.
|
Common cause: OAuth client missing tag/scopes permissions.
|
||||||
|
|
||||||
|
### Fast dashboard iteration workflow
|
||||||
|
|
||||||
|
Use the `Deploy Grafana Content` workflow when changing dashboard/data source templates.
|
||||||
|
It skips full cluster provisioning and applies only the Grafana content defined by these files:
|
||||||
|
|
||||||
|
- `ansible/roles/observability-content/templates/grafana-datasources.yaml.j2`
|
||||||
|
- `ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2`
|
||||||
|
- `ansible/dashboards.yml`
|
||||||
|
|
||||||
## File Structure
|
## File Structure
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -252,12 +264,14 @@ Common cause: OAuth client missing tag/scopes permissions.
|
|||||||
│ │ ├── ccm/
|
│ │ ├── ccm/
|
||||||
│ │ ├── csi/
|
│ │ ├── csi/
|
||||||
│ │ ├── tailscale-operator/
|
│ │ ├── tailscale-operator/
|
||||||
|
│ │ ├── observability-content/
|
||||||
│ │ └── observability/
|
│ │ └── observability/
|
||||||
│ └── ansible.cfg
|
│ └── ansible.cfg
|
||||||
├── .gitea/
|
├── .gitea/
|
||||||
│ └── workflows/
|
│ └── workflows/
|
||||||
│ ├── terraform.yml
|
│ ├── terraform.yml
|
||||||
│ └── ansible.yml
|
│ ├── ansible.yml
|
||||||
|
│ └── dashboards.yml
|
||||||
├── outputs/
|
├── outputs/
|
||||||
├── terraform.tfvars.example
|
├── terraform.tfvars.example
|
||||||
└── README.md
|
└── README.md
|
||||||
|
|||||||
7
ansible/dashboards.yml
Normal file
7
ansible/dashboards.yml
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
---
# Single-play playbook: applies the observability-content role on the first
# host of the control_plane group, with privilege escalation.
- name: Provision Grafana dashboards and datasources
  hosts: "control_plane[0]"
  become: true
  roles:
    - role: observability-content
|
||||||
5
ansible/roles/observability-content/defaults/main.yml
Normal file
5
ansible/roles/observability-content/defaults/main.yml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
---
# Defaults for the observability-content role.

# Namespace the role creates if missing and applies its ConfigMaps into.
observability_namespace: 'observability'

# metadata.name of the dashboard ConfigMap template.
grafana_dashboard_configmap_name: 'grafana-dashboard-k8s-overview'

# metadata.name of the datasources ConfigMap template.
grafana_datasource_configmap_name: 'grafana-datasources-core'

# When true, the datasources template also emits the Loki datasource entry.
loki_enabled: true
|
||||||
37
ansible/roles/observability-content/tasks/main.yml
Normal file
37
ansible/roles/observability-content/tasks/main.yml
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
---
# Publishes Grafana content (datasources and one dashboard) as ConfigMaps in
# the observability namespace. Each manifest is rendered from a role template
# to /tmp on the target host and then applied with kubectl.

- name: Ensure observability namespace exists
  command: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  # A namespace that already exists is fine; any other non-zero exit fails.
  # List items under failed_when are combined with AND.
  failed_when:
    - create_observability_ns.rc != 0
    - "'AlreadyExists' not in create_observability_ns.stderr"
  changed_when: create_observability_ns.rc == 0

# Blocks (up to 5 minutes) until the Grafana deployment has finished rolling
# out, and fails the play if it has not.
- name: Wait for Grafana deployment rollout
  command: >-
    kubectl -n {{ observability_namespace }}
    rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false

- name: Write Grafana datasources ConfigMap
  template:
    src: grafana-datasources.yaml.j2
    dest: /tmp/grafana-datasources.yaml
    mode: "0644"

- name: Apply Grafana datasources ConfigMap
  command: kubectl apply -f /tmp/grafana-datasources.yaml
  register: grafana_datasources_apply
  # kubectl prints "<resource> unchanged" when the live object already
  # matches, so only report a change for "created"/"configured".
  changed_when: "'unchanged' not in grafana_datasources_apply.stdout"

- name: Write Grafana dashboard ConfigMap
  template:
    src: grafana-dashboard-k8s-overview.yaml.j2
    dest: /tmp/grafana-dashboard-k8s-overview.yaml
    mode: "0644"

- name: Apply Grafana dashboard ConfigMap
  command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
  register: grafana_dashboard_apply
  changed_when: "'unchanged' not in grafana_dashboard_apply.stdout"

- name: Show Grafana content provisioning summary
  debug:
    msg: |
      Grafana content applied.
      Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
      Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
# ConfigMap carrying the "K8s Cluster Overview" Grafana dashboard as JSON.
# It is labelled grafana_dashboard="1"; presumably a Grafana dashboard sidecar
# selects on that label - confirm against the Grafana chart values.
#
# The dashboard has two panels:
#   1. Ready Nodes       - stat, number of nodes whose Ready condition is true
#   2. Cluster CPU Usage - timeseries, 1 minus the average idle CPU fraction
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ grafana_dashboard_configmap_name }}"
  namespace: "{{ observability_namespace }}"
  labels:
    grafana_dashboard: "1"
data:
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
            "textMode": "auto"
          },
          "targets": [
            {
              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {
        "list": [
          {
            "hide": 0,
            "includeAll": false,
            "label": "Datasource",
            "multi": false,
            "name": "DS_PROMETHEUS",
            "options": [],
            "query": "prometheus",
            "refresh": 1,
            "regex": "",
            "type": "datasource"
          }
        ]
      },
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
# ConfigMap carrying a Grafana datasource provisioning file. It always
# declares Prometheus and adds Loki when loki_enabled is true. Both URLs point
# at in-cluster services in the observability namespace.
#
# The Jinja control tags below sit at column 0 on purpose: whitespace in front
# of a block tag is kept in the rendered output unless lstrip_blocks is on, and
# would shift the list item that follows it.
#
# NOTE(review): Prometheus is marked isDefault here. If the Grafana chart also
# provisions its own default datasource, Grafana may reject two defaults in the
# same organisation - confirm against the chart values.
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ grafana_datasource_configmap_name }}"
  namespace: "{{ observability_namespace }}"
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090
        isDefault: true
{% if loki_enabled %}
      - name: Loki
        type: loki
        access: proxy
        url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
        isDefault: false
{% endif %}
|
||||||
@@ -52,6 +52,17 @@
|
|||||||
--timeout 10m
|
--timeout 10m
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
||||||
|
# Blocks (up to 5 minutes) until the Grafana deployment has finished rolling
# out. Read-only, so it never reports a change.
- name: Wait for Grafana deployment rollout
  command: >-
    kubectl -n {{ observability_namespace }}
    rollout status deployment/kube-prometheus-stack-grafana
    --timeout=5m
  changed_when: false
|
||||||
|
|
||||||
|
# Runs `grafana cli admin reset-admin-password` inside the grafana container
# so the stored admin password matches grafana_password_effective.
#
# The command is passed as an argv list instead of a shell string, so the
# password is never parsed by a shell and may contain quotes, spaces or `$`.
# kubectl resolves deployment/<name> to one of the deployment's pods, which
# removes the need for a nested pod lookup.
- name: Reset Grafana admin password in Grafana database
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - exec
      - deployment/kube-prometheus-stack-grafana
      - "-c"
      - grafana
      - "--"
      - grafana
      - cli
      - admin
      - reset-admin-password
      - "{{ grafana_password_effective }}"
  # Keep the password out of Ansible's task output and logs.
  no_log: true
  changed_when: true
|
||||||
|
|
||||||
- name: Write Loki values
|
- name: Write Loki values
|
||||||
template:
|
template:
|
||||||
src: loki-values.yaml.j2
|
src: loki-values.yaml.j2
|
||||||
@@ -144,18 +155,6 @@
|
|||||||
changed_when: true
|
changed_when: true
|
||||||
when: loki_enabled
|
when: loki_enabled
|
||||||
|
|
||||||
- name: Write Grafana Loki datasource manifest
|
|
||||||
template:
|
|
||||||
src: grafana-datasource-loki.yaml.j2
|
|
||||||
dest: /tmp/grafana-datasource-loki.yaml
|
|
||||||
mode: "0644"
|
|
||||||
when: loki_enabled
|
|
||||||
|
|
||||||
- name: Create Grafana Loki datasource
|
|
||||||
command: kubectl apply -f /tmp/grafana-datasource-loki.yaml
|
|
||||||
changed_when: true
|
|
||||||
when: loki_enabled
|
|
||||||
|
|
||||||
- name: Check Tailscale service readiness for Grafana
|
- name: Check Tailscale service readiness for Grafana
|
||||||
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
|
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
|
||||||
register: grafana_tailscale_ready
|
register: grafana_tailscale_ready
|
||||||
|
|||||||
@@ -103,6 +103,13 @@
|
|||||||
roles:
|
roles:
|
||||||
- observability
|
- observability
|
||||||
|
|
||||||
|
# Applies the observability-content role on the first control-plane host,
# with privilege escalation.
- name: Provision Grafana content
  hosts: "control_plane[0]"
  become: true
  roles:
    - role: observability-content
|
||||||
|
|
||||||
- name: Finalize
|
- name: Finalize
|
||||||
hosts: localhost
|
hosts: localhost
|
||||||
connection: local
|
connection: local
|
||||||
|
|||||||
Reference in New Issue
Block a user