feat: manage grafana content as code with fast dashboard workflow
This commit is contained in:
91
.gitea/workflows/dashboards.yml
Normal file
91
.gitea/workflows/dashboards.yml
Normal file
@@ -0,0 +1,91 @@
|
||||
# Fast path for Grafana content: re-applies only the datasource/dashboard
# ConfigMaps via the ansible/dashboards.yml playbook.
name: Deploy Grafana Content

on:
  push:
    branches:
      - main
    paths:
      - "ansible/dashboards.yml"
      - "ansible/roles/observability-content/**"
      - ".gitea/workflows/dashboards.yml"
  workflow_dispatch:

env:
  TF_VERSION: "1.7.0"

jobs:
  dashboards:
    name: Grafana Content
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      # Secrets are handed over through the environment instead of being
      # expanded into the script text, so their content is never parsed as
      # shell. umask 077 makes sure the private key is never readable by
      # other users, not even between the write and the chmod.
      - name: Setup SSH Keys
        env:
          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
          SSH_PUBLIC_KEY: ${{ secrets.SSH_PUBLIC_KEY }}
        run: |
          umask 077
          mkdir -p ~/.ssh
          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          printf '%s\n' "$SSH_PUBLIC_KEY" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        env:
          S3_ENDPOINT: ${{ secrets.S3_ENDPOINT }}
          S3_BUCKET: ${{ secrets.S3_BUCKET }}
          S3_ACCESS_KEY: ${{ secrets.S3_ACCESS_KEY }}
          S3_SECRET_KEY: ${{ secrets.S3_SECRET_KEY }}
        run: |
          terraform init \
            -backend-config="endpoint=${S3_ENDPOINT}" \
            -backend-config="bucket=${S3_BUCKET}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${S3_ACCESS_KEY}" \
            -backend-config="secret_key=${S3_SECRET_KEY}" \
            -backend-config="skip_requesting_account_id=true"

      # Exports RUNNER_CIDR as an HCL list literal (e.g. ["1.2.3.4/32"]) for
      # the firewall step below.
      - name: Detect runner egress IP
        run: |
          RUNNER_IP=$(curl -fsSL https://api.ipify.org)
          echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
          echo "Runner egress IP: ${RUNNER_IP}"

      # NOTE(review): this sets allowed_ssh_ips/allowed_api_ips to the runner
      # address only and nothing in this workflow restores the previous
      # values afterwards - confirm that is intended.
      - name: Open SSH/API for current runner CIDR
        working-directory: terraform
        env:
          HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
        run: |
          terraform apply \
            -target=hcloud_firewall.cluster \
            -var="hcloud_token=${HCLOUD_TOKEN}" \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -var="allowed_ssh_ips=${RUNNER_CIDR}" \
            -var="allowed_api_ips=${RUNNER_CIDR}" \
            -auto-approve

      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py

      - name: Apply dashboards and datasources
        working-directory: ansible
        run: |
          ansible-playbook dashboards.yml \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      # The pipe to grep needs a shell: the `command` module would pass
      # "|", "grep" and "grafana" to kubectl as extra arguments and fail.
      # grep exits non-zero when nothing matches, which fails the step.
      # NOTE(review): unlike the playbook, this ad-hoc call does not use
      # privilege escalation - confirm kubectl works for the SSH user.
      - name: Verify Grafana content resources
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl -n observability get configmap | grep grafana"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
|
||||
16
README.md
16
README.md
@@ -152,6 +152,7 @@ This repository includes Gitea workflows for:
|
||||
- **terraform-plan**: Runs on PRs, shows planned changes
|
||||
- **terraform-apply**: Runs on main branch after merge
|
||||
- **ansible-deploy**: Runs after terraform apply
|
||||
- **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
|
||||
|
||||
### Required Gitea Secrets
|
||||
|
||||
@@ -181,6 +182,8 @@ The Ansible playbook deploys a lightweight observability stack in the `observabi
|
||||
- `loki`
|
||||
- `promtail`
|
||||
|
||||
Grafana content is managed as code via ConfigMaps in `ansible/roles/observability-content/`.
|
||||
|
||||
Services are kept internal by default, with optional declarative Tailscale exposure when the Tailscale Kubernetes Operator is healthy.
|
||||
|
||||
### Access Grafana and Prometheus
|
||||
@@ -228,6 +231,15 @@ kubectl -n tailscale-system logs deployment/operator --tail=100
|
||||
|
||||
Common cause: OAuth client missing tag/scopes permissions.
|
||||
|
||||
### Fast dashboard iteration workflow
|
||||
|
||||
Use the `Deploy Grafana Content` workflow when changing dashboard/data source templates.
|
||||
It avoids full cluster provisioning and only applies Grafana content resources:
|
||||
|
||||
- `ansible/roles/observability-content/templates/grafana-datasources.yaml.j2`
|
||||
- `ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2`
|
||||
- `ansible/dashboards.yml`
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
@@ -252,12 +264,14 @@ Common cause: OAuth client missing tag/scopes permissions.
|
||||
│ │ ├── ccm/
|
||||
│ │ ├── csi/
|
||||
│ │ ├── tailscale-operator/
|
||||
│ │ ├── observability-content/
|
||||
│ │ └── observability/
|
||||
│ └── ansible.cfg
|
||||
├── .gitea/
|
||||
│ └── workflows/
|
||||
│ ├── terraform.yml
|
||||
│ └── ansible.yml
|
||||
│ ├── ansible.yml
|
||||
│ └── dashboards.yml
|
||||
├── outputs/
|
||||
├── terraform.tfvars.example
|
||||
└── README.md
|
||||
|
||||
7
ansible/dashboards.yml
Normal file
7
ansible/dashboards.yml
Normal file
@@ -0,0 +1,7 @@
|
||||
---
# Standalone entry point that applies only the Grafana content role on the
# first host of the control_plane group.
- name: Provision Grafana dashboards and datasources
  hosts: "control_plane[0]"
  become: true
  roles:
    - role: observability-content
|
||||
5
ansible/roles/observability-content/defaults/main.yml
Normal file
5
ansible/roles/observability-content/defaults/main.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
---
# Defaults for the observability-content role.

# Namespace the Grafana content ConfigMaps are created in.
observability_namespace: 'observability'

# metadata.name of the dashboard and datasource ConfigMaps.
grafana_dashboard_configmap_name: 'grafana-dashboard-k8s-overview'
grafana_datasource_configmap_name: 'grafana-datasources-core'

# When true, the datasources template also emits a Loki datasource.
loki_enabled: true
|
||||
37
ansible/roles/observability-content/tasks/main.yml
Normal file
37
ansible/roles/observability-content/tasks/main.yml
Normal file
@@ -0,0 +1,37 @@
|
||||
---
# Renders the Grafana datasource and dashboard ConfigMap manifests to /tmp on
# the target host and applies them with kubectl.

# `kubectl create namespace` fails when the namespace already exists; that
# specific failure is tolerated so the task can be re-run safely.
- name: Ensure observability namespace exists
  command: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  failed_when: >-
    create_observability_ns.rc != 0
    and "AlreadyExists" not in create_observability_ns.stderr
  changed_when: create_observability_ns.rc == 0

# Blocks until the Grafana deployment has rolled out (or the 5m timeout hits).
- name: Wait for Grafana deployment rollout
  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false

- name: Write Grafana datasources ConfigMap
  template:
    src: grafana-datasources.yaml.j2
    dest: /tmp/grafana-datasources.yaml
    mode: "0644"

# kubectl prints "<resource> unchanged" when the live object already matches,
# so report a change only when that marker is absent.
- name: Apply Grafana datasources ConfigMap
  command: kubectl apply -f /tmp/grafana-datasources.yaml
  register: grafana_datasources_apply
  changed_when: "'unchanged' not in grafana_datasources_apply.stdout"

- name: Write Grafana dashboard ConfigMap
  template:
    src: grafana-dashboard-k8s-overview.yaml.j2
    dest: /tmp/grafana-dashboard-k8s-overview.yaml
    mode: "0644"

- name: Apply Grafana dashboard ConfigMap
  command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
  register: grafana_dashboard_apply
  changed_when: "'unchanged' not in grafana_dashboard_apply.stdout"

- name: Show Grafana content provisioning summary
  debug:
    msg: |
      Grafana content applied.
      Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
      Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
|
||||
@@ -0,0 +1,60 @@
|
||||
# ConfigMap carrying the "K8s Cluster Overview" dashboard JSON; it is labelled
# grafana_dashboard="1".
#
# Panels select their datasource through the dashboard-level `datasource`
# variable. The `${DS_PROMETHEUS}` placeholder used previously is an
# import-time input that is not substituted for provisioned dashboards, which
# leaves panels pointing at a datasource that does not exist.
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ grafana_dashboard_configmap_name }}"
  namespace: "{{ observability_namespace }}"
  labels:
    grafana_dashboard: "1"
data:
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${datasource}"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
          "targets": [
            {
              "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "${datasource}"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {
        "list": [
          {
            "current": {},
            "hide": 0,
            "includeAll": false,
            "label": "Data source",
            "multi": false,
            "name": "datasource",
            "options": [],
            "query": "prometheus",
            "refresh": 1,
            "regex": "",
            "type": "datasource"
          }
        ]
      },
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }
|
||||
@@ -0,0 +1,23 @@
|
||||
# ConfigMap carrying Grafana datasource provisioning; it is labelled
# grafana_datasource="1". Prometheus is always emitted and marked default;
# Loki is emitted only when loki_enabled is truthy.
#
# The Jinja control tags sit at column 0 so they contribute no leading
# whitespace to the rendered block scalar.
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ grafana_datasource_configmap_name }}"
  namespace: "{{ observability_namespace }}"
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090
        isDefault: true
{% if loki_enabled %}
      - name: Loki
        type: loki
        access: proxy
        url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
        isDefault: false
{% endif %}
|
||||
@@ -52,6 +52,17 @@
|
||||
--timeout 10m
|
||||
changed_when: true
|
||||
|
||||
# Read-only gate: blocks until the Grafana deployment reports a finished
# rollout or the 5 minute timeout expires; never reports a change.
- name: Wait for Grafana deployment rollout
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} rollout status
      deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false
|
||||
|
||||
# Resets the admin password by running the Grafana CLI inside the first pod
# matching app.kubernetes.io/name=grafana.
# The password goes through the `quote` filter so quotes or other shell
# metacharacters in it cannot break or alter the command, and no_log keeps
# the rendered command line (which contains the password) out of the output.
- name: Reset Grafana admin password in Grafana database
  shell: >-
    kubectl -n {{ observability_namespace }} exec
    "$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
    -c grafana -- grafana cli admin reset-admin-password {{ grafana_password_effective | quote }}
  no_log: true
  changed_when: true
|
||||
|
||||
- name: Write Loki values
|
||||
template:
|
||||
src: loki-values.yaml.j2
|
||||
@@ -144,18 +155,6 @@
|
||||
changed_when: true
|
||||
when: loki_enabled
|
||||
|
||||
- name: Write Grafana Loki datasource manifest
|
||||
template:
|
||||
src: grafana-datasource-loki.yaml.j2
|
||||
dest: /tmp/grafana-datasource-loki.yaml
|
||||
mode: "0644"
|
||||
when: loki_enabled
|
||||
|
||||
- name: Create Grafana Loki datasource
|
||||
command: kubectl apply -f /tmp/grafana-datasource-loki.yaml
|
||||
changed_when: true
|
||||
when: loki_enabled
|
||||
|
||||
- name: Check Tailscale service readiness for Grafana
|
||||
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
|
||||
register: grafana_tailscale_ready
|
||||
|
||||
@@ -103,6 +103,13 @@
|
||||
roles:
|
||||
- observability
|
||||
|
||||
# Applies the Grafana content role on the first control_plane host as part
# of this playbook.
- name: Provision Grafana content
  hosts: "control_plane[0]"
  become: true
  roles:
    - role: observability-content
|
||||
|
||||
- name: Finalize
|
||||
hosts: localhost
|
||||
connection: local
|
||||
|
||||
Reference in New Issue
Block a user