diff --git a/.gitea/workflows/dashboards.yml b/.gitea/workflows/dashboards.yml
new file mode 100644
index 0000000..0d05e2c
--- /dev/null
+++ b/.gitea/workflows/dashboards.yml
@@ -0,0 +1,95 @@
+# Deploys Grafana dashboards/datasources only: opens the cluster firewall to
+# this runner's egress IP, then runs ansible/dashboards.yml.
+name: Deploy Grafana Content
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "ansible/dashboards.yml"
+      - "ansible/roles/observability-content/**"
+      - ".gitea/workflows/dashboards.yml"
+  workflow_dispatch:
+
+env:
+  TF_VERSION: "1.7.0"
+
+jobs:
+  dashboards:
+    name: Grafana Content
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: ${{ env.TF_VERSION }}
+
+      - name: Setup SSH Keys
+        run: |
+          mkdir -p ~/.ssh
+          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
+          chmod 644 ~/.ssh/id_ed25519.pub
+
+      - name: Terraform Init
+        working-directory: terraform
+        run: |
+          terraform init \
+            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
+            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
+            -backend-config="region=auto" \
+            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
+            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
+            -backend-config="skip_requesting_account_id=true"
+
+      - name: Detect runner egress IP
+        run: |
+          RUNNER_IP=$(curl -fsSL https://api.ipify.org)
+          echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
+          echo "Runner egress IP: ${RUNNER_IP}"
+
+      - name: Open SSH/API for current runner CIDR
+        working-directory: terraform
+        run: |
+          terraform apply \
+            -target=hcloud_firewall.cluster \
+            -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
+            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
+            -var="allowed_ssh_ips=${RUNNER_CIDR}" \
+            -var="allowed_api_ips=${RUNNER_CIDR}" \
+            -auto-approve
+
+      - name: Install Python Dependencies
+        run: |
+          apt-get update && apt-get install -y python3-pip
+          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
+
+      - name: Install Ansible Collections
+        run: ansible-galaxy collection install -r ansible/requirements.yml
+
+      - name: Generate Ansible Inventory
+        working-directory: ansible
+        run: python3 generate_inventory.py
+
+      - name: Apply dashboards and datasources
+        working-directory: ansible
+        run: |
+          ansible-playbook dashboards.yml \
+            -e "cluster_name=k8s-cluster"
+        env:
+          ANSIBLE_HOST_KEY_CHECKING: "False"
+
+      # The shell module is required here: the command module does not run
+      # through a shell, so the pipe to grep would be passed to kubectl as args.
+      - name: Verify Grafana content resources
+        working-directory: ansible
+        run: |
+          ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl -n observability get configmap | grep grafana"
+        env:
+          ANSIBLE_HOST_KEY_CHECKING: "False"
diff --git a/README.md b/README.md
index 56a8f96..6401181 100644
--- a/README.md
+++ b/README.md
@@ -152,6 +152,7 @@ This repository includes Gitea workflows for:
 - **terraform-plan**: Runs on PRs, shows planned changes
 - **terraform-apply**: Runs on main branch after merge
 - **ansible-deploy**: Runs after terraform apply
+- **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
 
 ### Required Gitea Secrets
 
@@ -181,6 +182,8 @@ The Ansible playbook deploys a lightweight observability stack in the `observabi
 - `loki`
 - `promtail`
 
+Grafana content is managed as code via ConfigMaps in `ansible/roles/observability-content/`.
+
 Services are kept internal by default, with optional declarative Tailscale exposure when the Tailscale Kubernetes Operator is healthy.
 
 ### Access Grafana and Prometheus
@@ -228,6 +231,15 @@ kubectl -n tailscale-system logs deployment/operator --tail=100
 
 Common cause: OAuth client missing tag/scopes permissions.
 
+### Fast dashboard iteration workflow
+
+Use the `Deploy Grafana Content` workflow when changing dashboard/data source templates.
+It avoids full cluster provisioning and only applies Grafana content resources:
+
+- `ansible/roles/observability-content/templates/grafana-datasources.yaml.j2`
+- `ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2`
+- `ansible/dashboards.yml`
+
 ## File Structure
 
 ```
@@ -252,12 +264,14 @@ Common cause: OAuth client missing tag/scopes permissions.
 │   │   ├── ccm/
 │   │   ├── csi/
 │   │   ├── tailscale-operator/
+│   │   ├── observability-content/
 │   │   └── observability/
 │   └── ansible.cfg
 ├── .gitea/
 │   └── workflows/
 │       ├── terraform.yml
-│       └── ansible.yml
+│       ├── ansible.yml
+│       └── dashboards.yml
 ├── outputs/
 ├── terraform.tfvars.example
 └── README.md
diff --git a/ansible/dashboards.yml b/ansible/dashboards.yml
new file mode 100644
index 0000000..1d2af03
--- /dev/null
+++ b/ansible/dashboards.yml
@@ -0,0 +1,7 @@
+---
+- name: Provision Grafana dashboards and datasources
+  hosts: control_plane[0]
+  become: true
+
+  roles:
+    - observability-content
diff --git a/ansible/roles/observability-content/defaults/main.yml b/ansible/roles/observability-content/defaults/main.yml
new file mode 100644
index 0000000..1653d6a
--- /dev/null
+++ b/ansible/roles/observability-content/defaults/main.yml
@@ -0,0 +1,5 @@
+---
+observability_namespace: "observability"
+grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
+grafana_datasource_configmap_name: "grafana-datasources-core"
+loki_enabled: true
diff --git a/ansible/roles/observability-content/tasks/main.yml b/ansible/roles/observability-content/tasks/main.yml
new file mode 100644
index 0000000..e84751e
--- /dev/null
+++ b/ansible/roles/observability-content/tasks/main.yml
@@ -0,0 +1,37 @@
+---
+- name: Ensure observability namespace exists
+  command: kubectl create namespace {{ observability_namespace }}
+  register: create_observability_ns
+  failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
+  changed_when: create_observability_ns.rc == 0
+
+- name: Wait for Grafana deployment rollout
+  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
+  changed_when: false
+
+- name: Write Grafana datasources ConfigMap
+  template:
+    src: grafana-datasources.yaml.j2
+    dest: /tmp/grafana-datasources.yaml
+    mode: "0644"
+
+- name: Apply Grafana datasources ConfigMap
+  command: kubectl apply -f /tmp/grafana-datasources.yaml
+  changed_when: true
+
+- name: Write Grafana dashboard ConfigMap
+  template:
+    src: grafana-dashboard-k8s-overview.yaml.j2
+    dest: /tmp/grafana-dashboard-k8s-overview.yaml
+    mode: "0644"
+
+- name: Apply Grafana dashboard ConfigMap
+  command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
+  changed_when: true
+
+- name: Show Grafana content provisioning summary
+  debug:
+    msg: |
+      Grafana content applied.
+      Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
+      Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
diff --git a/ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2 b/ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2
new file mode 100644
index 0000000..9f8b50a
--- /dev/null
+++ b/ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2
@@ -0,0 +1,72 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ grafana_dashboard_configmap_name }}
+  namespace: {{ observability_namespace }}
+  labels:
+    grafana_dashboard: "1"
+# Panels reference ${DS_PROMETHEUS}. The datasource-type variable declared in
+# "templating" resolves it; file-provisioned dashboards get no import inputs.
+data:
+  k8s-overview.json: |
+    {
+      "annotations": {"list": []},
+      "editable": true,
+      "fiscalYearStartMonth": 0,
+      "graphTooltip": 0,
+      "id": null,
+      "links": [],
+      "panels": [
+        {
+          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
+          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
+          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+          "id": 1,
+          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
+          "targets": [
+            {
+              "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
+              "legendFormat": "ready",
+              "refId": "A"
+            }
+          ],
+          "title": "Ready Nodes",
+          "type": "stat"
+        },
+        {
+          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
+          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
+          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+          "id": 2,
+          "targets": [
+            {
+              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
+              "legendFormat": "cpu",
+              "refId": "A"
+            }
+          ],
+          "title": "Cluster CPU Usage",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": ["kubernetes", "infrastructure"],
+      "templating": {
+        "list": [
+          {
+            "hide": 0,
+            "label": "Prometheus",
+            "name": "DS_PROMETHEUS",
+            "query": "prometheus",
+            "type": "datasource"
+          }
+        ]
+      },
+      "time": {"from": "now-1h", "to": "now"},
+      "timezone": "browser",
+      "title": "K8s Cluster Overview",
+      "uid": "k8s-cluster-overview",
+      "version": 1
+    }
diff --git a/ansible/roles/observability-content/templates/grafana-datasources.yaml.j2 b/ansible/roles/observability-content/templates/grafana-datasources.yaml.j2
new file mode 100644
index 0000000..37f8bd8
--- /dev/null
+++ b/ansible/roles/observability-content/templates/grafana-datasources.yaml.j2
@@ -0,0 +1,23 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ grafana_datasource_configmap_name }}
+  namespace: {{ observability_namespace }}
+  labels:
+    grafana_datasource: "1"
+data:
+  datasources.yaml: |
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        access: proxy
+        url: http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090
+        isDefault: true
+{% if loki_enabled %}
+      - name: Loki
+        type: loki
+        access: proxy
+        url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
+        isDefault: false
+{% endif %}
diff --git a/ansible/roles/observability/tasks/main.yml b/ansible/roles/observability/tasks/main.yml
index 5388790..260a628 100644
--- a/ansible/roles/observability/tasks/main.yml
+++ b/ansible/roles/observability/tasks/main.yml
@@ -52,6 +52,20 @@
     --timeout 10m
   changed_when: true
 
+- name: Wait for Grafana deployment rollout
+  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
+  changed_when: false
+
+# no_log keeps the admin password out of task output; the quote filter
+# shell-escapes it so special characters cannot break the command.
+- name: Reset Grafana admin password in Grafana database
+  shell: >-
+    kubectl -n {{ observability_namespace }} exec
+    "$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
+    -c grafana -- grafana cli admin reset-admin-password {{ grafana_password_effective | quote }}
+  no_log: true
+  changed_when: true
+
 - name: Write Loki values
   template:
     src: loki-values.yaml.j2
@@ -144,18 +158,6 @@
   changed_when: true
   when: loki_enabled
 
-- name: Write Grafana Loki datasource manifest
-  template:
-    src: grafana-datasource-loki.yaml.j2
-    dest: /tmp/grafana-datasource-loki.yaml
-    mode: "0644"
-  when: loki_enabled
-
-- name: Create Grafana Loki datasource
-  command: kubectl apply -f /tmp/grafana-datasource-loki.yaml
-  changed_when: true
-  when: loki_enabled
-
 - name: Check Tailscale service readiness for Grafana
   command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
   register: grafana_tailscale_ready
diff --git a/ansible/site.yml b/ansible/site.yml
index 7e77180..7a16a6d 100644
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -103,6 +103,13 @@
   roles:
     - observability
 
+- name: Provision Grafana content
+  hosts: control_plane[0]
+  become: true
+
+  roles:
+    - observability-content
+
 - name: Finalize
   hosts: localhost
   connection: local