diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
index 3335f25..d1130cf 100644
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -226,6 +226,7 @@ jobs:
             -e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
             -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
             -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
+            -e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
             -e "cluster_name=k8s-cluster"
         env:
           ANSIBLE_HOST_KEY_CHECKING: "False"
@@ -236,6 +237,8 @@ jobs:
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pods -o wide"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pvc"
         env:
           ANSIBLE_HOST_KEY_CHECKING: "False"
 
diff --git a/README.md b/README.md
index 33b41c8..345e0a6 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
 | **Workers** | 4x CX33 |
 | **Total Cost** | €28.93/mo |
 | **K8s** | k3s (latest, HA) |
-| **Addons** | Hetzner CCM + CSI |
+| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
 | **Access** | SSH/API restricted to Tailnet |
 | **Bootstrap** | Terraform + Ansible |
 
@@ -166,10 +166,40 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
 | `S3_BUCKET` | S3 bucket name (e.g., `k8s-terraform-state`) |
 | `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
 | `TAILSCALE_TAILNET` | Tailnet domain (e.g., `yourtailnet.ts.net`) |
+| `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) |
 | `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
 | `SSH_PUBLIC_KEY` | SSH public key content |
 | `SSH_PRIVATE_KEY` | SSH private key content |
 
+## Observability Stack
+
+The Ansible playbook deploys a lightweight observability stack in the `observability` namespace:
+
+- `kube-prometheus-stack` (Prometheus + Grafana)
+- `loki`
+- `promtail`
+
+Services are kept internal for tailnet-first access.
+
+### Access Grafana and Prometheus
+
+Run from a tailnet-connected machine:
+
+```bash
+export KUBECONFIG=$(pwd)/outputs/kubeconfig
+
+kubectl -n observability port-forward svc/kube-prometheus-stack-grafana 3000:80
+kubectl -n observability port-forward svc/kube-prometheus-stack-prometheus 9090:9090
+```
+
+Then open:
+
+- Grafana: http://127.0.0.1:3000
+- Prometheus: http://127.0.0.1:9090
+
+Grafana user: `admin`
+Grafana password: value of `GRAFANA_ADMIN_PASSWORD` secret (or the generated value shown by Ansible output)
+
 ## File Structure
 
 ```
@@ -192,7 +222,8 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
 │   │   ├── k3s-server/
 │   │   ├── k3s-agent/
 │   │   ├── ccm/
-│   │   └── csi/
+│   │   ├── csi/
+│   │   └── observability/
 │   └── ansible.cfg
 ├── .gitea/
 │   └── workflows/
diff --git a/ansible/roles/observability/defaults/main.yml b/ansible/roles/observability/defaults/main.yml
new file mode 100644
--- /dev/null
+++ b/ansible/roles/observability/defaults/main.yml
@@ -0,0 +1,21 @@
+---
+# Defaults for the observability role (kube-prometheus-stack, Loki, Promtail).
+observability_namespace: "observability"
+
+# Helm chart versions passed to `helm upgrade --install --version`.
+prometheus_chart_version: "68.4.4"
+loki_chart_version: "6.24.0"
+promtail_chart_version: "6.16.6"
+
+# Grafana admin password; when empty the role reuses or generates one.
+grafana_admin_password: ""
+
+# Persistent volume sizes requested for each component.
+prometheus_storage_size: "10Gi"
+grafana_storage_size: "5Gi"
+loki_storage_size: "10Gi"
+
+# Storage classes backing those volumes.
+prometheus_storage_class: "local-path"
+grafana_storage_class: "local-path"
+loki_storage_class: "local-path"
diff --git a/ansible/roles/observability/tasks/main.yml b/ansible/roles/observability/tasks/main.yml
new file mode 100644
--- /dev/null
+++ b/ansible/roles/observability/tasks/main.yml
@@ -0,0 +1,225 @@
+---
+# Deploys kube-prometheus-stack, Loki and Promtail with Helm into the
+# observability namespace, then registers Loki as a Grafana datasource.
+
+- name: Check if Helm is installed
+  command: helm version --short
+  register: helm_check
+  changed_when: false
+  # A missing binary must not abort the play; the next task installs it.
+  failed_when: false
+
+- name: Install Helm
+  # pipefail makes a failed download fail the task instead of feeding an
+  # empty script to bash, which would exit 0.
+  shell: |
+    set -o pipefail
+    curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+  args:
+    executable: /bin/bash
+  when: helm_check.rc != 0
+  changed_when: true
+
+- name: Ensure observability namespace exists
+  command: kubectl create namespace {{ observability_namespace }}
+  register: create_observability_ns
+  # "AlreadyExists" is the expected outcome on every run after the first.
+  failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
+  changed_when: create_observability_ns.rc == 0
+
+- name: Read existing Grafana admin password
+  # Grafana only applies adminPassword when its database is first created,
+  # so a previously generated password has to be reused on later runs.
+  command: >-
+    kubectl -n {{ observability_namespace }} get secret kube-prometheus-stack-grafana
+    -o jsonpath={.data.admin-password}
+  register: existing_grafana_secret
+  when: grafana_admin_password | length == 0
+  changed_when: false
+  failed_when: false
+  no_log: true
+
+- name: Set Grafana admin password
+  # Precedence: supplied password, then the deployed one, then a new random one.
+  set_fact:
+    grafana_password_effective: >-
+      {{ grafana_admin_password if grafana_admin_password | length > 0
+         else ((existing_grafana_secret.stdout | b64decode)
+               if (existing_grafana_secret.stdout | default('') | length > 0)
+               else lookup('password', '/dev/null length=32 chars=ascii_letters,digits')) }}
+  no_log: true
+
+- name: Write kube-prometheus-stack values
+  copy:
+    dest: /tmp/kube-prometheus-stack-values.yaml
+    # Holds the admin password, so keep it unreadable for other users.
+    mode: "0600"
+    content: |
+      grafana:
+        enabled: true
+        adminPassword: {{ grafana_password_effective | string | to_json }}
+        persistence:
+          enabled: true
+          storageClassName: {{ grafana_storage_class }}
+          size: {{ grafana_storage_size }}
+        service:
+          type: ClusterIP
+      prometheus:
+        prometheusSpec:
+          retention: 7d
+          storageSpec:
+            volumeClaimTemplate:
+              spec:
+                storageClassName: {{ prometheus_storage_class }}
+                accessModes: ["ReadWriteOnce"]
+                resources:
+                  requests:
+                    storage: {{ prometheus_storage_size }}
+      alertmanager:
+        enabled: false
+      kubeEtcd:
+        enabled: false
+      kubeControllerManager:
+        enabled: false
+      kubeScheduler:
+        enabled: false
+  no_log: true
+
+- name: Add Prometheus Helm repo
+  command: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+  register: add_prom_repo
+  failed_when: add_prom_repo.rc != 0 and "already exists" not in add_prom_repo.stderr
+  changed_when: add_prom_repo.rc == 0
+
+- name: Add Grafana Helm repo
+  command: helm repo add grafana https://grafana.github.io/helm-charts
+  register: add_grafana_repo
+  failed_when: add_grafana_repo.rc != 0 and "already exists" not in add_grafana_repo.stderr
+  changed_when: add_grafana_repo.rc == 0
+
+- name: Update Helm repos
+  command: helm repo update
+  changed_when: false
+
+- name: Install kube-prometheus-stack
+  command: >-
+    helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack
+    --namespace {{ observability_namespace }}
+    --version {{ prometheus_chart_version }}
+    --values /tmp/kube-prometheus-stack-values.yaml
+    --wait
+    --timeout 10m
+  changed_when: true
+
+- name: Write Loki values
+  copy:
+    dest: /tmp/loki-values.yaml
+    mode: "0644"
+    content: |
+      # Chart 6.x defaults to SimpleScalable; single-binary mode must be
+      # selected explicitly and the scalable targets scaled to zero.
+      deploymentMode: SingleBinary
+      loki:
+        auth_enabled: false
+        commonConfig:
+          replication_factor: 1
+        storage:
+          type: filesystem
+        # The chart refuses to render without an explicit schema.
+        schemaConfig:
+          configs:
+            - from: "2024-04-01"
+              store: tsdb
+              object_store: filesystem
+              schema: v13
+              index:
+                prefix: loki_index_
+                period: 24h
+      singleBinary:
+        replicas: 1
+        persistence:
+          enabled: true
+          storageClass: {{ loki_storage_class }}
+          size: {{ loki_storage_size }}
+      backend:
+        replicas: 0
+      read:
+        replicas: 0
+      write:
+        replicas: 0
+      # The chart enables memcached caches with large memory requests by
+      # default; leave them out of this lightweight install.
+      chunksCache:
+        enabled: false
+      resultsCache:
+        enabled: false
+      test:
+        enabled: false
+      monitoring:
+        selfMonitoring:
+          enabled: false
+      lokiCanary:
+        enabled: false
+
+- name: Install Loki
+  command: >-
+    helm upgrade --install loki grafana/loki
+    --namespace {{ observability_namespace }}
+    --version {{ loki_chart_version }}
+    --values /tmp/loki-values.yaml
+    --wait
+    --timeout 10m
+  changed_when: true
+
+- name: Write Promtail values
+  copy:
+    dest: /tmp/promtail-values.yaml
+    mode: "0644"
+    content: |
+      config:
+        clients:
+          - url: http://loki-gateway.{{ observability_namespace }}.svc.cluster.local/loki/api/v1/push
+
+- name: Install Promtail
+  command: >-
+    helm upgrade --install promtail grafana/promtail
+    --namespace {{ observability_namespace }}
+    --version {{ promtail_chart_version }}
+    --values /tmp/promtail-values.yaml
+    --wait
+    --timeout 10m
+  changed_when: true
+
+- name: Create Grafana Loki datasource
+  # Labelled grafana_datasource so the Grafana datasource sidecar can load it.
+  command: kubectl apply -f -
+  args:
+    stdin: |
+      apiVersion: v1
+      kind: ConfigMap
+      metadata:
+        name: grafana-datasource-loki
+        namespace: {{ observability_namespace }}
+        labels:
+          grafana_datasource: "1"
+      data:
+        loki-datasource.yaml: |
+          apiVersion: 1
+          datasources:
+            - name: Loki
+              type: loki
+              access: proxy
+              url: http://loki-gateway.{{ observability_namespace }}.svc.cluster.local
+              isDefault: false
+  register: loki_datasource_apply
+  # kubectl reports "unchanged" when the ConfigMap already matches.
+  changed_when: "'unchanged' not in loki_datasource_apply.stdout"
+
+- name: Show observability access details
+  debug:
+    msg: |
+      Observability stack deployed.
+      Namespace: {{ observability_namespace }}
+      Grafana (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
+      Prometheus (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
+      Grafana admin password: {{ grafana_password_effective if grafana_admin_password | length == 0 else '(value supplied via grafana_admin_password)' }}
diff --git a/ansible/site.yml b/ansible/site.yml
index ded1dbf..1770881 100644
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -89,6 +89,13 @@
   roles:
     - csi
 
+- name: Deploy observability stack
+  hosts: control_plane[0]
+  become: true
+
+  roles:
+    - observability
+
 - name: Finalize
   hosts: localhost
   connection: local