diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
index e71b5ba..c220492 100644
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -406,6 +406,13 @@ jobs:
         env:
           ANSIBLE_HOST_KEY_CHECKING: "False"
 
+      - name: Post-deploy tailnet smoke checks
+        working-directory: ansible
+        run: |
+          ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
+        env:
+          ANSIBLE_HOST_KEY_CHECKING: "False"
+
       - name: Upload Kubeconfig
         uses: actions/upload-artifact@v3
        with:
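The new workflow step is a plain ad-hoc Ansible run, so the same smoke check can be reproduced from a workstation checkout without going through CI. A minimal sketch, assuming `ansible/inventory.ini` exists from a previous deploy and SSH access to the control plane is already configured:

```bash
# Mirror of the CI step: push the smoke-check script to the first control-plane
# host and execute it there (kubectl and the tailscale CLI live on that node).
cd ansible
ANSIBLE_HOST_KEY_CHECKING=False \
  ansible -i inventory.ini 'control_plane[0]' \
  -m script -a "../scripts/smoke-check-tailnet-services.sh"
```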
diff --git a/AGENTS.md b/AGENTS.md
index 377ae49..c4d07f5 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -112,7 +112,7 @@ Repository guide for agentic contributors working in this repo.
 
 - **rancher-backup post-install job** (`rancher-backup-patch-sa`) uses a postRenderer in the HelmRelease to replace the broken `rancher/kuberlr-kubectl` image with `rancher/kubectl`. Do NOT set `s3` block in HelmRelease values — put S3 config in the Backup CR instead.
 - **B2 ExternalSecret** must use key names `accessKey` and `secretKey` (not `aws_access_key_id`/`aws_secret_access_key`).
-- **Stale Tailscale devices**: After cluster rebuild, delete stale offline `rancher` devices before booting. The `tailscale-cleanup` Ansible role handles this via the Tailscale API.
+- **Stale Tailscale devices**: Before service proxies exist, `tailscale-cleanup` removes stale offline `rancher`/`grafana`/`prometheus`/`flux` devices via the Tailscale API. Once the cluster has Tailscale services, cleanup is skipped so live proxy nodes are not deleted out from under the operator.
 - **Restricted B2 keys**: `b2_authorize_account` may return `allowed.bucketId: null`. CI falls back to `b2_list_buckets` to resolve bucket ID by name.
 
 ## Secrets / Security
@@ -128,7 +128,7 @@ Repository guide for agentic contributors working in this repo.
 3. Flux bootstrap: install kubectl/flux → rewrite kubeconfig → apply CRDs → apply graph → wait for addons
 4. Rancher wait: wait for Rancher and backup operator to be ready
 5. B2 restore: authorize B2 → find latest backup → create Restore CR → poll until ready
-6. Health checks: nodes, Flux objects, pods, storage class
+6. Health checks: nodes, Flux objects, pods, storage class, and Tailscale URL smoke tests from `control_plane[0]`
 
 ## Editing Practices
 
diff --git a/README.md b/README.md
index 7424b57..6d2431e 100644
--- a/README.md
+++ b/README.md
@@ -7,18 +7,11 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
 | Component | Details |
 |-----------|---------|
 | **Control Plane** | 3x CX23 (HA) |
-| **Workers** | 4x CX33 |
-| **Total Cost** | €28.93/mo |
+| **Workers** | 3x CX33 |
 | **K8s** | k3s (latest, HA) |
 | **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
-| **Access** | SSH/API and Rancher UI restricted to Tailnet |
-| **Bootstrap** | Terraform + Ansible |
-
-### Cluster Resources
-- 22 vCPU total (6 CP + 16 workers)
-- 44 GB RAM total (12 CP + 32 workers)
-- 440 GB SSD storage
-- 140 TB bandwidth allocation
+| **Access** | SSH/API and private services restricted to Tailnet |
+| **Bootstrap** | Terraform + Ansible + Flux |
 
 ## Prerequisites
 
@@ -143,15 +136,14 @@ export KUBECONFIG=$(pwd)/outputs/kubeconfig
 kubectl get nodes
 ```
 
-Kubeconfig endpoint is rewritten to the primary control-plane tailnet hostname (`k8s-cluster-cp-1.`).
+Use `scripts/refresh-kubeconfig.sh ` to refresh kubeconfig against the primary control-plane public IP after rebuilds.
 
 ## Gitea CI/CD
 
 This repository includes Gitea workflows for:
 
-- **terraform-plan**: Runs on PRs, shows planned changes
-- **terraform-apply**: Runs on main branch after merge
-- **ansible-deploy**: Runs after terraform apply
+- **deploy**: End-to-end Terraform + Ansible + Flux bootstrap + restore + health checks
+- **destroy**: Cluster teardown with backup-aware cleanup
 - **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
 
 ### Required Gitea Secrets
@@ -181,13 +173,13 @@ This repo uses Flux for continuous reconciliation after Terraform + Ansible boot
 
 ### Stable private-only baseline
 
-The current default target is a deliberately simplified baseline:
+The current default target is the HA private baseline:
 
-- `1` control plane node
-- `2` worker nodes
+- `3` control plane nodes
+- `3` worker nodes
 - private Hetzner network only
-- Tailscale for operator access
-- Flux-managed core addons only
+- Tailscale for operator and service access
+- Flux-managed platform addons with `apps` suspended by default
 
 Detailed phase gates and success criteria live in `STABLE_BASELINE.md`.
 
@@ -232,31 +224,30 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
 ### Current addon status
 
 - Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
-- Active Flux addons for stable baseline: `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`.
-- Deferred addons: `addon-ccm`, `addon-csi`, `addon-observability`, `addon-observability-content` (to be added after baseline is stable).
-- Ansible is limited to cluster bootstrap, private-access setup, and prerequisite secret creation for Flux-managed addons.
+- Active Flux addons for the current baseline: `addon-ccm`, `addon-csi`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`.
+- `apps` remains suspended until workload rollout is explicitly enabled.
+- Ansible is limited to cluster bootstrap, prerequisite secret creation, pre-proxy Tailscale cleanup, and kubeconfig finalization.
 - Weave GitOps / Flux UI is no longer deployed; use Rancher or the `flux` CLI for Flux operations.
 
 ### Rancher access
 
-- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/dashboard/`.
+- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/`.
 - The public Hetzner load balancer path is not used for Rancher.
-- Rancher uses the CNPG-backed PostgreSQL cluster in `cnpg-cluster`.
+- Rancher stores state in embedded etcd; no external database is used.
 
 ### Stable baseline acceptance
 
 A rebuild is considered successful only when all of the following pass without manual intervention:
 
-- Terraform create succeeds for the default `1` control plane and `2` workers.
+- Terraform create succeeds for the default `3` control planes and `3` workers.
 - Ansible bootstrap succeeds end-to-end.
 - All nodes become `Ready`.
 - Flux core reconciliation is healthy.
 - External Secrets Operator is ready.
 - Tailscale operator is ready.
+- Tailnet smoke checks pass for Rancher, Grafana, and Prometheus.
 - Terraform destroy succeeds cleanly or succeeds after workflow retries.
 
-_Note: Observability stack (Grafana/Prometheus) is deferred and will be added once the core platform baseline is stable._
-
 ## Observability Stack
 
 Flux deploys a lightweight observability stack in the `observability` namespace:
@@ -301,9 +292,11 @@ Grafana password: value of `GRAFANA_ADMIN_PASSWORD` secret (or the generated val
 
 export KUBECONFIG=$(pwd)/outputs/kubeconfig
 kubectl -n tailscale-system get pods
-kubectl -n observability get svc kube-prometheus-stack-grafana kube-prometheus-stack-prometheus
-kubectl -n observability describe svc kube-prometheus-stack-grafana | grep TailscaleProxyReady
-kubectl -n observability describe svc kube-prometheus-stack-prometheus | grep TailscaleProxyReady
+kubectl -n cattle-system get svc rancher-tailscale
+kubectl -n observability get svc grafana-tailscale prometheus-tailscale
+kubectl -n cattle-system describe svc rancher-tailscale | grep TailscaleProxyReady
+kubectl -n observability describe svc grafana-tailscale | grep TailscaleProxyReady
+kubectl -n observability describe svc prometheus-tailscale | grep TailscaleProxyReady
 ```
 
 If `TailscaleProxyReady=False`, check:
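With Weave GitOps removed, the suspended `apps` Kustomization and overall reconciliation health are checked through the `flux` CLI. A short sketch, assuming the kubeconfig produced under `outputs/`:

```bash
# apps should report SUSPENDED=True; the platform Kustomizations and
# HelmReleases should all show Ready.
export KUBECONFIG=$(pwd)/outputs/kubeconfig
flux get kustomizations -n flux-system
flux get helmreleases -A
```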
diff --git a/STABLE_BASELINE.md b/STABLE_BASELINE.md
index 4640deb..58d43ed 100644
--- a/STABLE_BASELINE.md
+++ b/STABLE_BASELINE.md
@@ -8,8 +8,11 @@ This document defines the current engineering target for this repository.
 - 3 workers
 - Hetzner Load Balancer for Kubernetes API
 - private Hetzner network
-- Tailscale operator access
-- Rancher UI exposed only through Tailscale (`rancher.silverside-gopher.ts.net`)
+- Tailscale operator access and service exposure
+- Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`)
+- Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`)
+- Prometheus exposed through Tailscale (`prometheus.silverside-gopher.ts.net:9090`)
+- `apps` Kustomization suspended by default
 
 ## In Scope
 
@@ -21,12 +24,15 @@ This document defines the current engineering target for this repository.
 - **Hetzner CSI for persistent volumes (via Flux)**
 - Flux core reconciliation
 - External Secrets Operator with Doppler
-- Tailscale private access
+- Tailscale private access and smoke-check validation
+- cert-manager
+- Rancher and rancher-backup
+- Observability stack (Grafana, Prometheus, Loki, Promtail)
 - Persistent volume provisioning validated
 
 ## Deferred for Later Phases
 
-- Observability stack (deferred - complex helm release needs separate debugging)
+- app workloads in `apps/`
 
 ## Out of Scope
 
@@ -49,17 +55,10 @@ This document defines the current engineering target for this repository.
 9. **CSI deploys and creates `hcloud-volumes` StorageClass**.
 10. **PVC provisioning tested and working**.
 11. External Secrets sync required secrets.
-12. Tailscale private access works, including Rancher UI access.
-13. Terraform destroy succeeds cleanly or via workflow retry.
+12. Tailscale private access works for Rancher, Grafana, and Prometheus.
+13. CI smoke checks pass for Tailscale DNS resolution, `tailscale ping`, and HTTP reachability.
+14. Terraform destroy succeeds cleanly or via workflow retry.
 
 ## Success Criteria
 
-✅ **ACHIEVED** - HA Cluster with CCM/CSI:
-- Build 1: Initial CCM/CSI deployment and validation (2026-03-23)
-- Build 2: Full destroy/rebuild cycle successful (2026-03-23)
-
-🔄 **IN PROGRESS** - HA Control Plane Validation:
-- Build 3: Deploy 3-3 topology with Load Balancer
-- Build 4: Destroy/rebuild to validate HA configuration
-
-Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes.
+Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes, no manual `kubectl` patching, and no manual Tailscale proxy recreation.
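Phase gates 12 and 13 can be spot-checked by hand with the same three probes the CI script automates. An illustrative run from any machine joined to the tailnet; the expected HTTP status codes mirror the script's allow-list (200/301/302/401/403):

```bash
# MagicDNS resolution, tailnet reachability, then an HTTP status probe.
getent hosts rancher.silverside-gopher.ts.net
tailscale ping -c 1 grafana.silverside-gopher.ts.net
curl -sk -o /dev/null -w '%{http_code}\n' https://rancher.silverside-gopher.ts.net/
curl -s -o /dev/null -w '%{http_code}\n' http://prometheus.silverside-gopher.ts.net:9090/
```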
diff --git a/infrastructure/addons/tailscale-operator/helmrelease-tailscale-operator.yaml b/infrastructure/addons/tailscale-operator/helmrelease-tailscale-operator.yaml
index 6be113f..deb37ce 100644
--- a/infrastructure/addons/tailscale-operator/helmrelease-tailscale-operator.yaml
+++ b/infrastructure/addons/tailscale-operator/helmrelease-tailscale-operator.yaml
@@ -9,7 +9,7 @@ spec:
   chart:
     spec:
       chart: tailscale-operator
-      version: 1.95.91
+      version: 1.96.5
       sourceRef:
         kind: HelmRepository
         name: tailscale
diff --git a/infrastructure/addons/tailscale-operator/helmrepository-tailscale.yaml b/infrastructure/addons/tailscale-operator/helmrepository-tailscale.yaml
index bc283f7..708d5f7 100644
--- a/infrastructure/addons/tailscale-operator/helmrepository-tailscale.yaml
+++ b/infrastructure/addons/tailscale-operator/helmrepository-tailscale.yaml
@@ -5,4 +5,4 @@ metadata:
   namespace: flux-system
 spec:
   interval: 1h
-  url: https://pkgs.tailscale.com/unstable/helmcharts
+  url: https://pkgs.tailscale.com/helmcharts
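Switching the HelmRepository from the unstable feed to the stable one only works if the pinned chart version is actually published there. One way to check before bumping `version`, assuming `helm` is installed locally:

```bash
# List the tailscale-operator chart versions available in the stable repo.
helm repo add tailscale https://pkgs.tailscale.com/helmcharts
helm repo update
helm search repo tailscale/tailscale-operator --versions | head
```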
"prometheus-tailscale" "prometheus.silverside-gopher.ts.net" "http://prometheus.silverside-gopher.ts.net:9090/"