diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 7a70d60..a1a6cc6 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -304,11 +304,8 @@ jobs: kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s - # Observability stack deferred - complex helm release timing out, debug separately - # kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s - # kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s - - name: Wait for Rancher and fix backup operator + - name: Wait for Rancher and backup operator env: KUBECONFIG: outputs/kubeconfig run: | @@ -320,15 +317,6 @@ jobs: echo "Waiting for rancher-backup operator..." kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true - echo "Patching default SA in cattle-resources-system..." - kubectl patch serviceaccount default -n cattle-resources-system -p '{"automountServiceAccountToken": false}' || true - - echo "Cleaning up failed patch-sa jobs..." - kubectl delete job -n cattle-resources-system rancher-backup-patch-sa --ignore-not-found=true || true - - echo "Force reconciling rancher-backup HelmRelease..." - flux reconcile helmrelease rancher-backup -n flux-system --timeout=5m || true - - name: Restore Rancher from latest B2 backup env: KUBECONFIG: outputs/kubeconfig diff --git a/AGENTS.md b/AGENTS.md index 69cbcbd..6573355 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,18 +18,19 @@ Repository guide for agentic contributors working in this repo. - **cert-manager** is required — Tailscale LoadBalancer does L4 TCP passthrough, so Rancher serves its own TLS. 
- **Secrets flow**: Doppler → `ClusterSecretStore` (doppler-hetznerterra) → `ExternalSecret` resources → k8s Secrets. - Rancher is reachable only over Tailscale at `https://rancher.silverside-gopher.ts.net/`. +- Grafana, Prometheus, and Flux UI are also exposed via dedicated Tailscale LoadBalancer services at `http://grafana.silverside-gopher.ts.net/`, `http://prometheus.silverside-gopher.ts.net/`, `http://flux.silverside-gopher.ts.net:9001/`. ## Important Files - `terraform/main.tf` — provider and version pins - `terraform/variables.tf` — input surface and defaults - `terraform/firewall.tf` — firewall rules (tailnet CIDR, internal cluster ports) -- `ansible/site.yml` — ordered bootstrap playbook (roles: common → k3s-server → ccm → k3s-agent → private-access → doppler → tailscale-cleanup) +- `ansible/site.yml` — ordered bootstrap playbook (roles: common → k3s-server → ccm → k3s-agent → doppler → tailscale-cleanup) - `ansible/generate_inventory.py` — renders `ansible/inventory.ini` from Terraform outputs via Jinja2 - `clusters/prod/flux-system/` — Flux GitRepository and top-level Kustomization resources - `infrastructure/addons/kustomization.yaml` — root addon graph with dependency ordering - `infrastructure/addons//` — each addon is a self-contained dir with its own `kustomization.yaml` -- `.gitea/workflows/deploy.yml` — canonical CI: terraform → ansible → flux bootstrap → rancher fix → B2 restore +- `.gitea/workflows/deploy.yml` — canonical CI: terraform → ansible → flux bootstrap → B2 restore → health checks ## Build / Validate / Test @@ -109,7 +110,7 @@ Repository guide for agentic contributors working in this repo. ## Known Issues & Workarounds -- **rancher-backup post-install job** (`rancher-backup-patch-sa`) fails because `rancher/kuberlr-kubectl` can't download kubectl. CI patches the SA and deletes the failed job. Do NOT set `s3` block in HelmRelease values — put S3 config in the Backup CR instead. 
+- **rancher-backup post-install job** (`rancher-backup-patch-sa`): the HelmRelease uses a postRenderer to replace the job's broken `rancher/kuberlr-kubectl` image with `rancher/kubectl`. Do NOT set an `s3` block in HelmRelease values — put S3 config in the Backup CR instead. - **B2 ExternalSecret** must use key names `accessKey` and `secretKey` (not `aws_access_key_id`/`aws_secret_access_key`). - **Stale Tailscale devices**: After cluster rebuild, delete stale offline `rancher` devices before booting. The `tailscale-cleanup` Ansible role handles this via the Tailscale API. - **Restricted B2 keys**: `b2_authorize_account` may return `allowed.bucketId: null`. CI falls back to `b2_list_buckets` to resolve bucket ID by name. @@ -125,7 +126,7 @@ Repository guide for agentic contributors working in this repo. 1. Terraform: fmt check → init → validate → import existing servers → plan → apply (main only) 2. Ansible: install deps → generate inventory → run site.yml with extra vars (secrets injected from Gitea) 3. Flux bootstrap: install kubectl/flux → rewrite kubeconfig → apply CRDs → apply graph → wait for addons -4. Rancher post-install: wait for Rancher/backup operator → patch SA → clean failed jobs → force reconcile +4. Rancher wait: wait for Rancher and backup operator to be ready 5. B2 restore: authorize B2 → find latest backup → create Restore CR → poll until ready 6. 
Health checks: nodes, Flux objects, pods, storage class diff --git a/ansible/roles/private-access/tasks/main.yml b/ansible/roles/private-access/tasks/main.yml deleted file mode 100644 index 8485cb4..0000000 --- a/ansible/roles/private-access/tasks/main.yml +++ /dev/null @@ -1,86 +0,0 @@ ---- -- name: Create systemd unit for Grafana private access - template: - src: kubectl-port-forward.service.j2 - dest: /etc/systemd/system/k8s-portforward-grafana.service - mode: "0644" - vars: - unit_description: Port-forward Grafana for Tailscale access - unit_namespace: observability - unit_target: svc/observability-kube-prometheus-stack-grafana - unit_local_port: 13080 - unit_remote_port: 80 - -- name: Create systemd unit for Prometheus private access - template: - src: kubectl-port-forward.service.j2 - dest: /etc/systemd/system/k8s-portforward-prometheus.service - mode: "0644" - vars: - unit_description: Port-forward Prometheus for Tailscale access - unit_namespace: observability - unit_target: svc/observability-kube-prometh-prometheus - unit_local_port: 19090 - unit_remote_port: 9090 - -- name: Create systemd unit for Flux UI private access - template: - src: kubectl-port-forward.service.j2 - dest: /etc/systemd/system/k8s-portforward-flux-ui.service - mode: "0644" - vars: - unit_description: Port-forward Flux UI for Tailscale access - unit_namespace: flux-system - unit_target: svc/flux-system-weave-gitops - unit_local_port: 19001 - unit_remote_port: 9001 - -- name: Create systemd unit for Rancher HTTP private access - template: - src: kubectl-port-forward.service.j2 - dest: /etc/systemd/system/k8s-portforward-rancher.service - mode: "0644" - vars: - unit_description: Port-forward Rancher HTTP for Tailscale access - unit_namespace: cattle-system - unit_target: svc/cattle-system-rancher - unit_local_port: 19442 - unit_remote_port: 80 - -- name: Create systemd unit for Rancher HTTPS private access - template: - src: kubectl-port-forward.service.j2 - dest: 
/etc/systemd/system/k8s-portforward-rancher-https.service - mode: "0644" - vars: - unit_description: Port-forward Rancher HTTPS for Tailscale access - unit_namespace: cattle-system - unit_target: svc/cattle-system-rancher - unit_local_port: 19443 - unit_remote_port: 443 - -- name: Reload systemd - systemd: - daemon_reload: true - -- name: Enable and start private access port-forward services - systemd: - name: "{{ item }}" - enabled: true - state: started - loop: - - k8s-portforward-grafana.service - - k8s-portforward-prometheus.service - - k8s-portforward-flux-ui.service - - k8s-portforward-rancher.service - - k8s-portforward-rancher-https.service - -- name: Configure Tailscale Serve for private access endpoints - shell: >- - tailscale serve reset && - tailscale serve --bg --tcp={{ private_access_grafana_port }} tcp://127.0.0.1:13080 && - tailscale serve --bg --tcp={{ private_access_prometheus_port }} tcp://127.0.0.1:19090 && - tailscale serve --bg --tcp={{ private_access_flux_port }} tcp://127.0.0.1:19001 && - tailscale serve --bg --tcp={{ private_access_rancher_port }} tcp://127.0.0.1:19442 && - tailscale serve --bg --tcp=9443 tcp://127.0.0.1:19443 - changed_when: true diff --git a/ansible/roles/private-access/templates/kubectl-port-forward.service.j2 b/ansible/roles/private-access/templates/kubectl-port-forward.service.j2 deleted file mode 100644 index 6abb573..0000000 --- a/ansible/roles/private-access/templates/kubectl-port-forward.service.j2 +++ /dev/null @@ -1,13 +0,0 @@ -[Unit] -Description={{ unit_description }} -After=network-online.target k3s.service -Wants=network-online.target - -[Service] -Type=simple -Restart=always -RestartSec=5 -ExecStart=/usr/local/bin/kubectl -n {{ unit_namespace }} port-forward --address 127.0.0.1 {{ unit_target }} {{ unit_local_port }}:{{ unit_remote_port }} - -[Install] -WantedBy=multi-user.target diff --git a/infrastructure/addons/kustomization-traefik-config.yaml b/infrastructure/addons/kustomization-traefik-config.yaml 
deleted file mode 100644 index 12e8c4e..0000000 --- a/infrastructure/addons/kustomization-traefik-config.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: kustomize.toolkit.fluxcd.io/v1 -kind: Kustomization -metadata: - name: addon-traefik-config - namespace: flux-system -spec: - interval: 10m - prune: true - sourceRef: - kind: GitRepository - name: platform - path: ./infrastructure/addons/traefik-config - wait: true - timeout: 5m - suspend: false - dependsOn: - - name: addon-tailscale-operator - - name: addon-tailscale-proxyclass diff --git a/infrastructure/addons/observability/grafana-ingress.yaml b/infrastructure/addons/observability/grafana-ingress.yaml deleted file mode 100644 index 7156c5d..0000000 --- a/infrastructure/addons/observability/grafana-ingress.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: grafana - namespace: observability -spec: - ingressClassName: traefik - rules: - - http: - paths: - - path: /grafana - pathType: Prefix - backend: - service: - name: observability-kube-prometheus-stack-grafana - port: - number: 80 diff --git a/infrastructure/addons/observability/prometheus-ingress.yaml b/infrastructure/addons/observability/prometheus-ingress.yaml deleted file mode 100644 index 3251184..0000000 --- a/infrastructure/addons/observability/prometheus-ingress.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: prometheus - namespace: observability -spec: - ingressClassName: traefik - rules: - - http: - paths: - - path: /prometheus - pathType: Prefix - backend: - service: - name: observability-kube-prometh-prometheus - port: - number: 9090 diff --git a/infrastructure/addons/observability/traefik-tailscale-service.yaml b/infrastructure/addons/observability/traefik-tailscale-service.yaml deleted file mode 100644 index ddcb44e..0000000 --- a/infrastructure/addons/observability/traefik-tailscale-service.yaml +++ /dev/null @@ -1,27 +0,0 @@ 
-apiVersion: v1 -kind: Service -metadata: - name: traefik-tailscale - namespace: kube-system - annotations: - tailscale.com/hostname: observability - tailscale.com/proxy-class: infra-stable -spec: - type: LoadBalancer - loadBalancerClass: tailscale - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - protocol: TCP - targetPort: web - - name: websecure - port: 443 - protocol: TCP - targetPort: websecure - - name: flux - port: 9001 - protocol: TCP - targetPort: 9001