---
# Deploy Cluster
#
# CI/CD pipeline that:
#   1. terraform job — fmt/validate/plan on every push & PR; apply on main.
#   2. ansible job (main pushes only) — configures the k3s cluster, bootstraps
#      Flux, waits for Rancher + rancher-backup, restores Rancher state from
#      Backblaze B2, then runs post-deploy health and tailnet smoke checks.
#
# NOTE: this file was recovered from a whitespace-mangled copy; the Restore CR
# in "Restore Rancher from latest B2 backup" is a best-effort reconstruction
# (see the NOTE(review) comment there).
name: Deploy Cluster

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

env:
  TF_VERSION: "1.7.0"
  # Terraform input variables, injected via TF_VAR_* convention.
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}

jobs:
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      # continue-on-error so the plan output can be posted to the PR even on
      # failure; the "Fail if plan failed" step below restores the failure.
      - name: Terraform Plan
        id: plan
        working-directory: terraform
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color
        continue-on-error: true

      - name: Post Plan to PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const output = `#### Terraform Plan
            \`\`\`
            ${{ steps.plan.outputs.stdout }}
            \`\`\``;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });

      - name: Fail if plan failed
        if: steps.plan.outcome == 'failure'
        run: exit 1

      # Proxmox leaves vm-<id>-cloudinit volumes behind when a VM is destroyed
      # out-of-band; delete them so re-created VMs don't collide on the volume.
      - name: Cleanup orphan Proxmox cloud-init volumes
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: |
          set -euo pipefail
          python3 - <<'PY'
          import os
          import ssl
          import urllib.error
          import urllib.parse
          import urllib.request

          endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
          token_id = os.environ["TF_VAR_proxmox_api_token_id"]
          token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
          insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"

          node = "flex"
          storage = "Flash"
          vm_ids = [200, 201, 202, 210, 211, 212, 213, 214]

          # Self-signed Proxmox endpoints need certificate verification off.
          context = ssl._create_unverified_context() if insecure else None
          headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}

          def request(method, path):
              req = urllib.request.Request(
                  f"{endpoint}/api2/json{path}",
                  method=method,
                  headers=headers,
              )
              return urllib.request.urlopen(req, context=context, timeout=30)

          def vm_exists(vmid):
              try:
                  request("GET", f"/nodes/{node}/qemu/{vmid}/status/current").close()
                  return True
              except urllib.error.HTTPError as err:
                  if err.code == 404:
                      return False
                  # Older Proxmox reports a missing VM config as HTTP 500.
                  if err.code == 500 and "conf' does not exist" in err.reason:
                      return False
                  raise

          for vmid in vm_ids:
              if vm_exists(vmid):
                  print(f"VM {vmid} exists; keeping cloud-init volume")
                  continue
              volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
              try:
                  request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}").close()
                  print(f"Deleted orphan cloud-init volume for VM {vmid}")
              except urllib.error.HTTPError as err:
                  if err.code == 404:
                      print(f"No orphan cloud-init volume for VM {vmid}")
                      continue
                  raise
          PY

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          terraform apply \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -auto-approve

      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          mkdir -p outputs
          terraform output -json > outputs/terraform_outputs.json

      - name: Upload Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        # v3 is deprecated/disabled on GitHub-hosted runners; v4 is required.
        uses: actions/upload-artifact@v4
        with:
          name: terraform-outputs
          path: outputs/terraform_outputs.json

  ansible:
    name: Ansible
    runs-on: ubuntu-latest
    needs: terraform
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Get Terraform Outputs
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Install Python Dependencies
        run: |
          # GitHub-hosted runners execute as a non-root user; apt needs sudo.
          sudo apt-get update && sudo apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py

      - name: Run Ansible Playbook
        working-directory: ansible
        run: |
          ansible-playbook site.yml \
            -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
            -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
            -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
            -e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
            -e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
            -e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
            -e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Install kubectl
        run: |
          # /usr/local/bin is root-owned on the runner; sudo is required.
          sudo curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo chmod +x /usr/local/bin/kubectl

      # The kubeconfig fetched from the cluster points at the node's own
      # hostname; rewrite it to the IP the runner can actually reach.
      - name: Rewrite kubeconfig for runner-reachable API
        working-directory: terraform
        run: |
          set -euo pipefail
          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
          sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig

      - name: Bootstrap Flux source and reconciliation graph
        env:
          KUBECONFIG: outputs/kubeconfig
          FLUX_GIT_HOST: "64.176.189.59"
          FLUX_GIT_PORT: "2222"
        run: |
          set -euo pipefail

          # Wait for a Flux controller Deployment rollout; dump diagnostics
          # and fail the job if it does not become available in time.
          flux_rollout_status() {
            local deployment="$1"
            if ! kubectl -n flux-system rollout status "deployment/${deployment}" --timeout=900s; then
              kubectl -n flux-system get pods -o wide
              kubectl -n flux-system describe deployment "${deployment}"
              kubectl -n flux-system describe pods -l "app=${deployment}"
              exit 1
            fi
          }

          # Poll until a resource exists (namespace may be "" for
          # cluster-scoped resources) or the timeout elapses.
          wait_for_resource() {
            local namespace="$1"
            local resource="$2"
            local timeout_seconds="$3"
            local elapsed=0
            until {
              if [ -n "${namespace}" ]; then
                kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
              else
                kubectl get "${resource}" >/dev/null 2>&1
              fi
            }; do
              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
                echo "Timed out waiting for ${resource} to exist" >&2
                kubectl -n flux-system get kustomizations,helmreleases || true
                exit 1
              fi
              sleep 10
              elapsed=$((elapsed + 10))
            done
          }

          # Diagnostics bundle for external-secrets bootstrap failures.
          eso_diagnostics() {
            kubectl -n flux-system get kustomizations,ocirepositories,helmrepositories,helmcharts,helmreleases || true
            kubectl -n flux-system describe kustomization addon-external-secrets || true
            kubectl -n flux-system describe ocirepository external-secrets || true
            kubectl -n flux-system describe helmrelease external-secrets || true
            kubectl -n external-secrets get pods -o wide || true
          }

          # Poll a HelmRelease until Ready=True; fail fast on Stalled=True.
          wait_for_helmrelease_ready() {
            local release_name="$1"
            local target_namespace="$2"
            local timeout_seconds="$3"
            local elapsed=0
            local ready
            local stalled
            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
              ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
              stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
              if [ "${ready}" = "True" ]; then
                return 0
              fi
              if [ "${stalled}" = "True" ]; then
                echo "HelmRelease ${release_name} is stalled" >&2
                kubectl -n flux-system describe "helmrelease/${release_name}" || true
                kubectl -n "${target_namespace}" get pods -o wide || true
                exit 1
              fi
              sleep 10
              elapsed=$((elapsed + 10))
            done
            echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
            kubectl -n flux-system describe "helmrelease/${release_name}" || true
            kubectl -n "${target_namespace}" get pods -o wide || true
            exit 1
          }

          # Force-reconcile an OCIRepository-backed HelmRelease and wait for it.
          wait_for_flux_oci_helm_release() {
            local oci_name="$1"
            local release_name="$2"
            local target_namespace="$3"
            local oci_timeout="$4"
            local release_timeout="$5"
            local reconcile_at
            wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${oci_name}" 600
            reconcile_at="$(date +%s)"
            kubectl -n flux-system annotate "ocirepository/${oci_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
            kubectl -n flux-system annotate "helmrelease/${release_name}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite
            if ! kubectl -n flux-system wait --for=condition=Ready "ocirepository/${oci_name}" --timeout="${oci_timeout}"; then
              eso_diagnostics
              exit 1
            fi
            wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
          }

          # Diagnostics bundle for HelmRepository-backed release failures.
          flux_helm_diagnostics() {
            local repo_name="$1"
            local chart_name="$2"
            local release_name="$3"
            local target_namespace="$4"
            kubectl -n flux-system get helmrepositories,helmcharts,helmreleases || true
            kubectl -n flux-system describe helmrepository "${repo_name}" || true
            kubectl -n flux-system describe helmchart.source.toolkit.fluxcd.io "${chart_name}" || true
            kubectl -n flux-system describe helmrelease "${release_name}" || true
            kubectl -n "${target_namespace}" get pods -o wide || true
          }

          # Force-reconcile a HelmRepository-backed HelmRelease and wait for it.
          wait_for_flux_helm_release() {
            local repo_name="$1"
            local chart_name="$2"
            local release_name="$3"
            local target_namespace="$4"
            local repo_timeout="$5"
            local chart_timeout="$6"
            local release_timeout="$7"
            local reconcile_at
            wait_for_resource flux-system "helmrepository.source.toolkit.fluxcd.io/${repo_name}" 600
            if ! kubectl -n flux-system wait --for=condition=Ready "helmrepository/${repo_name}" --timeout="${repo_timeout}"; then
              flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
              exit 1
            fi
            wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
            reconcile_at="$(date +%s)"
            kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
            kubectl -n flux-system annotate "helmrelease/${release_name}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite
            if ! kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
              flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
              exit 1
            fi
            wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
          }

          kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -

          # Git-over-SSH credentials for the Flux source-controller.
          ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
          kubectl -n flux-system create secret generic flux-system \
            --from-file=identity="$HOME/.ssh/id_ed25519" \
            --from-file=known_hosts=/tmp/flux_known_hosts \
            --dry-run=client -o yaml | kubectl apply -f -

          PRIMARY_CP_IP=$(python3 -c 'import json; print(json.load(open("outputs/terraform_outputs.json"))["primary_control_plane_ip"]["value"])')

          # Pre-pull the Flux controller images on cp-1 over SSH so the
          # controllers (pinned to cp-1 below) can start without registry flakes.
          FLUX_IMAGE_PULL_ATTEMPTS=45
          FLUX_IMAGE_PULL_SLEEP=10
          for image in \
            ghcr.io/fluxcd/source-controller:v1.8.0 \
            ghcr.io/fluxcd/kustomize-controller:v1.8.1 \
            ghcr.io/fluxcd/helm-controller:v1.5.1 \
            ghcr.io/fluxcd/notification-controller:v1.8.1; do
            pulled=false
            for attempt in $(seq 1 "${FLUX_IMAGE_PULL_ATTEMPTS}"); do
              echo "Pre-pulling ${image} on ${PRIMARY_CP_IP} (${attempt}/${FLUX_IMAGE_PULL_ATTEMPTS})"
              if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${PRIMARY_CP_IP}" \
                "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || ((sudo k3s crictl pull --platform linux/amd64 '${image}' || sudo k3s crictl pull '${image}') && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
                pulled=true
                break
              fi
              sleep "${FLUX_IMAGE_PULL_SLEEP}"
            done
            if [ "${pulled}" != "true" ]; then
              echo "Failed to pre-pull required Flux image ${image} on ${PRIMARY_CP_IP}" >&2
              exit 1
            fi
          done

          # Apply CRDs and controllers first
          kubectl apply -f clusters/prod/flux-system/gotk-components.yaml

          # Wait for CRDs to be established
          kubectl wait --for=condition=Established crd --all --timeout=120s

          # Then apply custom resources
          kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml

          # Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
          PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
          kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system delete pod --field-selector=status.phase!=Running || true

          flux_rollout_status source-controller
          flux_rollout_status kustomize-controller
          flux_rollout_status helm-controller

          kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s

          # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
          kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
          wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
          wait_for_resource "" crd/externalsecrets.external-secrets.io 900
          kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
          kubectl wait --for=condition=established --timeout=600s crd/externalsecrets.external-secrets.io
          kubectl -n external-secrets rollout status deployment/external-secrets-external-secrets --timeout=600s
          kubectl -n external-secrets rollout status deployment/external-secrets-external-secrets-webhook --timeout=600s
          wait_for_resource external-secrets service/external-secrets-external-secrets-webhook 600
          wait_for_resource external-secrets endpoints/external-secrets-external-secrets-webhook 600
          kubectl -n external-secrets wait --for=jsonpath='{.subsets[0].addresses[0].ip}' endpoints/external-secrets-external-secrets-webhook --timeout=600s

          # Create Doppler ClusterSecretStore now that ESO CRDs are available
          kubectl apply -f - <<'EOF'
          apiVersion: external-secrets.io/v1
          kind: ClusterSecretStore
          metadata:
            name: doppler-hetznerterra
          spec:
            provider:
              doppler:
                auth:
                  secretRef:
                    dopplerToken:
                      name: doppler-hetznerterra-service-token
                      key: dopplerToken
                      namespace: external-secrets
          EOF

          # Wait for the storage layer and private access components
          wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600
          kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
          wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
          kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
          kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
          kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
          kubectl get storageclass flash-nfs

      - name: Wait for Rancher and backup operator
        env:
          KUBECONFIG: outputs/kubeconfig
        run: |
          set -euo pipefail

          # Poll until a resource exists (namespace may be "" for
          # cluster-scoped resources) or the timeout elapses.
          wait_for_resource() {
            local namespace="$1"
            local resource="$2"
            local timeout_seconds="$3"
            local elapsed=0
            until {
              if [ -n "${namespace}" ]; then
                kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
              else
                kubectl get "${resource}" >/dev/null 2>&1
              fi
            }; do
              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
                echo "Timed out waiting for ${resource} to exist" >&2
                kubectl -n flux-system get kustomizations,helmrepositories,helmcharts,helmreleases || true
                exit 1
              fi
              sleep 10
              elapsed=$((elapsed + 10))
            done
          }

          # Request an immediate reconcile of a HelmRelease.
          reconcile_helmrelease() {
            local release_name="$1"
            local reconcile_at
            reconcile_at="$(date +%s)"
            kubectl -n flux-system annotate "helmrelease/${release_name}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite
          }

          # Poll a HelmRelease until Ready=True; fail fast on Stalled=True.
          wait_for_helmrelease_ready() {
            local release_name="$1"
            local target_namespace="$2"
            local timeout_seconds="$3"
            local elapsed=0
            local ready
            local stalled
            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
              ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
              stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
              if [ "${ready}" = "True" ]; then
                return 0
              fi
              if [ "${stalled}" = "True" ]; then
                echo "HelmRelease ${release_name} is stalled" >&2
                kubectl -n flux-system describe "helmrelease/${release_name}" || true
                kubectl -n "${target_namespace}" get pods -o wide || true
                exit 1
              fi
              sleep 10
              elapsed=$((elapsed + 10))
            done
            echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
            kubectl -n flux-system describe "helmrelease/${release_name}" || true
            kubectl -n "${target_namespace}" get pods -o wide || true
            exit 1
          }

          echo "Waiting for Rancher..."
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
          kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
          reconcile_helmrelease rancher
          wait_for_helmrelease_ready rancher cattle-system 900
          wait_for_resource "" namespace/cattle-system 600
          wait_for_resource cattle-system deployment/cattle-system-rancher 600
          kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
          wait_for_resource cattle-system deployment/rancher-webhook 900
          kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
          wait_for_resource cattle-system issuer/cattle-system-rancher 900
          wait_for_resource cattle-system certificate/tls-rancher-ingress 900
          kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s

          echo "Waiting for rancher-backup operator..."
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher-backup 600
          kubectl -n flux-system annotate kustomization/addon-rancher-backup reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup-crd 600
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600
          reconcile_helmrelease rancher-backup-crd
          reconcile_helmrelease rancher-backup
          wait_for_helmrelease_ready rancher-backup-crd cattle-resources-system 600
          wait_for_helmrelease_ready rancher-backup cattle-resources-system 600
          wait_for_resource "" namespace/cattle-resources-system 600
          kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s

      - name: Restore Rancher from latest B2 backup
        env:
          KUBECONFIG: outputs/kubeconfig
          B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
          B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
        run: |
          set -euo pipefail

          echo "Finding latest backup in B2..."
          CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
          AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
          API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
          AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
          BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
          import json,sys
          resp = json.load(sys.stdin)
          bid = resp.get('allowed', {}).get('bucketId')
          if bid:
              print(bid)
          else:
              print('')
          ")
          if [ -z "$BUCKET_ID" ]; then
            echo "Restricted B2 key - resolving bucket ID by name..."
            # NOTE(review): B2's native API docs show the raw authorizationToken
            # in the Authorization header; confirm the "Bearer " prefix is accepted.
            BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
              "${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
              | python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
          fi
          LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
            "${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
            | python3 -c "
          import json,sys
          files = json.load(sys.stdin).get('files', [])
          tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
          if not tars:
              print('NONE')
          else:
              tars.sort()
              print(tars[-1])
          ")
          if [ "$LATEST" = "NONE" ]; then
            echo "No backups found in B2. Skipping restore."
            exit 0
          fi
          BACKUP_FILE=$(basename "$LATEST")
          echo "Latest backup: ${BACKUP_FILE}"

          echo "Creating Restore CR..."
          # NOTE(review): the original Restore manifest was lost when this file
          # was corrupted (everything between "kubectl apply -f - <" and the
          # status poll was stripped). The spec below is a minimal best-effort
          # reconstruction relying on the rancher-backup operator's configured
          # default storage location — confirm against the addon-rancher-backup
          # chart values before relying on it.
          kubectl apply -f - <<EOF
          apiVersion: resources.cattle.io/v1
          kind: Restore
          metadata:
            name: restore-from-b2
          spec:
            backupFilename: ${BACKUP_FILE}
            prune: false
          EOF

          echo "Waiting for restore to complete..."
          for i in $(seq 1 60); do
            STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
            MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
            echo "  Restore status: ${STATUS} - ${MESSAGE}"
            if [ "$STATUS" = "True" ]; then
              echo "Restore completed successfully!"
              exit 0
            fi
            sleep 10
          done
          echo "Restore did not complete within timeout. Continuing anyway."

      - name: Post-deploy cluster health checks
        working-directory: ansible
        run: |
          set -euo pipefail
          ansible -i inventory.ini 'control_plane[0]' -m shell -a '
            set -euo pipefail
            kubectl get nodes -o wide
            kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
            kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
            kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
            kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
            kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
            ! kubectl get storageclass | grep -E "^local-path.*\\(default\\)"
            kubectl get pods -A --no-headers \
              | grep -Ev "[[:space:]](Running|Completed)[[:space:]]" \
              | grep -Ev "^cattle-system[[:space:]]+helm-operation-" \
              | grep -Ev "^cattle-resources-system[[:space:]]+rancher-backup-patch-sa-" \
              | grep -Ev "^kube-system[[:space:]]+helm-install-" \
              | tee /tmp/unhealthy-pods || true
            test ! -s /tmp/unhealthy-pods
            kubectl -n kube-system get pods -o wide
            kubectl -n tailscale-system get pods -o wide
            kubectl -n external-secrets get pods -o wide
          ' -e ansible_shell_executable=/bin/bash
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Post-deploy tailnet smoke checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Upload Kubeconfig
        # v3 is deprecated/disabled on GitHub-hosted runners; v4 is required.
        uses: actions/upload-artifact@v4
        with:
          name: kubeconfig
          path: outputs/kubeconfig