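# Deploy Cluster: the terraform job provisions Proxmox VMs, then the ansible
# job configures the k3s cluster, bootstraps Flux, and restores Rancher state.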
name: Deploy Cluster

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

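# TF_VAR_* values are read by Terraform as input variables; they are also
# consumed via os.environ by the inline Proxmox cleanup scripts below.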
env:
  TF_VERSION: "1.7.0"
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}

jobs:
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

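      # Plan is allowed to fail here (continue-on-error) so its output can be
      # posted to the PR first; the follow-up step turns a failed plan into a
      # failed job.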
      - name: Terraform Plan
        id: plan
        working-directory: terraform
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color
        continue-on-error: true

      - name: Post Plan to PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const output = `#### Terraform Plan
            \`\`\`
            ${{ steps.plan.outputs.stdout }}
            \`\`\``;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });

      - name: Fail if plan failed
        if: steps.plan.outcome == 'failure'
        run: exit 1

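      # The node name, storage pool, and VM IDs below mirror the Terraform
      # configuration; a failed apply can leave vm-<id>-cloudinit volumes
      # behind on the Proxmox storage, which can block the next create that
      # reuses the same VM ID.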
      - name: Cleanup orphan Proxmox cloud-init volumes
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: |
          set -euo pipefail
          python3 - <<'PY'
          import os
          import ssl
          import urllib.error
          import urllib.parse
          import urllib.request

          endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
          token_id = os.environ["TF_VAR_proxmox_api_token_id"]
          token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
          insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
          node = "flex"
          storage = "Flash"
          vm_ids = [200, 201, 202, 210, 211, 212, 213, 214]
          context = ssl._create_unverified_context() if insecure else None
          headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}

          def request(method, path):
              req = urllib.request.Request(
                  f"{endpoint}/api2/json{path}",
                  method=method,
                  headers=headers,
              )
              return urllib.request.urlopen(req, context=context, timeout=30)

          def vm_exists(vmid):
              try:
                  request("GET", f"/nodes/{node}/qemu/{vmid}/status/current").close()
                  return True
              except urllib.error.HTTPError as err:
                  if err.code == 404:
                      return False
                  if err.code == 500 and "conf' does not exist" in err.reason:
                      return False
                  raise

          for vmid in vm_ids:
              if vm_exists(vmid):
                  print(f"VM {vmid} exists; keeping cloud-init volume")
                  continue

              volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
              try:
                  request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}").close()
                  print(f"Deleted orphan cloud-init volume for VM {vmid}")
              except urllib.error.HTTPError as err:
                  if err.code == 404:
                      print(f"No orphan cloud-init volume for VM {vmid}")
                      continue
                  raise
          PY

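      # Apply retries up to twice on known-transient Proxmox API errors; the
      # embedded Python deletes half-created VMs that never made it into
      # Terraform state (matching them by VM ID and name) so the retry can
      # recreate them cleanly.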
      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          set -euo pipefail

          run_apply() {
            local log_file="$1"
            terraform apply \
              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
              -auto-approve 2>&1 | tee "${log_file}"
            return "${PIPESTATUS[0]}"
          }

          cleanup_untracked_target_vms() {
            python3 - <<'PY'
          import os
          import ssl
          import subprocess
          import time
          import urllib.error
          import urllib.parse
          import urllib.request

          endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
          token_id = os.environ["TF_VAR_proxmox_api_token_id"]
          token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
          insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
          node = "flex"
          storage = "Flash"
          context = ssl._create_unverified_context() if insecure else None
          headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}
          targets = {
              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-1"]': (200, "k8s-cluster-cp-1"),
              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-2"]': (201, "k8s-cluster-cp-2"),
              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-3"]': (202, "k8s-cluster-cp-3"),
              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-1"]': (210, "k8s-cluster-worker-1"),
              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-2"]': (211, "k8s-cluster-worker-2"),
              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-3"]': (212, "k8s-cluster-worker-3"),
              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-4"]': (213, "k8s-cluster-worker-4"),
              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-5"]': (214, "k8s-cluster-worker-5"),
          }

          def request(method, path, data=None):
              body = None
              req_headers = dict(headers)
              if data is not None:
                  encoded = urllib.parse.urlencode(data)
                  if method == "DELETE":
                      path = f"{path}?{encoded}"
                  else:
                      body = encoded.encode()
                      req_headers["Content-Type"] = "application/x-www-form-urlencoded"
              req = urllib.request.Request(
                  f"{endpoint}/api2/json{path}",
                  method=method,
                  headers=req_headers,
                  data=body,
              )
              with urllib.request.urlopen(req, context=context, timeout=60) as resp:
                  return resp.read()

          def vm_status(vmid):
              try:
                  request("GET", f"/nodes/{node}/qemu/{vmid}/status/current")
                  return True
              except urllib.error.HTTPError as err:
                  if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
                      return False
                  raise

          def vm_config(vmid):
              try:
                  raw = request("GET", f"/nodes/{node}/qemu/{vmid}/config")
              except urllib.error.HTTPError as err:
                  if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
                      return {}
                  raise
              import json
              return json.loads(raw).get("data", {})

          def wait_absent(vmid):
              for _ in range(60):
                  if not vm_status(vmid):
                      return
                  time.sleep(5)
              raise RuntimeError(f"VM {vmid} still exists after delete")

          state = set(
              subprocess.run(
                  ["terraform", "state", "list"],
                  check=False,
                  text=True,
                  stdout=subprocess.PIPE,
              ).stdout.splitlines()
          )

          for address, (vmid, expected_name) in targets.items():
              if address in state:
                  continue
              if not vm_status(vmid):
                  continue

              config = vm_config(vmid)
              actual_name = config.get("name")
              if actual_name != expected_name:
                  raise RuntimeError(
                      f"Refusing to delete VM {vmid}: expected name {expected_name!r}, got {actual_name!r}"
                  )

              print(f"Deleting partial Terraform-untracked VM {vmid} ({expected_name}) before retry")
              try:
                  request("POST", f"/nodes/{node}/qemu/{vmid}/status/stop")
                  time.sleep(10)
              except urllib.error.HTTPError as err:
                  if err.code not in (400, 500):
                      raise

              request(
                  "DELETE",
                  f"/nodes/{node}/qemu/{vmid}",
                  {"purge": "1", "destroy-unreferenced-disks": "1"},
              )
              wait_absent(vmid)

              volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
              try:
                  request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}")
              except urllib.error.HTTPError as err:
                  if err.code != 404:
                      raise
          PY
          }

          for attempt in 1 2 3; do
            log_file="/tmp/terraform-apply-${attempt}.log"
            if run_apply "${log_file}"; then
              exit 0
            fi

            if [ "${attempt}" = "3" ]; then
              exit 1
            fi

            if ! grep -Eq 'HTTP 596|Broken pipe|disk update fails' "${log_file}"; then
              exit 1
            fi

            echo "Terraform apply hit transient Proxmox API failure; cleaning partial VM creates before retry ${attempt}/2"
            cleanup_untracked_target_vms
            sleep 20
          done

      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Upload Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        uses: actions/upload-artifact@v4
        with:
          name: terraform-outputs
          path: outputs/terraform_outputs.json

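  # The ansible job re-runs terraform init/output against the shared S3 state
  # instead of consuming the uploaded artifact, then configures the nodes and
  # bootstraps the GitOps stack.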
  ansible:
    name: Ansible
    runs-on: ubuntu-latest
    needs: terraform
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Get Terraform Outputs
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Install Python Dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Install skopeo
        run: |
          sudo apt-get update
          sudo apt-get install -y skopeo

      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py

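      # Image archives are staged on the runner with skopeo and later imported
      # into containerd on the nodes, so cluster bootstrap does not depend on
      # registry pulls from inside the cluster.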
      - name: Prepare kube-vip image archive
        run: |
          set -euo pipefail
          mkdir -p outputs
          for attempt in 1 2 3; do
            # docker-archive refuses to write over an existing file, so clear
            # any partial archive from a failed attempt before retrying.
            rm -f outputs/kube-vip-bootstrap.tar
            if skopeo copy \
              docker://ghcr.io/kube-vip/kube-vip:v1.1.2 \
              docker-archive:outputs/kube-vip-bootstrap.tar:ghcr.io/kube-vip/kube-vip:v1.1.2; then
              exit 0
            fi
            sleep 10
          done
          echo "Failed to prepare kube-vip image archive on runner" >&2
          exit 1

      - name: Prepare bootstrap image archives
        run: |
          set -euo pipefail
          archive_name() {
            printf '%s' "$1" | tr '/:' '__'
          }

          prepare_image_archive() {
            local image="$1"
            local archive="outputs/bootstrap-image-archives/$(archive_name "${image}").tar"

            mkdir -p outputs/bootstrap-image-archives
            for attempt in 1 2 3; do
              # Remove any partial archive; docker-archive cannot overwrite.
              rm -f "${archive}"
              if skopeo copy "docker://${image}" "docker-archive:${archive}:${image}"; then
                return 0
              fi
              sleep 10
            done

            echo "Failed to prepare bootstrap image archive for ${image}" >&2
            return 1
          }

          for image in \
            ghcr.io/fluxcd/source-controller:v1.8.0 \
            ghcr.io/fluxcd/kustomize-controller:v1.8.1 \
            ghcr.io/fluxcd/helm-controller:v1.5.1 \
            ghcr.io/fluxcd/notification-controller:v1.8.1 \
            oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 \
            ghcr.io/tailscale/k8s-operator:v1.96.5 \
            ghcr.io/tailscale/tailscale:v1.96.5 \
            registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 \
            docker.io/grafana/loki:3.5.7 \
            docker.io/kiwigrid/k8s-sidecar:1.30.10 \
            docker.io/grafana/promtail:3.0.0 \
            docker.io/rancher/mirrored-library-traefik:3.6.10 \
            docker.io/grafana/grafana:11.4.0 \
            quay.io/prometheus-operator/prometheus-operator:v0.79.2 \
            quay.io/prometheus-operator/prometheus-config-reloader:v0.79.2 \
            quay.io/prometheus/prometheus:v3.1.0 \
            registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 \
            quay.io/prometheus/node-exporter:v1.8.2; do
            prepare_image_archive "${image}"
          done

      - name: Run Ansible Playbook
        working-directory: ansible
        run: |
          ansible-playbook site.yml \
            -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
            -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
            -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
            -e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
            -e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
            -e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
            -e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Install kubectl
        run: |
          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x /usr/local/bin/kubectl

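      # The kubeconfig fetched by Ansible points at the cp-1 hostname, which
      # the runner is unlikely to resolve; swap in the IP from Terraform outputs.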
      - name: Rewrite kubeconfig for runner-reachable API
        working-directory: terraform
        run: |
          set -euo pipefail
          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
          sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig

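      # Bootstrap order: create the flux-system Git secret, pre-import
      # controller images on cp-1, apply the Flux components and sources, pin
      # the controllers to cp-1, then walk the addon Kustomizations
      # (external-secrets, tailscale-operator, nfs-storage) until healthy.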
      - name: Bootstrap Flux source and reconciliation graph
        env:
          KUBECONFIG: outputs/kubeconfig
          FLUX_GIT_HOST: 64.176.189.59
          FLUX_GIT_PORT: "2222"
        run: |
          set -euo pipefail
          flux_rollout_status() {
            local deployment="$1"
            if ! kubectl -n flux-system rollout status "deployment/${deployment}" --timeout=900s; then
              kubectl -n flux-system get pods -o wide
              kubectl -n flux-system describe deployment "${deployment}"
              kubectl -n flux-system describe pods -l "app=${deployment}"
              exit 1
            fi
          }

          wait_for_resource() {
            local namespace="$1"
            local resource="$2"
            local timeout_seconds="$3"
            local elapsed=0

            until {
              if [ -n "${namespace}" ]; then
                kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
              else
                kubectl get "${resource}" >/dev/null 2>&1
              fi
            }; do
              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
                echo "Timed out waiting for ${resource} to exist" >&2
                kubectl -n flux-system get kustomizations,helmreleases || true
                exit 1
              fi

              sleep 10
              elapsed=$((elapsed + 10))
            done
          }

          pull_required_image() {
            local image="$1"
            local host_ip="$2"
            local attempts="$3"
            local sleep_seconds="$4"
            local failure_message="$5"
            local pulled=false

            for attempt in $(seq 1 "${attempts}"); do
              echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})"
              if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
                "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
                pulled=true
                break
              fi
              sleep "${sleep_seconds}"
            done

            if [ "${pulled}" != "true" ]; then
              echo "${failure_message} ${image} on ${host_ip}" >&2
              exit 1
            fi
          }

          import_required_image() {
            local image="$1"
            local host_ip="$2"
            local archive_name
            local archive_path
            archive_name="$(printf '%s' "${image}" | tr '/:' '__').tar"
            archive_path="outputs/bootstrap-image-archives/${archive_name}"

            if [ ! -s "${archive_path}" ]; then
              echo "Missing required bootstrap image archive ${archive_path} for ${image}" >&2
              exit 1
            fi

            echo "Importing ${image} archive on ${host_ip}"
            scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 \
              "${archive_path}" "ubuntu@${host_ip}:/tmp/${archive_name}"
            ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
              "set -euo pipefail; \
              if sudo k3s crictl inspecti '${image}' >/dev/null 2>&1; then exit 0; fi; \
              for attempt in 1 2 3 4 5; do \
                echo 'Importing ${image} archive with ctr'; \
                if sudo k3s ctr -n k8s.io images import '/tmp/${archive_name}' && sudo k3s crictl inspecti '${image}' >/dev/null; then exit 0; fi; \
                sleep 10; \
              done; \
              sudo systemctl status k3s --no-pager -l || true; \
              sudo journalctl -u k3s -n 80 --no-pager || true; \
              exit 1"
          }

          import_required_image_on_all_nodes() {
            local image="$1"
            local host_ip

            for host_ip in ${ALL_NODE_IPS}; do
              import_required_image "${image}" "${host_ip}"
            done
          }

          eso_diagnostics() {
            kubectl -n flux-system get kustomizations,ocirepositories,helmrepositories,helmcharts,helmreleases || true
            kubectl -n flux-system describe kustomization addon-external-secrets || true
            kubectl -n flux-system describe ocirepository external-secrets || true
            kubectl -n flux-system describe helmrelease external-secrets || true
            kubectl -n external-secrets get pods -o wide || true
          }

          wait_for_helmrelease_ready() {
            local release_name="$1"
            local target_namespace="$2"
            local timeout_seconds="$3"
            local elapsed=0
            local ready
            local stalled

            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
              ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
              stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"

              if [ "${ready}" = "True" ]; then
                return 0
              fi

              if [ "${stalled}" = "True" ]; then
                echo "HelmRelease ${release_name} is stalled" >&2
                kubectl -n flux-system describe "helmrelease/${release_name}" || true
                kubectl -n "${target_namespace}" get pods -o wide || true
                exit 1
              fi

              sleep 10
              elapsed=$((elapsed + 10))
            done

            echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
            kubectl -n flux-system describe "helmrelease/${release_name}" || true
            kubectl -n "${target_namespace}" get pods -o wide || true
            exit 1
          }

          wait_for_flux_oci_helm_release() {
            local oci_name="$1"
            local release_name="$2"
            local target_namespace="$3"
            local oci_timeout="$4"
            local release_timeout="$5"
            local reconcile_at

            wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${oci_name}" 600
            reconcile_at="$(date +%s)"
            kubectl -n flux-system annotate "ocirepository/${oci_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
            kubectl -n flux-system annotate "helmrelease/${release_name}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite

            if ! kubectl -n flux-system wait --for=condition=Ready "ocirepository/${oci_name}" --timeout="${oci_timeout}"; then
              eso_diagnostics
              exit 1
            fi

            wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
          }

          flux_helm_diagnostics() {
            local repo_name="$1"
            local chart_name="$2"
            local release_name="$3"
            local target_namespace="$4"

            kubectl -n flux-system get helmrepositories,helmcharts,helmreleases || true
            kubectl -n flux-system describe helmrepository "${repo_name}" || true
            kubectl -n flux-system describe helmchart.source.toolkit.fluxcd.io "${chart_name}" || true
            kubectl -n flux-system describe helmrelease "${release_name}" || true
            kubectl -n "${target_namespace}" get pods -o wide || true
          }

          wait_for_flux_helm_release() {
            local repo_name="$1"
            local chart_name="$2"
            local release_name="$3"
            local target_namespace="$4"
            local repo_timeout="$5"
            local chart_timeout="$6"
            local release_timeout="$7"
            local reconcile_at

            wait_for_resource flux-system "helmrepository.source.toolkit.fluxcd.io/${repo_name}" 600
            if ! kubectl -n flux-system wait --for=condition=Ready "helmrepository/${repo_name}" --timeout="${repo_timeout}"; then
              echo "HelmRepository ${repo_name} is not currently Ready; continuing because a cached artifact may still satisfy HelmChart ${chart_name}" >&2
              kubectl -n flux-system describe helmrepository "${repo_name}" || true
            fi

            wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
            reconcile_at="$(date +%s)"
            kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
            kubectl -n flux-system annotate "helmrelease/${release_name}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite

            for attempt in $(seq 1 6); do
              if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
                wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
                return 0
              fi

              echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2
              reconcile_at="$(date +%s)"
              kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
              kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
            done

            flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
            exit 1
          }

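          # Main sequence: Git credentials for Flux, image imports, component
          # apply, controller placement, then staged addon rollout.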
          kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
          ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
          kubectl -n flux-system create secret generic flux-system \
            --from-file=identity="$HOME/.ssh/id_ed25519" \
            --from-file=known_hosts=/tmp/flux_known_hosts \
            --dry-run=client -o yaml | kubectl apply -f -
          PRIMARY_CP_IP=$(python3 -c 'import json; print(json.load(open("outputs/terraform_outputs.json"))["primary_control_plane_ip"]["value"])')
          ALL_NODE_IPS=$(python3 -c 'import json; outputs = json.load(open("outputs/terraform_outputs.json")); print(" ".join(outputs["control_plane_ips"]["value"] + outputs["worker_ips"]["value"]))')
          for image in \
            ghcr.io/fluxcd/source-controller:v1.8.0 \
            ghcr.io/fluxcd/kustomize-controller:v1.8.1 \
            ghcr.io/fluxcd/helm-controller:v1.5.1 \
            ghcr.io/fluxcd/notification-controller:v1.8.1; do
            import_required_image "${image}" "${PRIMARY_CP_IP}"
          done
          for image in \
            docker.io/grafana/loki:3.5.7 \
            docker.io/kiwigrid/k8s-sidecar:1.30.10 \
            docker.io/grafana/promtail:3.0.0 \
            docker.io/rancher/mirrored-library-traefik:3.6.10 \
            docker.io/grafana/grafana:11.4.0 \
            quay.io/prometheus-operator/prometheus-operator:v0.79.2 \
            quay.io/prometheus-operator/prometheus-config-reloader:v0.79.2 \
            quay.io/prometheus/prometheus:v3.1.0 \
            registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 \
            quay.io/prometheus/node-exporter:v1.8.2; do
            import_required_image_on_all_nodes "${image}"
          done
          # Apply CRDs and controllers first
          kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
          # Wait for CRDs to be established
          kubectl wait --for=condition=Established crd --all --timeout=120s
          # Then apply custom resources
          kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
          # Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
          PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
          kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system delete pod --field-selector=status.phase!=Running || true
          flux_rollout_status source-controller
          flux_rollout_status kustomize-controller
          flux_rollout_status helm-controller
          kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
          # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
          kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          import_required_image oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 "${PRIMARY_CP_IP}"
          wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
          wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
          wait_for_resource "" crd/externalsecrets.external-secrets.io 900
          kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
          kubectl wait --for=condition=established --timeout=600s crd/externalsecrets.external-secrets.io
          kubectl -n external-secrets rollout status deployment/external-secrets-external-secrets --timeout=600s
          kubectl -n external-secrets rollout status deployment/external-secrets-external-secrets-webhook --timeout=600s
          wait_for_resource external-secrets service/external-secrets-external-secrets-webhook 600
          wait_for_resource external-secrets endpoints/external-secrets-external-secrets-webhook 600
          kubectl -n external-secrets wait --for=jsonpath='{.subsets[0].addresses[0].ip}' endpoints/external-secrets-external-secrets-webhook --timeout=600s
          # Create Doppler ClusterSecretStore now that ESO CRDs are available
          kubectl apply -f - <<'EOF'
          apiVersion: external-secrets.io/v1
          kind: ClusterSecretStore
          metadata:
            name: doppler-hetznerterra
          spec:
            provider:
              doppler:
                auth:
                  secretRef:
                    dopplerToken:
                      name: doppler-hetznerterra-service-token
                      key: dopplerToken
                      namespace: external-secrets
          EOF
          # Wait for the storage layer and private access components
          import_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}"
          import_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}"
          kubectl -n flux-system annotate kustomization/addon-tailscale-operator reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=600s
          kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
          import_required_image registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 "${PRIMARY_CP_IP}"
          kubectl -n flux-system annotate kustomization/addon-nfs-storage reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
          kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s
          kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
          kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
          kubectl get storageclass flash-nfs

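      # Each step runs in a fresh shell, so the wait/reconcile helpers are
      # redefined here before gating on Rancher and its backup operator.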
      - name: Wait for Rancher and backup operator
        env:
          KUBECONFIG: outputs/kubeconfig
        run: |
          set -euo pipefail
          wait_for_resource() {
            local namespace="$1"
            local resource="$2"
            local timeout_seconds="$3"
            local elapsed=0

            until {
              if [ -n "${namespace}" ]; then
                kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
              else
                kubectl get "${resource}" >/dev/null 2>&1
              fi
            }; do
              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
                echo "Timed out waiting for ${resource} to exist" >&2
                kubectl -n flux-system get kustomizations,helmrepositories,helmcharts,helmreleases || true
                exit 1
              fi

              sleep 10
              elapsed=$((elapsed + 10))
            done
          }

          reconcile_helmrelease() {
            local release_name="$1"
            local reconcile_at
            reconcile_at="$(date +%s)"
            kubectl -n flux-system annotate "helmrelease/${release_name}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite
          }

          wait_for_helmchart_ready() {
            local chart_name="$1"
            local release_name="$2"
            local timeout="$3"
            local attempts="$4"
            local reconcile_at

            wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
            for attempt in $(seq 1 "${attempts}"); do
              reconcile_at="$(date +%s)"
              kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
              kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite

              if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${timeout}"; then
                return 0
              fi

              echo "HelmChart ${chart_name} did not become Ready after ${timeout}; forcing retry (${attempt}/${attempts})" >&2
            done

            kubectl -n flux-system describe "helmchart.source.toolkit.fluxcd.io/${chart_name}" || true
            kubectl -n flux-system describe "helmrelease/${release_name}" || true
            exit 1
          }

          wait_for_helmrelease_ready() {
            local release_name="$1"
            local target_namespace="$2"
            local timeout_seconds="$3"
            local elapsed=0
            local ready
            local stalled

            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
              ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
              stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"

              if [ "${ready}" = "True" ]; then
                return 0
              fi

              if [ "${stalled}" = "True" ]; then
                echo "HelmRelease ${release_name} is stalled" >&2
                kubectl -n flux-system describe "helmrelease/${release_name}" || true
                kubectl -n "${target_namespace}" get pods -o wide || true
                exit 1
              fi

              sleep 10
              elapsed=$((elapsed + 10))
            done

            echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
            kubectl -n flux-system describe "helmrelease/${release_name}" || true
            kubectl -n "${target_namespace}" get pods -o wide || true
            exit 1
          }

          pull_image_on_matching_pod_nodes() {
            local namespace="$1"
            local selector="$2"
            local image="$3"
            local attempts="$4"
            local sleep_seconds="$5"
            local nodes

            nodes="$(kubectl -n "${namespace}" get pods -l "${selector}" -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' 2>/dev/null | sort -u)"
            if [ -z "${nodes}" ]; then
              echo "No pods found for ${namespace}/${selector}; skipping targeted image pull for ${image}" >&2
              return 0
            fi

            for node in ${nodes}; do
              local node_ip
              local pulled=false
              node_ip="$(kubectl get node "${node}" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')"

              for attempt in $(seq 1 "${attempts}"); do
                echo "Pre-pulling ${image} on ${node}/${node_ip} (${attempt}/${attempts})"
                if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" \
                  "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
                  pulled=true
                  break
                fi
                sleep "${sleep_seconds}"
              done

              if [ "${pulled}" != "true" ]; then
                echo "Best-effort targeted image pre-pull did not complete for ${image} on ${node}/${node_ip}" >&2
              fi
            done
          }

          echo "Waiting for Rancher..."
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
          kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
          reconcile_helmrelease rancher
          wait_for_helmchart_ready flux-system-rancher rancher 180s 5
          wait_for_helmrelease_ready rancher cattle-system 900
          wait_for_resource "" namespace/cattle-system 600
          wait_for_resource cattle-system deployment/cattle-system-rancher 600
          kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
          wait_for_resource cattle-system deployment/rancher-webhook 900
          pull_image_on_matching_pod_nodes cattle-system app=rancher-webhook registry.rancher.com/rancher/rancher-webhook:v0.9.3 12 10
          kubectl -n cattle-system rollout restart deployment/rancher-webhook
          kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
          wait_for_resource cattle-system issuer/cattle-system-rancher 900
          wait_for_resource cattle-system certificate/tls-rancher-ingress 900
          kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s

          echo "Waiting for rancher-backup operator..."
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher-backup 600
          kubectl -n flux-system annotate kustomization/addon-rancher-backup reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup-crd 600
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600
          reconcile_helmrelease rancher-backup-crd
          reconcile_helmrelease rancher-backup
          wait_for_helmchart_ready flux-system-rancher-backup-crd rancher-backup-crd 180s 5
          wait_for_helmchart_ready flux-system-rancher-backup rancher-backup 180s 5
          wait_for_helmrelease_ready rancher-backup-crd cattle-resources-system 600
          wait_for_helmrelease_ready rancher-backup cattle-resources-system 600
          wait_for_resource "" namespace/cattle-resources-system 600
          kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s

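      # Restores Rancher state from the newest backup in Backblaze B2:
      # authorize against the B2 API, pick the latest rancher-backups/*.tar.gz,
      # and create a Restore CR for the backup operator to process.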
      - name: Restore Rancher from latest B2 backup
        env:
          KUBECONFIG: outputs/kubeconfig
          B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
          B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
        run: |
          echo "Finding latest backup in B2..."

          CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
          AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
          API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
          AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
          BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
          import json,sys
          resp = json.load(sys.stdin)
          bid = resp.get('allowed', {}).get('bucketId')
          if bid:
              print(bid)
          else:
              print('')
          ")

          if [ -z "$BUCKET_ID" ]; then
            echo "Restricted B2 key - resolving bucket ID by name..."
            BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
              "${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
              | python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
          fi

          LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
            "${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
            | python3 -c "
          import json,sys
          files = json.load(sys.stdin).get('files', [])
          tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
          if not tars:
              print('NONE')
          else:
              tars.sort()
              print(tars[-1])
          ")

          if [ "$LATEST" = "NONE" ]; then
            echo "No backups found in B2. Skipping restore."
            exit 0
          fi

          BACKUP_FILE=$(basename "$LATEST")
          echo "Latest backup: ${BACKUP_FILE}"

          echo "Creating Restore CR..."
          kubectl apply -f - <<EOF
          apiVersion: resources.cattle.io/v1
          kind: Restore
          metadata:
            name: restore-from-b2
            namespace: cattle-resources-system
          spec:
            backupFilename: ${BACKUP_FILE}
            storageLocation:
              s3:
                credentialSecretName: rancher-b2-creds
                credentialSecretNamespace: cattle-resources-system
                bucketName: HetznerTerra
                folder: rancher-backups
                endpoint: s3.us-east-005.backblazeb2.com
                region: us-east-005
          EOF

          echo "Waiting for restore to complete..."
          for i in $(seq 1 60); do
            STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
            MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
            echo "  Restore status: ${STATUS} - ${MESSAGE}"
            if [ "$STATUS" = "True" ]; then
              echo "Restore completed successfully!"
              exit 0
            fi
            sleep 10
          done
          echo "Restore did not complete within timeout. Continuing anyway."

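      # Runs the health gate from cp-1 over Ansible so kubectl talks to the
      # in-cluster API; the job fails if any addon or pod is unhealthy.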
      - name: Post-deploy cluster health checks
        working-directory: ansible
        run: |
          set -euo pipefail
          ansible -i inventory.ini 'control_plane[0]' -m shell -a '
          set -euo pipefail
          kubectl get nodes -o wide
          kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=300s
          reconcile_at=$(date +%s)
          for release in kube-prometheus-stack loki promtail; do
            kubectl -n flux-system annotate "helmrelease/${release}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite
          done
          kubectl -n flux-system annotate kustomization/addon-observability \
            reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
            --overwrite
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
          kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
          kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
          kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
          ! kubectl get storageclass | grep -E "^local-path.*\\(default\\)"
          unhealthy_pods=$(mktemp)
          kubectl get pods -A --no-headers \
            | grep -Ev "[[:space:]](Running|Completed)[[:space:]]" \
            | grep -Ev "^cattle-system[[:space:]]+helm-operation-" \
            | grep -Ev "^cattle-capi-system[[:space:]]+capi-controller-manager-" \
            | grep -Ev "^cattle-turtles-system[[:space:]]+cluster-api-operator-resources-cleanup-" \
            | grep -Ev "^cattle-resources-system[[:space:]]+rancher-backup-patch-sa-" \
            | grep -Ev "^kube-system[[:space:]]+helm-install-" \
            | tee "${unhealthy_pods}" || true
          test ! -s "${unhealthy_pods}"
          kubectl -n kube-system get pods -o wide
          kubectl -n tailscale-system get pods -o wide
          kubectl -n external-secrets get pods -o wide
          ' -e ansible_shell_executable=/bin/bash
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Post-deploy tailnet smoke checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Upload Kubeconfig
        uses: actions/upload-artifact@v4
        with:
          name: kubeconfig
          path: outputs/kubeconfig