Compare commits
112 Commits
8d1f9f4944
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| ceefcc3b29 | |||
| 0d339b3163 | |||
| 30ccf13c82 | |||
| 75e3604f30 | |||
| e4235a6e58 | |||
| ea2d534171 | |||
| a1b9fe6aa6 | |||
| 33765657ec | |||
| b8f64fa952 | |||
| 569d741751 | |||
| 89e53d9ec9 | |||
| 5a2551f40a | |||
| 8c7b62c024 | |||
| a1f07f863a | |||
| 2c3a49c2e0 | |||
| a7ce3dcc1a | |||
| 0ab9418458 | |||
| c251672618 | |||
| 89364e8f37 | |||
| 20d7a6f777 | |||
| 22ce5fd6f4 | |||
| afb1782d38 | |||
| 48870433bf | |||
| f2c506b350 | |||
| efdf13976a | |||
| 5269884408 | |||
| 6e5b0518be | |||
| 905d069e91 | |||
| 25ba4b7115 | |||
| 6a593fd559 | |||
| 936f54a1b5 | |||
| c9df11e65f | |||
| a3c238fda9 | |||
| a15fa50302 | |||
| 0f4f0b09fb | |||
| 4c002a870c | |||
| 43d11ac7e6 | |||
| 8c5edcf0a1 | |||
| a81da0d178 | |||
| 2a72527c79 | |||
| 7cb3b84ecb | |||
| d4930235fa | |||
| ee8dc4b451 | |||
| 144d40e7ac | |||
| cc14e32572 | |||
| a207a5a7fd | |||
| 4e1772c175 | |||
| ff70b12084 | |||
| a3963c56e6 | |||
| 612435c42c | |||
| ac42f671a2 | |||
| dbe7ec0468 | |||
| 816ac8b3c0 | |||
| 6f7998639f | |||
| 7a14f89ad1 | |||
| 786901c5d7 | |||
| 46f3d1130b | |||
| 2fe5a626d4 | |||
| 2ef68c8087 | |||
| e2cae18f5f | |||
| e0c1e41ee9 | |||
| 63533de901 | |||
| 1b39710f63 | |||
| 8c034323dc | |||
| 5fa2b411ee | |||
| 3ea28e525f | |||
| 4b95ba113d | |||
| 13627bf81f | |||
| ef3fb2489a | |||
| 7097495d72 | |||
| 9d601dc77c | |||
| f36445d99a | |||
| 89c2c99963 | |||
| 4a35cfb549 | |||
| 3d50bfc534 | |||
| ab2f287bfb | |||
| dcb2675b67 | |||
| b40bec7e0e | |||
| efe0c0cfd5 | |||
| c61d9f9c1d | |||
| 60ceac4624 | |||
| 47b384a337 | |||
| ecf17113fb | |||
| 4ffbcfa312 | |||
| 8745bcda47 | |||
| e47ec2a3e7 | |||
| 45c899d2bd | |||
| 0e52d8f159 | |||
| 4726db2b5b | |||
| 90d105e5ea | |||
| 952a80a742 | |||
| 4965017b86 | |||
| b2b9c38b91 | |||
| ff31cb4e74 | |||
| 8b4a445b37 | |||
| e447795395 | |||
| 31b82c9371 | |||
| cadfedacf1 | |||
| 561cd67b0c | |||
| 4eebbca648 | |||
| 7b5d794dfc | |||
| 8643bbfc12 | |||
| 84f446c2e6 | |||
| d446e86ece | |||
| 90c7f565e0 | |||
| 989848fa89 | |||
| 56e5807474 | |||
| df0511148c | |||
| 894e6275b1 | |||
| a01cf435d4 | |||
| 84f77c4a68 | |||
| 2e4196688c |
@@ -88,8 +88,11 @@ jobs:
|
||||
}
|
||||
|
||||
ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
|
||||
ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
|
||||
ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
|
||||
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
|
||||
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
|
||||
ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
|
||||
|
||||
- name: Terraform Plan
|
||||
id: plan
|
||||
@@ -227,6 +230,7 @@ jobs:
|
||||
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
|
||||
-e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
|
||||
-e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
|
||||
-e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
|
||||
-e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
|
||||
-e "cluster_name=k8s-cluster"
|
||||
env:
|
||||
@@ -237,6 +241,12 @@ jobs:
|
||||
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
|
||||
chmod +x /usr/local/bin/kubectl
|
||||
|
||||
- name: Install flux CLI
|
||||
run: |
|
||||
curl -fsSL https://github.com/fluxcd/flux2/releases/download/v2.5.1/flux_2.5.1_linux_amd64.tar.gz | tar xz -C /tmp
|
||||
mv /tmp/flux /usr/local/bin/flux
|
||||
chmod +x /usr/local/bin/flux
|
||||
|
||||
- name: Rewrite kubeconfig for runner-reachable API
|
||||
working-directory: terraform
|
||||
run: |
|
||||
@@ -255,31 +265,150 @@ jobs:
|
||||
--from-file=identity="$HOME/.ssh/id_ed25519" \
|
||||
--from-file=known_hosts=/tmp/flux_known_hosts \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
kubectl apply -k clusters/prod/flux-system
|
||||
# Apply CRDs and controllers first
|
||||
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
|
||||
# Wait for CRDs to be established
|
||||
kubectl wait --for=condition=Established crd --all --timeout=120s
|
||||
# Then apply custom resources
|
||||
kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
|
||||
kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
|
||||
kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
|
||||
# Patch Flux controllers to run on cp-1 only
|
||||
kubectl -n flux-system patch deployment source-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
|
||||
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
|
||||
kubectl -n flux-system patch deployment helm-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
|
||||
kubectl -n flux-system patch deployment notification-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
|
||||
kubectl -n flux-system rollout status deployment/source-controller --timeout=180s
|
||||
kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=180s
|
||||
kubectl -n flux-system rollout status deployment/helm-controller --timeout=180s
|
||||
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=180s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=300s
|
||||
# Create Doppler ClusterSecretStore now that ESO CRDs are available
|
||||
kubectl apply -f - <<'EOF'
|
||||
apiVersion: external-secrets.io/v1
|
||||
kind: ClusterSecretStore
|
||||
metadata:
|
||||
name: doppler-hetznerterra
|
||||
spec:
|
||||
provider:
|
||||
doppler:
|
||||
auth:
|
||||
secretRef:
|
||||
dopplerToken:
|
||||
name: doppler-hetznerterra-service-token
|
||||
key: dopplerToken
|
||||
namespace: external-secrets
|
||||
EOF
|
||||
# Wait for CCM and CSI (Hetzner cloud integration)
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
||||
|
||||
- name: Wait for Rancher and backup operator
|
||||
env:
|
||||
KUBECONFIG: outputs/kubeconfig
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "Waiting for Rancher..."
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
|
||||
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher -n flux-system --timeout=300s
|
||||
|
||||
echo "Waiting for rancher-backup operator..."
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true
|
||||
|
||||
- name: Restore Rancher from latest B2 backup
|
||||
env:
|
||||
KUBECONFIG: outputs/kubeconfig
|
||||
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
|
||||
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
|
||||
run: |
|
||||
echo "Finding latest backup in B2..."
|
||||
|
||||
CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
|
||||
AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
|
||||
API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
|
||||
AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
|
||||
BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
|
||||
import json,sys
|
||||
resp = json.load(sys.stdin)
|
||||
bid = resp.get('allowed', {}).get('bucketId')
|
||||
if bid:
|
||||
print(bid)
|
||||
else:
|
||||
print('')
|
||||
")
|
||||
|
||||
if [ -z "$BUCKET_ID" ]; then
|
||||
echo "Restricted B2 key - resolving bucket ID by name..."
|
||||
BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
|
||||
"${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
|
||||
| python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
|
||||
fi
|
||||
|
||||
LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
|
||||
"${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
|
||||
| python3 -c "
|
||||
import json,sys
|
||||
files = json.load(sys.stdin).get('files', [])
|
||||
tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
|
||||
if not tars:
|
||||
print('NONE')
|
||||
else:
|
||||
tars.sort()
|
||||
print(tars[-1])
|
||||
")
|
||||
|
||||
if [ "$LATEST" = "NONE" ]; then
|
||||
echo "No backups found in B2. Skipping restore."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
BACKUP_FILE=$(basename "$LATEST")
|
||||
echo "Latest backup: ${BACKUP_FILE}"
|
||||
|
||||
echo "Creating Restore CR..."
|
||||
kubectl apply -f - <<EOF
|
||||
apiVersion: resources.cattle.io/v1
|
||||
kind: Restore
|
||||
metadata:
|
||||
name: restore-from-b2
|
||||
namespace: cattle-resources-system
|
||||
spec:
|
||||
backupFilename: ${BACKUP_FILE}
|
||||
storageLocation:
|
||||
s3:
|
||||
credentialSecretName: rancher-b2-creds
|
||||
credentialSecretNamespace: cattle-resources-system
|
||||
bucketName: HetznerTerra
|
||||
folder: rancher-backups
|
||||
endpoint: s3.us-east-005.backblazeb2.com
|
||||
region: us-east-005
|
||||
EOF
|
||||
|
||||
echo "Waiting for restore to complete..."
|
||||
for i in $(seq 1 60); do
|
||||
STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
|
||||
MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
|
||||
echo " Restore status: ${STATUS} - ${MESSAGE}"
|
||||
if [ "$STATUS" = "True" ]; then
|
||||
echo "Restore completed successfully!"
|
||||
exit 0
|
||||
fi
|
||||
sleep 10
|
||||
done
|
||||
echo "Restore did not complete within timeout. Continuing anyway."
|
||||
|
||||
- name: Post-deploy cluster health checks
|
||||
working-directory: ansible
|
||||
run: |
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pods -o wide"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pvc"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get svc kube-prometheus-stack-grafana kube-prometheus-stack-prometheus"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability describe svc kube-prometheus-stack-grafana"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
|
||||
env:
|
||||
ANSIBLE_HOST_KEY_CHECKING: "False"
|
||||
|
||||
|
||||
@@ -16,13 +16,101 @@ env:
|
||||
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
||||
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
|
||||
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
|
||||
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
|
||||
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
|
||||
|
||||
jobs:
|
||||
pre-destroy-backup:
|
||||
name: Pre-Destroy Backup
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event.inputs.confirm == 'destroy'
|
||||
environment: destroy
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Terraform
|
||||
uses: hashicorp/setup-terraform@v3
|
||||
with:
|
||||
terraform_version: ${{ env.TF_VERSION }}
|
||||
|
||||
- name: Terraform Init
|
||||
working-directory: terraform
|
||||
run: |
|
||||
terraform init \
|
||||
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
||||
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
||||
-backend-config="region=auto" \
|
||||
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
|
||||
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
|
||||
-backend-config="skip_requesting_account_id=true"
|
||||
|
||||
- name: Setup SSH Keys
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
|
||||
chmod 600 ~/.ssh/id_ed25519
|
||||
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
||||
chmod 644 ~/.ssh/id_ed25519.pub
|
||||
|
||||
- name: Get Control Plane IP
|
||||
id: cp_ip
|
||||
working-directory: terraform
|
||||
run: |
|
||||
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
|
||||
echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Pre-Destroy pg_dump to B2
|
||||
run: |
|
||||
set +e
|
||||
echo "Attempting pre-destroy backup to B2..."
|
||||
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
|
||||
set -e
|
||||
# Check if kubectl is available and cluster is up
|
||||
if ! command -v kubectl &> /dev/null; then
|
||||
echo "kubectl not found, skipping pre-destroy backup"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Check if we can reach the cluster
|
||||
if ! kubectl cluster-info &> /dev/null; then
|
||||
echo "Cannot reach cluster, skipping pre-destroy backup"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Check if CNPG is deployed
|
||||
if ! kubectl get namespace cnpg-cluster &> /dev/null; then
|
||||
echo "CNP namespace not found, skipping pre-destroy backup"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Run backup using the pgdump image directly
|
||||
BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
|
||||
B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
|
||||
B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
|
||||
|
||||
if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
|
||||
echo "B2 credentials not found in secret, skipping pre-destroy backup"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \
|
||||
-n cnpg-cluster --dry-run=client -o yaml | \
|
||||
kubectl apply -f -
|
||||
|
||||
echo "Waiting for backup job to complete..."
|
||||
kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
|
||||
kubectl logs job/pgdump-manual -n cnpg-cluster || true
|
||||
kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
|
||||
EOF
|
||||
echo "Pre-destroy backup step completed (failure is non-fatal)"
|
||||
|
||||
destroy:
|
||||
name: Destroy Cluster
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event.inputs.confirm == 'destroy'
|
||||
environment: destroy
|
||||
needs: pre-destroy-backup
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
144
AGENTS.md
Normal file
144
AGENTS.md
Normal file
@@ -0,0 +1,144 @@
|
||||
# AGENTS.md
|
||||
|
||||
Repository guide for agentic contributors working in this repo.
|
||||
|
||||
## Scope
|
||||
|
||||
- Infrastructure repo for a Hetzner + k3s + Flux stack running Rancher.
|
||||
- Primary areas: `terraform/`, `ansible/`, `clusters/`, `infrastructure/`, `apps/`, `.gitea/workflows/`.
|
||||
- Treat `README.md` and `STABLE_BASELINE.md` as user-facing context, but prefer current manifests and workflows as source of truth.
|
||||
- Keep changes small and reviewable; prefer the narrowest file set that solves the task.
|
||||
|
||||
## Architecture
|
||||
|
||||
- **Terraform** provisions Hetzner servers, network, firewall, load balancer, SSH keys.
|
||||
- **Ansible** bootstraps OS, installs k3s (with external cloud provider), deploys Hetzner CCM, Tailscale, Doppler token.
|
||||
- **Flux** reconciles all cluster addons from this repo after Ansible hands off.
|
||||
- **Rancher** stores state in embedded etcd (NOT an external DB). Backup/restore uses the `rancher-backup` operator to B2.
|
||||
- **cert-manager** is required — Tailscale LoadBalancer does L4 TCP passthrough, so Rancher serves its own TLS.
|
||||
- **Secrets flow**: Doppler → `ClusterSecretStore` (doppler-hetznerterra) → `ExternalSecret` resources → k8s Secrets.
|
||||
- Rancher is reachable only over Tailscale at `https://rancher.silverside-gopher.ts.net/`.
|
||||
- Grafana, Prometheus, and Flux UI are also exposed via dedicated Tailscale LoadBalancer services at `http://grafana.silverside-gopher.ts.net/`, `http://prometheus.silverside-gopher.ts.net/`, `http://flux.silverside-gopher.ts.net:9001/`.
|
||||
|
||||
## Important Files
|
||||
|
||||
- `terraform/main.tf` — provider and version pins
|
||||
- `terraform/variables.tf` — input surface and defaults
|
||||
- `terraform/firewall.tf` — firewall rules (tailnet CIDR, internal cluster ports)
|
||||
- `ansible/site.yml` — ordered bootstrap playbook (roles: common → k3s-server → ccm → k3s-agent → doppler → tailscale-cleanup)
|
||||
- `ansible/generate_inventory.py` — renders `ansible/inventory.ini` from Terraform outputs via Jinja2
|
||||
- `clusters/prod/flux-system/` — Flux GitRepository and top-level Kustomization resources
|
||||
- `infrastructure/addons/kustomization.yaml` — root addon graph with dependency ordering
|
||||
- `infrastructure/addons/<addon>/` — each addon is a self-contained dir with its own `kustomization.yaml`
|
||||
- `.gitea/workflows/deploy.yml` — canonical CI: terraform → ansible → flux bootstrap → B2 restore → health checks
|
||||
|
||||
## Build / Validate / Test
|
||||
|
||||
### Terraform
|
||||
|
||||
- Format: `terraform -chdir=terraform fmt -recursive`
|
||||
- Check formatting: `terraform -chdir=terraform fmt -check -recursive`
|
||||
- Validate: `terraform -chdir=terraform validate`
|
||||
- Plan (full): `terraform -chdir=terraform plan -var-file=../terraform.tfvars`
|
||||
- Plan one resource: `terraform -chdir=terraform plan -var-file=../terraform.tfvars -target='hcloud_server.control_plane[0]'` (quote the address so the shell does not glob the brackets)
|
||||
- Apply: `terraform -chdir=terraform apply -var-file=../terraform.tfvars`
|
||||
- State inspection: `terraform -chdir=terraform state list` / `terraform -chdir=terraform state show <address>`
|
||||
|
||||
### Ansible
|
||||
|
||||
- Install collections: `ansible-galaxy collection install -r ansible/requirements.yml`
|
||||
- Generate inventory: `cd ansible && python3 generate_inventory.py` (requires Terraform outputs)
|
||||
- Syntax check: `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`
|
||||
- Dry-run one host: `ansible-playbook -i ansible/inventory.ini ansible/site.yml --check --diff -l 'control_plane[0]'`
|
||||
- Full bootstrap: `ansible-playbook ansible/site.yml`
|
||||
- Targeted: `ansible-playbook ansible/site.yml -t upgrade` or `-t reset`
|
||||
- Dashboards only: `ansible-playbook ansible/dashboards.yml`
|
||||
|
||||
### Python
|
||||
|
||||
- Syntax check: `python3 -m py_compile ansible/generate_inventory.py`
|
||||
- Run: `cd ansible && python3 generate_inventory.py`
|
||||
|
||||
### Kubernetes / Flux manifests
|
||||
|
||||
- Render single addon: `kubectl kustomize infrastructure/addons/<addon>`
|
||||
- Render cluster bootstrap: `kubectl kustomize clusters/prod/flux-system`
|
||||
- Validate only the directory you edited, not the whole repo.
|
||||
|
||||
### Kubeconfig refresh
|
||||
|
||||
- Preferred: `scripts/refresh-kubeconfig.sh <cp1-public-ip>`
|
||||
- Manual: `ssh -i ~/.ssh/infra root@<cp1-ip> "cat /etc/rancher/k3s/k3s.yaml" | sed 's/127.0.0.1/<cp1-ip>/g' > outputs/kubeconfig`
|
||||
|
||||
## Code Style
|
||||
|
||||
### General
|
||||
|
||||
- Match existing style in adjacent files. No new tools/frameworks unless the repo already uses them.
|
||||
- Prefer ASCII. Keep diffs minimal. No unrelated cleanup.
|
||||
- No comments unless the logic is non-obvious.
|
||||
|
||||
### Terraform / HCL
|
||||
|
||||
- 2-space indent. `terraform {}` block first, then providers, locals, variables, resources, outputs.
|
||||
- `snake_case` for variables, locals, resources. Descriptions on all variables/outputs.
|
||||
- `sensitive = true` on secrets. Run `terraform fmt` instead of hand-formatting.
|
||||
- Use `locals` for reused or non-trivial logic. Explicit `depends_on` only when required.
|
||||
|
||||
### Ansible / YAML
|
||||
|
||||
- 2-space YAML indent. Descriptive task names in sentence case.
|
||||
- Idempotent tasks: `changed_when: false` and `failed_when: false` for probes.
|
||||
- `command`/`shell` only when no dedicated module fits. `shell` only for pipes/redirection/heredocs.
|
||||
- `when` guards and `default(...)` filters over duplicated tasks.
|
||||
- Role names and filenames: kebab-case. Variables: snake_case.
|
||||
- Multi-line shell in workflows: `set -e` or `set -euo pipefail` for fail-fast.
|
||||
|
||||
### Kubernetes / Flux YAML
|
||||
|
||||
- One object per file. Kebab-case filenames matching repo patterns: `helmrelease-*.yaml`, `kustomization-*.yaml`, `*-externalsecret.yaml`.
|
||||
- Addon manifests live in `infrastructure/addons/<addon>/` with a `kustomization.yaml`.
|
||||
- Flux graph objects in `clusters/prod/flux-system/`.
|
||||
- Each addon gets a `kustomization-<addon>.yaml` entry in `infrastructure/addons/` with `dependsOn` for ordering.
|
||||
- Quote strings with `:`, `*`, cron expressions, or shell-sensitive chars.
|
||||
- Preserve existing labels/annotations unless the change specifically requires modifying them.
|
||||
|
||||
### Python
|
||||
|
||||
- PEP 8. Imports ordered: stdlib, third-party, local. `snake_case` for functions/variables.
|
||||
- Scripts small and explicit. Exit non-zero on failure. Clear subprocess error handling.
|
||||
|
||||
## Known Issues & Workarounds
|
||||
|
||||
- **rancher-backup post-install job** (`rancher-backup-patch-sa`) uses a postRenderer in the HelmRelease to replace the broken `rancher/kuberlr-kubectl` image with `rancher/kubectl`. Do NOT set `s3` block in HelmRelease values — put S3 config in the Backup CR instead.
|
||||
- **B2 ExternalSecret** must use key names `accessKey` and `secretKey` (not `aws_access_key_id`/`aws_secret_access_key`).
|
||||
- **Stale Tailscale devices**: After cluster rebuild, delete stale offline `rancher` devices before booting. The `tailscale-cleanup` Ansible role handles this via the Tailscale API.
|
||||
- **Restricted B2 keys**: `b2_authorize_account` may return `allowed.bucketId: null`. CI falls back to `b2_list_buckets` to resolve bucket ID by name.
|
||||
|
||||
## Secrets / Security
|
||||
|
||||
- Never commit tokens, passwords, kubeconfigs, private keys, or generated secrets.
|
||||
- Runtime secrets via Gitea secrets (CI), Doppler, or External Secrets Operator.
|
||||
- `terraform.tfvars` and `outputs/` are gitignored. Never print secret values in logs or commits.
|
||||
|
||||
## CI Pipeline (`.gitea/workflows/deploy.yml`)
|
||||
|
||||
1. Terraform: fmt check → init → validate → import existing servers → plan → apply (main only)
|
||||
2. Ansible: install deps → generate inventory → run site.yml with extra vars (secrets injected from Gitea)
|
||||
3. Flux bootstrap: install kubectl/flux → rewrite kubeconfig → apply CRDs → apply graph → wait for addons
|
||||
4. Rancher wait: wait for Rancher and backup operator to be ready
|
||||
5. B2 restore: authorize B2 → find latest backup → create Restore CR → poll until ready
|
||||
6. Health checks: nodes, Flux objects, pods, storage class
|
||||
|
||||
## Editing Practices
|
||||
|
||||
- Read target file and adjacent patterns before editing.
|
||||
- Run the narrowest validation command after edits.
|
||||
- If you make a live-cluster workaround, also update the declarative manifests so Flux can own it.
|
||||
- Changes spanning Terraform + Ansible + Flux: update and verify each layer separately.
|
||||
- Check `git status` before and after changes.
|
||||
|
||||
## Cursor / Copilot Rules
|
||||
|
||||
- No `.cursor/rules/`, `.cursorrules`, or `.github/copilot-instructions.md` files exist.
|
||||
- If added later, mirror their guidance here and treat them as authoritative.
|
||||
20
README.md
20
README.md
@@ -11,7 +11,7 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
|
||||
| **Total Cost** | €28.93/mo |
|
||||
| **K8s** | k3s (latest, HA) |
|
||||
| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
|
||||
| **Access** | SSH/API restricted to Tailnet |
|
||||
| **Access** | SSH/API and Rancher UI restricted to Tailnet |
|
||||
| **Bootstrap** | Terraform + Ansible |
|
||||
|
||||
### Cluster Resources
|
||||
@@ -234,10 +234,17 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
|
||||
### Current addon status
|
||||
|
||||
- Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
|
||||
- Active Flux addons include `addon-ccm`, `addon-csi`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`, `addon-observability`, and `addon-observability-content`.
|
||||
- Active Flux addons for stable baseline: `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`.
|
||||
- Deferred addons: `addon-ccm`, `addon-csi`, `addon-observability`, `addon-observability-content` (to be added after baseline is stable).
|
||||
- Ansible is limited to cluster bootstrap, private-access setup, and prerequisite secret creation for Flux-managed addons.
|
||||
- `addon-flux-ui` is optional for the stable-baseline phase and is not a blocker for rebuild success.
|
||||
|
||||
### Rancher access
|
||||
|
||||
- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/dashboard/`.
|
||||
- The public Hetzner load balancer path is not used for Rancher.
|
||||
- Rancher uses the CNPG-backed PostgreSQL cluster in `cnpg-cluster`.
|
||||
|
||||
### Stable baseline acceptance
|
||||
|
||||
A rebuild is considered successful only when all of the following pass without manual intervention:
|
||||
@@ -245,12 +252,13 @@ A rebuild is considered successful only when all of the following pass without m
|
||||
- Terraform create succeeds for the default `1` control plane and `2` workers.
|
||||
- Ansible bootstrap succeeds end-to-end.
|
||||
- All nodes become `Ready`.
|
||||
- `hcloud-cloud-controller-manager` and `hcloud-csi` are `Ready`.
|
||||
- Required External Secrets sync successfully.
|
||||
- Tailscale private access works.
|
||||
- Grafana and Prometheus are reachable privately.
|
||||
- Flux core reconciliation is healthy.
|
||||
- External Secrets Operator is ready.
|
||||
- Tailscale operator is ready.
|
||||
- Terraform destroy succeeds cleanly or succeeds after workflow retries.
|
||||
|
||||
_Note: Observability stack (Grafana/Prometheus) is deferred and will be added once the core platform baseline is stable._
|
||||
|
||||
## Observability Stack
|
||||
|
||||
Flux deploys a lightweight observability stack in the `observability` namespace:
|
||||
|
||||
@@ -4,25 +4,32 @@ This document defines the current engineering target for this repository.
|
||||
|
||||
## Topology
|
||||
|
||||
- 1 control plane
|
||||
- 2 workers
|
||||
- 3 control planes (HA etcd cluster)
|
||||
- 3 workers
|
||||
- Hetzner Load Balancer for Kubernetes API
|
||||
- private Hetzner network
|
||||
- Tailscale operator access
|
||||
- Rancher UI exposed only through Tailscale (`rancher.silverside-gopher.ts.net`)
|
||||
|
||||
## In Scope
|
||||
|
||||
- Terraform infrastructure bootstrap
|
||||
- Ansible k3s bootstrap
|
||||
- Ansible k3s bootstrap with external cloud provider
|
||||
- **HA control plane (3 nodes with etcd quorum)**
|
||||
- **Hetzner Load Balancer for Kubernetes API**
|
||||
- **Hetzner CCM deployed via Ansible (before workers join)**
|
||||
- **Hetzner CSI for persistent volumes (via Flux)**
|
||||
- Flux core reconciliation
|
||||
- Hetzner CCM
|
||||
- Hetzner CSI
|
||||
- External Secrets Operator with Doppler
|
||||
- Tailscale private access
|
||||
- Observability stack
|
||||
- Persistent volume provisioning validated
|
||||
|
||||
## Deferred for Later Phases
|
||||
|
||||
- Observability stack (deferred - complex helm release needs separate debugging)
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- HA control plane
|
||||
- public ingress or DNS
|
||||
- public TLS
|
||||
- app workloads
|
||||
@@ -31,17 +38,28 @@ This document defines the current engineering target for this repository.
|
||||
|
||||
## Phase Gates
|
||||
|
||||
1. Terraform apply completes for the default topology.
|
||||
2. k3s server bootstrap completes and kubeconfig works.
|
||||
3. Workers join and all nodes are Ready.
|
||||
4. Flux source and infrastructure reconciliation are healthy.
|
||||
5. CCM is Ready.
|
||||
6. CSI is Ready and a PVC can bind.
|
||||
7. External Secrets sync required secrets.
|
||||
8. Tailscale private access works.
|
||||
9. Observability is healthy and reachable privately.
|
||||
10. Terraform destroy succeeds cleanly or via workflow retry.
|
||||
1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB).
|
||||
2. Load Balancer is healthy with all 3 control plane targets.
|
||||
3. Primary control plane bootstraps with `--cluster-init`.
|
||||
4. Secondary control planes join via Load Balancer endpoint.
|
||||
5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue).
|
||||
6. Workers join successfully via Load Balancer and all nodes show proper `providerID`.
|
||||
7. etcd reports 3 healthy members.
|
||||
8. Flux source and infrastructure reconciliation are healthy.
|
||||
9. **CSI deploys and creates `hcloud-volumes` StorageClass**.
|
||||
10. **PVC provisioning tested and working**.
|
||||
11. External Secrets sync required secrets.
|
||||
12. Tailscale private access works, including Rancher UI access.
|
||||
13. Terraform destroy succeeds cleanly or via workflow retry.
|
||||
|
||||
## Success Criteria
|
||||
|
||||
The baseline is considered stable only after two consecutive fresh rebuilds pass all phase gates with no manual fixes.
|
||||
✅ **ACHIEVED** - HA Cluster with CCM/CSI:
|
||||
- Build 1: Initial CCM/CSI deployment and validation (2026-03-23)
|
||||
- Build 2: Full destroy/rebuild cycle successful (2026-03-23)
|
||||
|
||||
🔄 **IN PROGRESS** - HA Control Plane Validation:
|
||||
- Build 3: Deploy 3-3 topology with Load Balancer
|
||||
- Build 4: Destroy/rebuild to validate HA configuration
|
||||
|
||||
Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes.
|
||||
|
||||
@@ -32,6 +32,7 @@ def main():
|
||||
worker_names = outputs["worker_names"]["value"]
|
||||
worker_ips = outputs["worker_ips"]["value"]
|
||||
worker_private_ips = outputs["worker_private_ips"]["value"]
|
||||
# Use the load balancer IP when Terraform exported one; fall back to the first
# control plane when the output is missing, null, or an empty string.
kube_api_lb_ip = outputs.get("kube_api_lb_ip", {}).get("value") or control_plane_ips[0]
|
||||
|
||||
control_planes = [
|
||||
{
|
||||
@@ -59,6 +60,7 @@ def main():
|
||||
"control_planes": control_planes,
|
||||
"workers": workers,
|
||||
"private_key_file": outputs["ssh_private_key_path"]["value"],
|
||||
"kube_api_lb_ip": kube_api_lb_ip,
|
||||
}
|
||||
|
||||
env = Environment(loader=FileSystemLoader("."))
|
||||
|
||||
@@ -17,3 +17,4 @@ ansible_user=root
|
||||
ansible_python_interpreter=/usr/bin/python3
|
||||
ansible_ssh_private_key_file={{ private_key_file }}
|
||||
k3s_version=latest
|
||||
kube_api_endpoint={{ kube_api_lb_ip }}
|
||||
|
||||
@@ -3,3 +3,5 @@ collections:
|
||||
version: ">=2.4.0"
|
||||
- name: community.general
|
||||
version: ">=8.0.0"
|
||||
- name: community.network
|
||||
version: ">=5.0.0"
|
||||
|
||||
82
ansible/roles/ccm-deploy/tasks/main.yml
Normal file
82
ansible/roles/ccm-deploy/tasks/main.yml
Normal file
@@ -0,0 +1,82 @@
|
||||
---
# Deploys the Hetzner Cloud Controller Manager with Helm, waits for its
# rollout, and reports any remaining "uninitialized" cloud-provider taints.

- name: Check if hcloud secret exists
  ansible.builtin.command: kubectl -n kube-system get secret hcloud
  register: hcloud_secret_check
  changed_when: false
  failed_when: false  # the return code is evaluated by the next task

- name: Fail if hcloud secret is missing
  ansible.builtin.fail:
    msg: "hcloud secret not found in kube-system namespace. CCM requires it."
  when: hcloud_secret_check.rc != 0

- name: Check if helm is installed
  ansible.builtin.command: which helm
  register: helm_check
  changed_when: false
  failed_when: false  # a non-zero rc only means helm has to be installed

- name: Install helm
  when: helm_check.rc != 0
  block:
    - name: Download helm install script
      ansible.builtin.get_url:
        url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
        dest: /tmp/get-helm-3.sh
        mode: "0755"

    - name: Run helm install script
      ansible.builtin.command: /tmp/get-helm-3.sh
      args:
        creates: /usr/local/bin/helm

- name: Add Hetzner Helm repository
  kubernetes.core.helm_repository:
    name: hcloud
    repo_url: https://charts.hetzner.cloud
    kubeconfig: /etc/rancher/k3s/k3s.yaml
  environment:
    KUBECONFIG: /etc/rancher/k3s/k3s.yaml

- name: Deploy Hetzner Cloud Controller Manager
  kubernetes.core.helm:
    name: hcloud-cloud-controller-manager
    chart_ref: hcloud/hcloud-cloud-controller-manager
    release_namespace: kube-system
    create_namespace: true
    values:
      networking:
        enabled: true
      # Pins the CCM pod to the node whose hostname label equals the host
      # this role is running against.
      nodeSelector:
        kubernetes.io/hostname: "{{ inventory_hostname }}"
      additionalTolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
    kubeconfig: /etc/rancher/k3s/k3s.yaml
    wait: true
    wait_timeout: 300s
  environment:
    KUBECONFIG: /etc/rancher/k3s/k3s.yaml

- name: Wait for CCM to be ready
  ansible.builtin.command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s
  changed_when: false
  register: ccm_rollout
  until: ccm_rollout.rc == 0
  retries: 3
  delay: 10

- name: Pause to ensure CCM is fully ready to process new nodes
  ansible.builtin.pause:
    seconds: 10

# Informational only: the result is displayed below and never fails the play.
- name: Verify CCM is removing uninitialized taints
  ansible.builtin.command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
  register: uninitialized_taints
  changed_when: false
  failed_when: false

# The jsonpath query above prints the taint key once for every node that still
# carries it (not the node names), so the message is worded accordingly.
- name: Display taint status
  ansible.builtin.debug:
    msg: "Uninitialized taint entries still present (one per affected node): {{ uninitialized_taints.stdout | default('') }}"
|
||||
@@ -3,4 +3,4 @@ k3s_version: latest
|
||||
k3s_server_url: ""
|
||||
k3s_token: ""
|
||||
k3s_node_ip: ""
|
||||
k3s_kubelet_cloud_provider_external: false
|
||||
k3s_kubelet_cloud_provider_external: true
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
command: >-
|
||||
/tmp/install-k3s.sh agent
|
||||
--node-ip {{ k3s_node_ip }}
|
||||
--flannel-iface=enp7s0
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
args:
|
||||
creates: /usr/local/bin/k3s-agent
|
||||
|
||||
@@ -5,4 +5,12 @@ k3s_node_ip: ""
|
||||
k3s_primary_public_ip: ""
|
||||
k3s_disable_embedded_ccm: true
|
||||
k3s_disable_servicelb: true
|
||||
k3s_kubelet_cloud_provider_external: false
|
||||
k3s_kubelet_cloud_provider_external: true
|
||||
# Load Balancer endpoint for HA cluster joins (set in inventory)
|
||||
kube_api_endpoint: ""
|
||||
# Tailscale DNS names for control planes (to enable tailnet access)
|
||||
# Using DNS names instead of IPs since Tailscale IPs change on rebuild
|
||||
tailscale_control_plane_names:
|
||||
- "k8s-cluster-cp-1.silverside-gopher.ts.net"
|
||||
- "k8s-cluster-cp-2.silverside-gopher.ts.net"
|
||||
- "k8s-cluster-cp-3.silverside-gopher.ts.net"
|
||||
|
||||
@@ -15,9 +15,9 @@
|
||||
set_fact:
|
||||
k3s_install_needed: "{{ (not k3s_service.stat.exists) or ((k3s_service_state.stdout | default('')) != 'active') }}"
|
||||
|
||||
- name: Wait for primary API on 6443 (secondary only)
|
||||
- name: Wait for API endpoint on 6443 (secondary only)
|
||||
wait_for:
|
||||
host: "{{ k3s_primary_ip }}"
|
||||
host: "{{ k3s_join_endpoint | default(k3s_primary_ip) }}"
|
||||
port: 6443
|
||||
state: started
|
||||
timeout: 120
|
||||
@@ -61,12 +61,15 @@
|
||||
--cluster-init
|
||||
--advertise-address={{ k3s_primary_ip }}
|
||||
--node-ip={{ k3s_node_ip }}
|
||||
--flannel-iface=enp7s0
|
||||
--tls-san={{ k3s_primary_ip }}
|
||||
--tls-san={{ k3s_primary_public_ip }}
|
||||
{% if kube_api_endpoint | default('') | length > 0 %}--tls-san={{ kube_api_endpoint }}{% endif %}
|
||||
{% for name in tailscale_control_plane_names %}--tls-san={{ name }} {% endfor %}
|
||||
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
|
||||
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
when:
|
||||
when:
|
||||
- k3s_install_needed
|
||||
- k3s_primary | default(false)
|
||||
|
||||
@@ -81,9 +84,10 @@
|
||||
K3S_TOKEN: "{{ k3s_token }}"
|
||||
command: >-
|
||||
/tmp/install-k3s.sh server
|
||||
--server https://{{ k3s_primary_ip }}:6443
|
||||
--server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
|
||||
--advertise-address={{ k3s_node_ip }}
|
||||
--node-ip={{ k3s_node_ip }}
|
||||
--flannel-iface=enp7s0
|
||||
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
|
||||
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
---
|
||||
- name: Create systemd unit for Grafana private access
|
||||
template:
|
||||
src: kubectl-port-forward.service.j2
|
||||
dest: /etc/systemd/system/k8s-portforward-grafana.service
|
||||
mode: "0644"
|
||||
vars:
|
||||
unit_description: Port-forward Grafana for Tailscale access
|
||||
unit_namespace: observability
|
||||
unit_target: svc/observability-kube-prometheus-stack-grafana
|
||||
unit_local_port: 13080
|
||||
unit_remote_port: 80
|
||||
|
||||
- name: Create systemd unit for Prometheus private access
|
||||
template:
|
||||
src: kubectl-port-forward.service.j2
|
||||
dest: /etc/systemd/system/k8s-portforward-prometheus.service
|
||||
mode: "0644"
|
||||
vars:
|
||||
unit_description: Port-forward Prometheus for Tailscale access
|
||||
unit_namespace: observability
|
||||
unit_target: svc/observability-kube-prometh-prometheus
|
||||
unit_local_port: 19090
|
||||
unit_remote_port: 9090
|
||||
|
||||
- name: Create systemd unit for Flux UI private access
|
||||
template:
|
||||
src: kubectl-port-forward.service.j2
|
||||
dest: /etc/systemd/system/k8s-portforward-flux-ui.service
|
||||
mode: "0644"
|
||||
vars:
|
||||
unit_description: Port-forward Flux UI for Tailscale access
|
||||
unit_namespace: flux-system
|
||||
unit_target: svc/flux-system-weave-gitops
|
||||
unit_local_port: 19001
|
||||
unit_remote_port: 9001
|
||||
|
||||
- name: Reload systemd
|
||||
systemd:
|
||||
daemon_reload: true
|
||||
|
||||
- name: Enable and start private access port-forward services
|
||||
systemd:
|
||||
name: "{{ item }}"
|
||||
enabled: true
|
||||
state: started
|
||||
loop:
|
||||
- k8s-portforward-grafana.service
|
||||
- k8s-portforward-prometheus.service
|
||||
- k8s-portforward-flux-ui.service
|
||||
|
||||
- name: Configure Tailscale Serve for private access endpoints
|
||||
shell: >-
|
||||
tailscale serve reset &&
|
||||
tailscale serve --bg --tcp={{ private_access_grafana_port }} tcp://127.0.0.1:13080 &&
|
||||
tailscale serve --bg --tcp={{ private_access_prometheus_port }} tcp://127.0.0.1:19090 &&
|
||||
tailscale serve --bg --tcp={{ private_access_flux_port }} tcp://127.0.0.1:19001
|
||||
changed_when: true
|
||||
@@ -1,13 +0,0 @@
|
||||
[Unit]
|
||||
Description={{ unit_description }}
|
||||
After=network-online.target k3s.service
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
ExecStart=/usr/local/bin/kubectl -n {{ unit_namespace }} port-forward --address 127.0.0.1 {{ unit_target }} {{ unit_local_port }}:{{ unit_remote_port }}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
53
ansible/roles/tailscale-cleanup/tasks/main.yml
Normal file
53
ansible/roles/tailscale-cleanup/tasks/main.yml
Normal file
@@ -0,0 +1,53 @@
|
||||
---
# Removes Tailscale devices whose hostname is in `tailscale_reserved_hostnames`
# and that are not reported as online. The whole block is skipped when no
# API key is supplied.

- name: Delete stale Tailscale devices with reserved hostnames
  when:
    - tailscale_api_key is defined
    - tailscale_api_key | length > 0
  block:
    - name: Get Tailscale devices from API
      ansible.builtin.uri:
        method: GET
        url: "https://api.tailscale.com/api/v2/tailnet/{{ tailscale_tailnet }}/devices"
        headers:
          Authorization: "Bearer {{ tailscale_api_key }}"
        return_content: true
      register: ts_devices

    # A device counts as stale when it has a reserved hostname and either has
    # no `online` attribute at all or has one that is not true. The two cases
    # are filtered separately so the `equalto` test never sees a missing
    # attribute.
    - name: Find stale devices matching reserved hostnames
      vars:
        reserved_devices: >-
          {{ ts_devices.json.devices | default([])
             | selectattr('hostname', 'defined')
             | selectattr('hostname', 'in', tailscale_reserved_hostnames)
             | list }}
      ansible.builtin.set_fact:
        stale_devices: >-
          {{ (reserved_devices | rejectattr('online', 'defined') | list)
             +
             (reserved_devices
              | selectattr('online', 'defined')
              | rejectattr('online', 'equalto', true)
              | list) }}

    - name: Delete stale devices
      when: stale_devices | length > 0
      ansible.builtin.uri:
        method: DELETE
        url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
        headers:
          Authorization: "Bearer {{ tailscale_api_key }}"
        status_code: 200
      loop: "{{ stale_devices }}"
      loop_control:
        label: "{{ item.name }} ({{ item.id }})"

    - name: Report cleaned devices
      when: stale_devices | length > 0
      ansible.builtin.debug:
        msg: "Deleted stale Tailscale device: {{ item.name }}"
      loop: "{{ stale_devices }}"

    - name: No stale devices found
      when: stale_devices | length == 0
      ansible.builtin.debug:
        msg: "No stale Tailscale devices found."
|
||||
@@ -24,6 +24,7 @@
|
||||
k3s_primary_public_ip: "{{ ansible_host }}"
|
||||
k3s_primary_ip: "{{ k3s_private_ip }}"
|
||||
k3s_node_ip: "{{ k3s_private_ip }}"
|
||||
# kube_api_endpoint is set in inventory group_vars
|
||||
|
||||
roles:
|
||||
- k3s-server
|
||||
@@ -49,6 +50,20 @@
|
||||
dest: ../outputs/kubeconfig
|
||||
flat: true
|
||||
|
||||
- name: Bootstrap addon prerequisite secrets
|
||||
hosts: control_plane[0]
|
||||
become: true
|
||||
|
||||
roles:
|
||||
- addon-secrets-bootstrap
|
||||
|
||||
- name: Deploy Hetzner CCM (required for workers with external cloud provider)
|
||||
hosts: control_plane[0]
|
||||
become: true
|
||||
|
||||
roles:
|
||||
- ccm-deploy
|
||||
|
||||
- name: Setup secondary control planes
|
||||
hosts: control_plane[1:]
|
||||
become: true
|
||||
@@ -59,6 +74,8 @@
|
||||
k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}"
|
||||
k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}"
|
||||
k3s_node_ip: "{{ k3s_private_ip }}"
|
||||
# Use Load Balancer for HA - all control planes join via LB endpoint
|
||||
# default(..., true) also falls back when kube_api_endpoint is defined but empty
# (the k3s-server role default is "").
k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'], true) }}"
|
||||
|
||||
roles:
|
||||
- k3s-server
|
||||
@@ -69,19 +86,13 @@
|
||||
|
||||
vars:
|
||||
k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
|
||||
k3s_server_url: "https://{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}:6443"
|
||||
# Use Load Balancer for HA - workers join via LB endpoint
|
||||
# default(..., true) also falls back when kube_api_endpoint is defined but empty.
k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'], true) }}:6443"
|
||||
k3s_node_ip: "{{ k3s_private_ip }}"
|
||||
|
||||
roles:
|
||||
- k3s-agent
|
||||
|
||||
- name: Bootstrap addon prerequisite secrets
|
||||
hosts: control_plane[0]
|
||||
become: true
|
||||
|
||||
roles:
|
||||
- addon-secrets-bootstrap
|
||||
|
||||
- name: Deploy observability stack
|
||||
hosts: control_plane[0]
|
||||
become: true
|
||||
@@ -98,17 +109,6 @@
|
||||
- role: observability-content
|
||||
when: not (observability_gitops_enabled | default(true) | bool)
|
||||
|
||||
- name: Configure private tailnet access
|
||||
hosts: control_plane[0]
|
||||
become: true
|
||||
vars:
|
||||
private_access_grafana_port: 30080
|
||||
private_access_prometheus_port: 30990
|
||||
private_access_flux_port: 30901
|
||||
|
||||
roles:
|
||||
- private-access
|
||||
|
||||
- name: Bootstrap Doppler access for External Secrets
|
||||
hosts: control_plane[0]
|
||||
become: true
|
||||
@@ -116,13 +116,23 @@
|
||||
roles:
|
||||
- doppler-bootstrap
|
||||
|
||||
- name: Clean up stale Tailscale devices
|
||||
hosts: localhost
|
||||
connection: local
|
||||
vars:
|
||||
tailscale_reserved_hostnames:
|
||||
- rancher
|
||||
|
||||
roles:
|
||||
- tailscale-cleanup
|
||||
|
||||
- name: Finalize
|
||||
hosts: localhost
|
||||
connection: local
|
||||
tasks:
|
||||
- name: Update kubeconfig server address
|
||||
command: |
|
||||
sed -i 's/127.0.0.1/{{ groups["control_plane"][0] }}.{{ tailscale_tailnet }}/g' ../outputs/kubeconfig
|
||||
sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig
|
||||
changed_when: true
|
||||
|
||||
- name: Display success message
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
# Flux HelmRelease: installs the cert-manager chart (pinned to v1.17.2) from
# the `jetstack` HelmRepository into the cert-manager namespace.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: cert-manager
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cert-manager
  chart:
    spec:
      chart: cert-manager
      version: "v1.17.2"
      sourceRef:
        kind: HelmRepository
        name: jetstack
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    # Let the chart install its own CRDs.
    crds:
      enabled: true
    replicaCount: 1
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
      limits:
        cpu: 250m
        memory: 256Mi
|
||||
@@ -0,0 +1,8 @@
|
||||
# Flux HelmRepository for the Jetstack chart index, refreshed hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: jetstack
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.jetstack.io
|
||||
6
infrastructure/addons/cert-manager/kustomization.yaml
Normal file
6
infrastructure/addons/cert-manager/kustomization.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
# Kustomize entry point for the cert-manager addon.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-cert-manager.yaml
  - helmrelease-cert-manager.yaml
|
||||
6
infrastructure/addons/cert-manager/namespace.yaml
Normal file
6
infrastructure/addons/cert-manager/namespace.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
# Namespace for cert-manager; the label opts it out of Flux pruning.
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager
  labels:
    kustomize.toolkit.fluxcd.io/prune: disabled
|
||||
19
infrastructure/addons/flux-ui/flux-tailscale-service.yaml
Normal file
19
infrastructure/addons/flux-ui/flux-tailscale-service.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
# LoadBalancer Service of class `tailscale` that selects the weave-gitops pods
# and requests the tailnet hostname `flux`.
apiVersion: v1
kind: Service
metadata:
  name: flux-tailscale
  namespace: flux-system
  annotations:
    tailscale.com/hostname: flux
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: weave-gitops
    app.kubernetes.io/instance: flux-system-weave-gitops
  ports:
    - name: http
      protocol: TCP
      port: 9001
      targetPort: http  # named container port
|
||||
@@ -27,9 +27,12 @@ spec:
|
||||
adminUser:
|
||||
create: true
|
||||
createClusterRole: true
|
||||
createSecret: false
|
||||
createSecret: false # Secret is managed by External Secret from Doppler
|
||||
username: admin
|
||||
rbac:
|
||||
create: true
|
||||
impersonationResourceNames:
|
||||
- admin
|
||||
viewSecretsResourceNames:
|
||||
- cluster-user-auth
|
||||
- oidc-auth
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: flux-ui
|
||||
namespace: flux-system
|
||||
annotations:
|
||||
traefik.ingress.kubernetes.io/router.entrypoints: flux
|
||||
spec:
|
||||
ingressClassName: traefik
|
||||
rules:
|
||||
- http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: flux-system-weave-gitops
|
||||
port:
|
||||
number: 9001
|
||||
@@ -4,5 +4,4 @@ resources:
|
||||
- cluster-user-auth-externalsecret.yaml
|
||||
- gitrepository-weave-gitops.yaml
|
||||
- helmrelease-weave-gitops.yaml
|
||||
- traefik-helmchartconfig-flux-entrypoint.yaml
|
||||
- ingress-flux-ui.yaml
|
||||
- flux-tailscale-service.yaml
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
apiVersion: helm.cattle.io/v1
|
||||
kind: HelmChartConfig
|
||||
metadata:
|
||||
name: traefik
|
||||
namespace: kube-system
|
||||
spec:
|
||||
valuesContent: |-
|
||||
additionalArguments:
|
||||
- "--entryPoints.flux.address=:9001/tcp"
|
||||
@@ -11,5 +11,5 @@ spec:
|
||||
name: platform
|
||||
path: ./infrastructure/addons/ccm
|
||||
wait: true
|
||||
timeout: 5m
|
||||
suspend: true
|
||||
timeout: 10m
|
||||
suspend: false
|
||||
|
||||
15
infrastructure/addons/kustomization-cert-manager.yaml
Normal file
15
infrastructure/addons/kustomization-cert-manager.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
# Flux Kustomization reconciling ./infrastructure/addons/cert-manager from the
# `platform` GitRepository.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-cert-manager
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/cert-manager
  wait: true
  timeout: 10m
  suspend: false
|
||||
@@ -13,5 +13,5 @@ spec:
|
||||
dependsOn:
|
||||
- name: addon-ccm
|
||||
wait: true
|
||||
timeout: 5m
|
||||
suspend: true
|
||||
timeout: 10m
|
||||
suspend: false
|
||||
|
||||
@@ -12,6 +12,8 @@ spec:
|
||||
path: ./infrastructure/addons/flux-ui
|
||||
dependsOn:
|
||||
- name: addon-external-secrets
|
||||
- name: addon-tailscale-operator
|
||||
- name: addon-tailscale-proxyclass
|
||||
wait: true
|
||||
timeout: 5m
|
||||
suspend: false
|
||||
|
||||
@@ -12,6 +12,8 @@ spec:
|
||||
path: ./infrastructure/addons/observability
|
||||
dependsOn:
|
||||
- name: addon-external-secrets
|
||||
- name: addon-tailscale-operator
|
||||
- name: addon-tailscale-proxyclass
|
||||
wait: true
|
||||
timeout: 5m
|
||||
suspend: false
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
# Flux Kustomization reconciling ./infrastructure/addons/rancher-backup-config;
# applied only after addon-rancher-backup.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-backup-config
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-backup-config
  timeout: 5m
  suspend: false
  dependsOn:
    - name: addon-rancher-backup
|
||||
18
infrastructure/addons/kustomization-rancher-backup.yaml
Normal file
18
infrastructure/addons/kustomization-rancher-backup.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
# Flux Kustomization reconciling ./infrastructure/addons/rancher-backup;
# applied after External Secrets and Rancher.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-backup
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-backup
  wait: true
  timeout: 10m
  suspend: false
  dependsOn:
    - name: addon-external-secrets
    - name: addon-rancher
|
||||
17
infrastructure/addons/kustomization-rancher-config.yaml
Normal file
17
infrastructure/addons/kustomization-rancher-config.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Flux Kustomization reconciling ./infrastructure/addons/rancher-config;
# applied after addon-rancher.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-config
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-config
  dependsOn:
    - name: addon-rancher
  wait: true
  timeout: 5m
  suspend: false
|
||||
20
infrastructure/addons/kustomization-rancher.yaml
Normal file
20
infrastructure/addons/kustomization-rancher.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Flux Kustomization reconciling ./infrastructure/addons/rancher; applied after
# the Tailscale operator and proxy class, External Secrets, and cert-manager.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher
  wait: true
  timeout: 15m
  suspend: false
  dependsOn:
    - name: addon-tailscale-operator
    - name: addon-tailscale-proxyclass
    - name: addon-external-secrets
    - name: addon-cert-manager
|
||||
@@ -12,4 +12,4 @@ spec:
|
||||
path: ./infrastructure/addons/tailscale-operator
|
||||
wait: true
|
||||
timeout: 5m
|
||||
suspend: true
|
||||
suspend: false
|
||||
|
||||
@@ -14,4 +14,4 @@ spec:
|
||||
- name: addon-tailscale-operator
|
||||
wait: true
|
||||
timeout: 5m
|
||||
suspend: true
|
||||
suspend: false
|
||||
|
||||
@@ -4,8 +4,14 @@ resources:
|
||||
- kustomization-ccm.yaml
|
||||
- kustomization-csi.yaml
|
||||
- kustomization-external-secrets.yaml
|
||||
- kustomization-flux-ui.yaml
|
||||
- kustomization-cert-manager.yaml
|
||||
- kustomization-tailscale-operator.yaml
|
||||
- kustomization-tailscale-proxyclass.yaml
|
||||
- traefik
|
||||
- kustomization-flux-ui.yaml
|
||||
- kustomization-observability.yaml
|
||||
- kustomization-observability-content.yaml
|
||||
- kustomization-rancher.yaml
|
||||
- kustomization-rancher-config.yaml
|
||||
- kustomization-rancher-backup.yaml
|
||||
- kustomization-rancher-backup-config.yaml
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: observability
|
||||
spec:
|
||||
ingressClassName: traefik
|
||||
rules:
|
||||
- http:
|
||||
paths:
|
||||
- path: /grafana
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: observability-kube-prometheus-stack-grafana
|
||||
port:
|
||||
number: 80
|
||||
@@ -0,0 +1,18 @@
|
||||
# LoadBalancer Service of class `tailscale` that selects the Grafana pods and
# requests the tailnet hostname `grafana`.
apiVersion: v1
kind: Service
metadata:
  name: grafana-tailscale
  namespace: observability
  annotations:
    tailscale.com/hostname: grafana
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: grafana
  ports:
    - name: http
      protocol: TCP
      port: 80          # port exposed by the Service
      targetPort: 3000  # port on the selected pods
|
||||
@@ -26,12 +26,10 @@ spec:
|
||||
enabled: true
|
||||
admin:
|
||||
existingSecret: grafana-admin-credentials
|
||||
userKey: admin-user
|
||||
passwordKey: admin-password
|
||||
grafana.ini:
|
||||
server:
|
||||
root_url: http://observability/grafana/
|
||||
serve_from_sub_path: true
|
||||
root_url: http://grafana.silverside-gopher.ts.net/
|
||||
serve_from_sub_path: false
|
||||
persistence:
|
||||
enabled: true
|
||||
storageClassName: local-path
|
||||
@@ -51,8 +49,8 @@ spec:
|
||||
service:
|
||||
type: ClusterIP
|
||||
prometheusSpec:
|
||||
externalUrl: http://observability/prometheus/
|
||||
routePrefix: /prometheus/
|
||||
externalUrl: http://prometheus.silverside-gopher.ts.net/
|
||||
routePrefix: /
|
||||
retention: 7d
|
||||
storageSpec:
|
||||
volumeClaimTemplate:
|
||||
|
||||
@@ -3,11 +3,10 @@ kind: Kustomization
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- grafana-admin-externalsecret.yaml
|
||||
- traefik-tailscale-service.yaml
|
||||
- grafana-ingress.yaml
|
||||
- prometheus-ingress.yaml
|
||||
- helmrepository-prometheus-community.yaml
|
||||
- helmrepository-grafana.yaml
|
||||
- helmrelease-kube-prometheus-stack.yaml
|
||||
- helmrelease-loki.yaml
|
||||
- helmrelease-promtail.yaml
|
||||
- grafana-tailscale-service.yaml
|
||||
- prometheus-tailscale-service.yaml
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: prometheus
|
||||
namespace: observability
|
||||
spec:
|
||||
ingressClassName: traefik
|
||||
rules:
|
||||
- http:
|
||||
paths:
|
||||
- path: /prometheus
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: observability-kube-prometh-prometheus
|
||||
port:
|
||||
number: 9090
|
||||
@@ -0,0 +1,19 @@
|
||||
# LoadBalancer Service of class `tailscale` that selects the Prometheus pods
# and requests the tailnet hostname `prometheus`.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-tailscale
  namespace: observability
  annotations:
    tailscale.com/hostname: prometheus
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: prometheus
    operator.prometheus.io/name: observability-kube-prometh-prometheus
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: 9090
|
||||
@@ -1,27 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: traefik-tailscale
|
||||
namespace: kube-system
|
||||
annotations:
|
||||
tailscale.com/hostname: observability
|
||||
tailscale.com/proxy-class: infra-stable
|
||||
spec:
|
||||
type: LoadBalancer
|
||||
loadBalancerClass: tailscale
|
||||
selector:
|
||||
app.kubernetes.io/instance: traefik-kube-system
|
||||
app.kubernetes.io/name: traefik
|
||||
ports:
|
||||
- name: web
|
||||
port: 80
|
||||
protocol: TCP
|
||||
targetPort: web
|
||||
- name: websecure
|
||||
port: 443
|
||||
protocol: TCP
|
||||
targetPort: websecure
|
||||
- name: flux
|
||||
port: 9001
|
||||
protocol: TCP
|
||||
targetPort: 9001
|
||||
@@ -0,0 +1,17 @@
|
||||
# Recurring Rancher backup to an S3-compatible bucket (Backblaze B2 endpoint).
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
  name: rancher-b2-recurring
  namespace: cattle-resources-system
spec:
  resourceSetName: rancher-resource-set-full
  storageLocation:
    s3:
      credentialSecretName: rancher-b2-creds
      credentialSecretNamespace: cattle-resources-system
      bucketName: HetznerTerra
      folder: rancher-backups
      endpoint: s3.us-east-005.backblazeb2.com
      region: us-east-005
  schedule: "0 3 * * *"  # cron: daily at 03:00
  retentionCount: 7
|
||||
@@ -0,0 +1,5 @@
|
||||
# Kustomize entry point for the Rancher backup/restore custom resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - backup-recurring.yaml
  - restore-from-b2.yaml
|
||||
@@ -0,0 +1,19 @@
|
||||
# Uncomment and set backupFilename to restore from a specific backup on rebuild.
|
||||
# Find the latest backup filename in B2: rancher-backups/ folder.
|
||||
# After restore succeeds, Rancher will have all users/settings from the backup.
|
||||
#
|
||||
# apiVersion: resources.cattle.io/v1
|
||||
# kind: Restore
|
||||
# metadata:
|
||||
# name: restore-from-b2
|
||||
# namespace: cattle-resources-system
|
||||
# spec:
|
||||
# backupFilename: rancher-b2-manual-test-0a416444-2c8a-4d34-8a07-d9e406750374-2026-03-30T00-08-02Z.tar.gz
|
||||
# storageLocation:
|
||||
# s3:
|
||||
# credentialSecretName: rancher-b2-creds
|
||||
# credentialSecretNamespace: cattle-resources-system
|
||||
# bucketName: HetznerTerra
|
||||
# folder: rancher-backups
|
||||
# endpoint: s3.us-east-005.backblazeb2.com
|
||||
# region: us-east-005
|
||||
@@ -0,0 +1,25 @@
|
||||
# ExternalSecret that builds the `rancher-b2-creds` Secret from two keys in the
# `doppler-hetznerterra` ClusterSecretStore.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: rancher-b2-creds
  namespace: cattle-resources-system
spec:
  refreshInterval: 1h
  secretStoreRef:
    kind: ClusterSecretStore
    name: doppler-hetznerterra
  target:
    name: rancher-b2-creds
    creationPolicy: Owner
    template:
      type: Opaque
      # Map the fetched values onto the key names written into the Secret.
      data:
        accessKey: "{{ .B2_ACCOUNT_ID }}"
        secretKey: "{{ .B2_APPLICATION_KEY }}"
  data:
    - secretKey: B2_ACCOUNT_ID
      remoteRef:
        key: B2_ACCOUNT_ID
    - secretKey: B2_APPLICATION_KEY
      remoteRef:
        key: B2_APPLICATION_KEY
|
||||
@@ -0,0 +1,23 @@
|
||||
# Flux HelmRelease: installs the rancher-backup-crd chart from the
# `rancher-charts` HelmRepository into cattle-resources-system.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup-crd
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cattle-resources-system
  chart:
    spec:
      chart: rancher-backup-crd
      version: "106.0.2+up8.1.0"
      sourceRef:
        kind: HelmRepository
        name: rancher-charts
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
|
||||
@@ -0,0 +1,42 @@
|
||||
# Flux HelmRelease: installs the rancher-backup chart after its CRD release.
# NOTE(review): the nesting of `kubectl` under `values` is reconstructed from a
# flattened listing - confirm against the committed file.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cattle-resources-system
  dependsOn:
    - name: rancher-backup-crd
  chart:
    spec:
      chart: rancher-backup
      version: "106.0.2+up8.1.0"
      sourceRef:
        kind: HelmRepository
        name: rancher-charts
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    image:
      repository: rancher/backup-restore-operator
    kubectl:
      image:
        repository: rancher/kubectl
        tag: "v1.34.0"
  # Post-render patch: replaces the first container image of the
  # rancher-backup-patch-sa Job with rancher/kubectl:v1.34.0.
  postRenderers:
    - kustomize:
        patches:
          - target:
              kind: Job
              name: rancher-backup-patch-sa
            patch: |
              - op: replace
                path: /spec/template/spec/containers/0/image
                value: rancher/kubectl:v1.34.0
|
||||
@@ -0,0 +1,8 @@
|
||||
# Flux HelmRepository for the Rancher charts index, refreshed hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: rancher-charts
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.rancher.io
|
||||
8
infrastructure/addons/rancher-backup/kustomization.yaml
Normal file
8
infrastructure/addons/rancher-backup/kustomization.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
# Kustomize entry point for the rancher-backup addon.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources:
  # Target namespace for the backup operator
  - namespace.yaml
  # Chart source
  - helmrepository-rancher-backup.yaml
  # CRD chart release, then the operator release
  - helmrelease-rancher-backup-crd.yaml
  - helmrelease-rancher-backup.yaml
  # ExternalSecret manifest for the B2 credentials
  - b2-credentials-externalsecret.yaml
|
||||
4
infrastructure/addons/rancher-backup/namespace.yaml
Normal file
4
infrastructure/addons/rancher-backup/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
# Namespace targeted by the rancher-backup HelmReleases.
kind: Namespace
apiVersion: v1
metadata: {name: cattle-resources-system}
|
||||
4
infrastructure/addons/rancher-config/kustomization.yaml
Normal file
4
infrastructure/addons/rancher-config/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
# Kustomize entry point for the rancher-config addon (a single Setting manifest).
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources: [server-url-setting.yaml]
|
||||
@@ -0,0 +1,5 @@
|
||||
# Rancher `server-url` Setting. The host matches the `hostname` value of the
# rancher HelmRelease.
kind: Setting
apiVersion: management.cattle.io/v3
metadata: {name: server-url}
value: https://rancher.silverside-gopher.ts.net
|
||||
48
infrastructure/addons/rancher/helmrelease-rancher.yaml
Normal file
48
infrastructure/addons/rancher/helmrelease-rancher.yaml
Normal file
@@ -0,0 +1,48 @@
|
||||
# Flux HelmRelease for Rancher, deployed into cattle-system from the
# rancher-stable HelmRepository.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  namespace: flux-system
  name: rancher
spec:
  targetNamespace: cattle-system
  interval: 10m
  chart:
    spec:
      sourceRef: {kind: HelmRepository, name: rancher-stable, namespace: flux-system}
      chart: rancher
      version: "2.13.3"
  install:
    createNamespace: true
    remediation: {retries: 3}
  upgrade:
    remediation: {retries: 3}
  # The bootstrap password is merged in from a Secret at values path
  # `bootstrapPassword`, so it never appears inline in this manifest.
  valuesFrom:
    - kind: Secret
      name: rancher-bootstrap-password
      valuesKey: bootstrapPassword
      targetPath: bootstrapPassword
  values:
    hostname: rancher.silverside-gopher.ts.net
    replicas: 1
    extraEnv:
      - {name: CATTLE_PROMETHEUS_METRICS, value: "true"}
    resources:
      requests: {cpu: 500m, memory: 512Mi}
      limits: {cpu: 1000m, memory: 1Gi}
    # Keep Rancher off nodes carrying the control-plane role label.
    # NOTE(review): confirm that chart version 2.13.3 actually consumes a
    # top-level `affinity` value; if it only honours `extraNodeSelectorTerms`,
    # this stanza is silently ignored - verify against the chart's values.yaml.
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - {key: node-role.kubernetes.io/control-plane, operator: DoesNotExist}
|
||||
@@ -0,0 +1,8 @@
|
||||
# Flux HelmRepository `rancher-stable`, the chart source referenced by the
# rancher HelmRelease. Index is refreshed hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  namespace: flux-system
  name: rancher-stable
spec:
  url: https://releases.rancher.com/server-charts/stable
  interval: 1h
|
||||
9
infrastructure/addons/rancher/kustomization.yaml
Normal file
9
infrastructure/addons/rancher/kustomization.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# Kustomize entry point for the rancher addon.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources:
  # Namespace, chart source and release
  - namespace.yaml
  - helmrepository-rancher.yaml
  - helmrelease-rancher.yaml
  # Bootstrap-password ExternalSecret manifests
  - rancher-bootstrap-password-flux-externalsecret.yaml
  - rancher-bootstrap-password-externalsecret.yaml
  # Tailscale LoadBalancer Service in front of Rancher
  - rancher-tailscale-service.yaml
|
||||
4
infrastructure/addons/rancher/namespace.yaml
Normal file
4
infrastructure/addons/rancher/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
# Namespace targeted by the rancher HelmRelease.
kind: Namespace
apiVersion: v1
metadata: {name: cattle-system}
|
||||
@@ -0,0 +1,21 @@
|
||||
# ExternalSecret that materialises the Secret `rancher-bootstrap-password`
# in cattle-system from the remote key RANCHER_BOOTSTRAP_PASSWORD.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  namespace: cattle-system
  name: rancher-bootstrap-password
spec:
  secretStoreRef: {kind: ClusterSecretStore, name: doppler-hetznerterra}
  refreshInterval: 1h
  # Fetch the remote value under the local key `rancherBootstrapPassword` ...
  data:
    - secretKey: rancherBootstrapPassword
      remoteRef: {key: RANCHER_BOOTSTRAP_PASSWORD}
  # ... and template it into the Secret under the key `bootstrapPassword`.
  target:
    name: rancher-bootstrap-password
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        bootstrapPassword: "{{ .rancherBootstrapPassword }}"
|
||||
@@ -0,0 +1,21 @@
|
||||
# ExternalSecret that materialises the Secret `rancher-bootstrap-password`
# in flux-system. The rancher HelmRelease (also in flux-system) lists a Secret
# of this name and key under `valuesFrom`.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  namespace: flux-system
  name: rancher-bootstrap-password
spec:
  secretStoreRef: {kind: ClusterSecretStore, name: doppler-hetznerterra}
  refreshInterval: 1h
  # Fetch the remote value under the local key RANCHER_BOOTSTRAP_PASSWORD ...
  data:
    - secretKey: RANCHER_BOOTSTRAP_PASSWORD
      remoteRef: {key: RANCHER_BOOTSTRAP_PASSWORD}
  # ... and template it into the Secret under the key `bootstrapPassword`.
  target:
    name: rancher-bootstrap-password
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        bootstrapPassword: "{{ .RANCHER_BOOTSTRAP_PASSWORD }}"
|
||||
22
infrastructure/addons/rancher/rancher-tailscale-service.yaml
Normal file
22
infrastructure/addons/rancher/rancher-tailscale-service.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
# LoadBalancer Service of class `tailscale` that forwards ports 80 and 443 to
# pods labelled app=cattle-system-rancher.
apiVersion: v1
kind: Service
metadata:
  namespace: cattle-system
  name: rancher-tailscale
  annotations:
    tailscale.com/proxy-class: infra-stable
    tailscale.com/hostname: rancher
spec:
  loadBalancerClass: tailscale
  type: LoadBalancer
  selector: {app: cattle-system-rancher}
  ports:
    - {name: http, protocol: TCP, port: 80, targetPort: 80}
    - {name: https, protocol: TCP, port: 443, targetPort: 443}
|
||||
38
infrastructure/addons/traefik/helmrelease-traefik.yaml
Normal file
38
infrastructure/addons/traefik/helmrelease-traefik.yaml
Normal file
@@ -0,0 +1,38 @@
|
||||
# Flux HelmRelease for Traefik, deployed into kube-system and exposed through
# a NodePort Service with pinned node ports.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  namespace: flux-system
  name: traefik
spec:
  targetNamespace: kube-system
  interval: 10m
  chart:
    spec:
      sourceRef: {kind: HelmRepository, name: traefik, namespace: flux-system}
      chart: traefik
      version: "39.0.0"
  install:
    createNamespace: true
    remediation: {retries: 3}
  upgrade:
    remediation: {retries: 3}
  values:
    service:
      type: NodePort
    ports:
      # Fixed node ports for the web and websecure entry points.
      web: {nodePort: 31097}
      websecure: {nodePort: 30193}
      # Additional `rancher` port on 9442.
      rancher: {port: 9442, exposedPort: 9442, protocol: TCP}
    # Extra entry points passed straight to the Traefik binary.
    additionalArguments:
      - "--entryPoints.flux.address=:9001/tcp"
      - "--entryPoints.rancher.address=:9442/tcp"
|
||||
@@ -0,0 +1,9 @@
|
||||
# Flux HelmRepository `traefik`, the chart source referenced by the traefik
# HelmRelease. Index is refreshed every 10 minutes.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  namespace: flux-system
  name: traefik
spec:
  provider: generic
  url: https://traefik.github.io/charts
  interval: 10m
|
||||
5
infrastructure/addons/traefik/kustomization.yaml
Normal file
5
infrastructure/addons/traefik/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
# Kustomize entry point for the traefik addon: chart source plus release.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources:
  - helmrepository-traefik.yaml
  - helmrelease-traefik.yaml
|
||||
33
scripts/refresh-kubeconfig.sh
Executable file
33
scripts/refresh-kubeconfig.sh
Executable file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash
#
# refresh-kubeconfig.sh - fetch the k3s kubeconfig from the first control-plane
# node and store it at <repo>/outputs/kubeconfig, with the loopback API address
# rewritten to the node's public IP.
#
# Usage:
#   scripts/refresh-kubeconfig.sh [control-plane-1-public-ip]
#
# If no IP is given, the first IPv4 address found within two lines after the
# [control_plane] header of ansible/inventory.ini is used.
#
# Environment:
#   SSH_KEY  private key used for the SSH connection (default: ~/.ssh/infra)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
KUBECONFIG_PATH="$REPO_ROOT/outputs/kubeconfig"
INVENTORY_PATH="$REPO_ROOT/ansible/inventory.ini"
SSH_KEY="${SSH_KEY:-$HOME/.ssh/infra}"

CP1_PUBLIC_IP="${1:-}"

if [ -z "$CP1_PUBLIC_IP" ] && [ -f "$INVENTORY_PATH" ]; then
  # `|| true`: under `set -e -o pipefail` a grep that matches nothing would
  # otherwise abort the script here, before the usage message below is shown.
  # POSIX ERE (-E) is used instead of -P so this also works without GNU grep.
  CP1_PUBLIC_IP="$(grep -A2 '\[control_plane\]' "$INVENTORY_PATH" \
    | grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' \
    | head -1 || true)"
fi

if [ -z "$CP1_PUBLIC_IP" ]; then
  echo "Usage: $0 <control-plane-1-public-ip>" >&2
  echo "  Or ensure ansible/inventory.ini exists with control plane IPs." >&2
  exit 1
fi

# Download into a private temp file next to the destination and move it into
# place only on success, so a failed SSH never truncates a working kubeconfig
# and the credentials are never readable by other users, even briefly.
mkdir -p "$(dirname "$KUBECONFIG_PATH")"
umask 077
TMP_KUBECONFIG="$(mktemp "$KUBECONFIG_PATH.XXXXXX")"
trap 'rm -f "$TMP_KUBECONFIG"' EXIT

echo "Fetching kubeconfig from $CP1_PUBLIC_IP ..."
# NOTE(review): host-key verification is disabled, as in the original script.
# That is convenient for frequently re-created servers but offers no
# protection against a man-in-the-middle; reconsider if the nodes are stable.
ssh -i "$SSH_KEY" \
  -o StrictHostKeyChecking=no \
  -o UserKnownHostsFile=/dev/null \
  "root@$CP1_PUBLIC_IP" "cat /etc/rancher/k3s/k3s.yaml" \
  | sed "s/127\.0\.0\.1/$CP1_PUBLIC_IP/g" \
  > "$TMP_KUBECONFIG"

if [ ! -s "$TMP_KUBECONFIG" ]; then
  echo "Error: received an empty kubeconfig from $CP1_PUBLIC_IP" >&2
  exit 1
fi

chmod 600 "$TMP_KUBECONFIG"
mv -f "$TMP_KUBECONFIG" "$KUBECONFIG_PATH"

echo "Kubeconfig saved to $KUBECONFIG_PATH"
echo "Run: export KUBECONFIG=$KUBECONFIG_PATH"
|
||||
@@ -89,6 +89,22 @@ resource "hcloud_firewall" "cluster" {
|
||||
}
|
||||
}
|
||||
|
||||
rule {
|
||||
description = "HTTP from Load Balancer"
|
||||
direction = "in"
|
||||
protocol = "tcp"
|
||||
port = "80"
|
||||
source_ips = ["0.0.0.0/0"]
|
||||
}
|
||||
|
||||
rule {
|
||||
description = "HTTPS from Load Balancer"
|
||||
direction = "in"
|
||||
protocol = "tcp"
|
||||
port = "443"
|
||||
source_ips = ["0.0.0.0/0"]
|
||||
}
|
||||
|
||||
rule {
|
||||
description = "ICMP"
|
||||
direction = "in"
|
||||
|
||||
50
terraform/loadbalancer.tf
Normal file
50
terraform/loadbalancer.tf
Normal file
@@ -0,0 +1,50 @@
|
||||
# Load Balancer for Kubernetes API High Availability
|
||||
# Provides a single endpoint for all control planes
|
||||
|
||||
# Hetzner Cloud load balancer named "<cluster_name>-api".
resource "hcloud_load_balancer" "kube_api" {
  name     = "${var.cluster_name}-api"
  location = var.location

  # Original author's note: cheapest tier, €5.39/month (verify current pricing).
  load_balancer_type = "lb11"

  labels = {
    role    = "kube-api"
    cluster = var.cluster_name
  }
}
|
||||
|
||||
# Attach Load Balancer to private network (required for use_private_ip)
|
||||
# Gives the load balancer a fixed address on the cluster network: host number 5
# of var.subnet_cidr.
resource "hcloud_load_balancer_network" "kube_api" {
  network_id       = hcloud_network.cluster.id
  load_balancer_id = hcloud_load_balancer.kube_api.id

  # 10.0.1.5 per the original comment (assumes subnet_cidr is 10.0.1.0/24 - confirm).
  ip = cidrhost(var.subnet_cidr, 5)
}
|
||||
|
||||
# Attach all control plane servers as targets
|
||||
# One server target per control-plane node, addressed via its private IP.
resource "hcloud_load_balancer_target" "kube_api_targets" {
  count = var.control_plane_count

  load_balancer_id = hcloud_load_balancer.kube_api.id
  type             = "server"
  server_id        = hcloud_server.control_plane[count.index].id
  use_private_ip   = true

  # The load balancer must be attached to the network before targets that use
  # private IPs are added.
  depends_on = [
    hcloud_load_balancer_network.kube_api,
    hcloud_server.control_plane,
  ]
}
|
||||
|
||||
# Kubernetes API service on port 6443
|
||||
# TCP pass-through on 6443 (listen and destination), with a TCP health check
# on the same port.
resource "hcloud_load_balancer_service" "kube_api" {
  load_balancer_id = hcloud_load_balancer.kube_api.id

  protocol         = "tcp"
  listen_port      = 6443
  destination_port = 6443

  health_check {
    protocol = "tcp"
    port     = 6443
    retries  = 3
    interval = 15
    timeout  = 10
  }
}
|
||||
|
||||
# Firewall rule to allow LB access to control planes on 6443
|
||||
# NOTE: the rule itself is not defined in this file; it belongs in the existing cluster firewall (hcloud_firewall.cluster).
|
||||
@@ -63,3 +63,8 @@ output "kubeconfig_command" {
|
||||
description = "Command to fetch kubeconfig"
|
||||
value = "ssh root@${hcloud_server.control_plane[0].ipv4_address} 'cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${hcloud_server.control_plane[0].ipv4_address}/g' kubeconfig"
|
||||
}
|
||||
|
||||
# Exposes the IP assigned to the load balancer's network attachment.
output "kube_api_lb_ip" {
  value       = hcloud_load_balancer_network.kube_api.ip
  description = "Load Balancer private IP for Kubernetes API (used for cluster joins)"
}
|
||||
|
||||
@@ -25,7 +25,7 @@ variable "cluster_name" {
|
||||
variable "control_plane_count" {
|
||||
description = "Number of control plane nodes"
|
||||
type = number
|
||||
default = 1
|
||||
default = 3
|
||||
}
|
||||
|
||||
variable "control_plane_type" {
|
||||
@@ -37,7 +37,7 @@ variable "control_plane_type" {
|
||||
variable "worker_count" {
|
||||
description = "Number of worker nodes"
|
||||
type = number
|
||||
default = 2
|
||||
default = 3
|
||||
}
|
||||
|
||||
variable "worker_type" {
|
||||
|
||||
Reference in New Issue
Block a user