Compare commits
115 Commits
| SHA1 |
|---|
| e9327b0c61 |
| cf49f8bf03 |
| d57e8c8fe8 |
| 93a2a42917 |
| 5cf68771dd |
| 6d6e3e8371 |
| 353a408dac |
| b3612083ad |
| 8c0dbd997d |
| 3a975a323c |
| d126de4dc4 |
| a33a993867 |
| f52e657f9f |
| f49b08f50c |
| 327bb860b7 |
| fd5451a5ef |
| 7333cb2780 |
| feecf97cd5 |
| b5bcec2663 |
| 0ad56405ee |
| d050e8962a |
| d925eeac3f |
| 2bde45e106 |
| 50752ca4b0 |
| a2ed9555c0 |
| 14462dd870 |
| 0625eee297 |
| 2dc4ab6329 |
| bbec0dfff4 |
| 6de826e030 |
| bdba2b7af2 |
| 499a3462e7 |
| daf6ccd0e4 |
| a6a630000a |
| ff9e58d44f |
| 8b94e4dd06 |
| 547a29e000 |
| 760f0482d4 |
| 440e268e4f |
| 24851f5a9b |
| ded8efe7fb |
| c10646d228 |
| 50d97209e6 |
| 46b2ff7d19 |
| a4f1d179e9 |
| 9879de5a86 |
| 195e9bce25 |
| 4796606432 |
| b1eab6a0fa |
| f3c96b65d2 |
| c7a375758f |
| d0be48b65c |
| 40647318b4 |
| cdb26904d2 |
| 3c06e046c2 |
| 17f1815e7f |
| 66e86e55ea |
| 43df412243 |
| 383ef9e9ac |
| 18abc5073b |
| f8da2594ca |
| e0359f0097 |
| 003333a061 |
| a6071c504b |
| 08123457f1 |
| 757d88ed52 |
| 15defc686f |
| abb7578328 |
| bc87a7ca43 |
| 045880bdd6 |
| bfcf57bcc5 |
| 7e3ebec95b |
| 0c31c3b1d5 |
| 5523feb563 |
| cafa2fa0b3 |
| a7fd4c0b97 |
| e56a3a6c38 |
| 7b2eca07ab |
| 347ca041ba |
| 3f52bad854 |
| c89c31adea |
| 68b293efe4 |
| 1f465cc0c1 |
| 6e22bd26b3 |
| 869880c152 |
| 31e95eb227 |
| 12675417bd |
| 8e081ddfda |
| 4b7517c9c5 |
| f9bc53723f |
| ee6417c18e |
| 1156dc0203 |
| 4151027e01 |
| 9269e9df1b |
| d9374bc209 |
| c570a476b5 |
| a7f11ccf94 |
| a7d540ca65 |
| 098bd98876 |
| 55d7b8201e |
| 9c0523e880 |
| 8372d562ad |
| 1bb11dfe3a |
| 624cd5aab6 |
| 71bdc6a709 |
| 714f20417b |
| c32bec34bc |
| 6519a7673d |
| d1c31cdb91 |
| b3e88712bd |
| 06366ee5e6 |
| 9a2d213114 |
| 9482a0f551 |
| 5c53b8e06e |
| b1dae28aa5 |
@@ -7,22 +7,28 @@ on:
    paths:
      - "ansible/dashboards.yml"
      - "ansible/roles/observability-content/**"
      - ".gitea/workflows/dashboards.yml"
  workflow_dispatch:

concurrency:
  group: prod-cluster
  cancel-in-progress: false

env:
-  TF_VERSION: "1.7.0"
-  TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
+  TF_VERSION: "1.14.9"
   TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
   TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
   TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
   TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
   TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
+  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
+  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
+  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
+  TF_VAR_proxmox_insecure: "true"

jobs:
  dashboards:
    name: Grafana Content
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

@@ -31,6 +37,7 @@ jobs:
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
+          terraform_wrapper: false

      - name: Setup SSH Keys
        run: |

@@ -44,6 +51,7 @@ jobs:
        working-directory: terraform
        run: |
          terraform init \
+            -lockfile=readonly \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \

@@ -51,29 +59,10 @@ jobs:
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

-      - name: Detect runner egress IP
-        run: |
-          RUNNER_IP=$(curl -fsSL https://api.ipify.org)
-          echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
-          echo "Runner egress IP: ${RUNNER_IP}"
-
-      - name: Open SSH/API for current runner CIDR
-        working-directory: terraform
-        run: |
-          terraform apply \
-            -refresh=false \
-            -target=hcloud_firewall.cluster \
-            -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-            -var="allowed_ssh_ips=${RUNNER_CIDR}" \
-            -var="allowed_api_ips=${RUNNER_CIDR}" \
-            -auto-approve

      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
-          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
+          pip3 install ansible==8.7.0 kubernetes==26.1.0 jinja2==3.1.5 pyyaml==6.0.2

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml
+803 −209 File diff suppressed because it is too large

+44 −126
@@ -8,109 +8,28 @@ on:
      required: true
      default: ''

concurrency:
  group: prod-cluster
  cancel-in-progress: false

env:
-  TF_VERSION: "1.7.0"
-  TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
+  TF_VERSION: "1.14.9"
   TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
   TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
   TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
   TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
   TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
-  B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
-  B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
+  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
+  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
+  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
+  TF_VAR_proxmox_insecure: "true"

jobs:
-  pre-destroy-backup:
-    name: Pre-Destroy Backup
-    runs-on: ubuntu-latest
-    if: github.event.inputs.confirm == 'destroy'
-    environment: destroy
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Terraform
-        uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: ${{ env.TF_VERSION }}
-
-      - name: Terraform Init
-        working-directory: terraform
-        run: |
-          terraform init \
-            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-            -backend-config="region=auto" \
-            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-            -backend-config="skip_requesting_account_id=true"
-
-      - name: Setup SSH Keys
-        run: |
-          mkdir -p ~/.ssh
-          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
-          chmod 600 ~/.ssh/id_ed25519
-          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
-          chmod 644 ~/.ssh/id_ed25519.pub
-
-      - name: Get Control Plane IP
-        id: cp_ip
-        working-directory: terraform
-        run: |
-          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
-          echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
-
-      - name: Pre-Destroy pg_dump to B2
-        run: |
-          set +e
-          echo "Attempting pre-destroy backup to B2..."
-          ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
-          set -e
-          # Check if kubectl is available and cluster is up
-          if ! command -v kubectl &> /dev/null; then
-            echo "kubectl not found, skipping pre-destroy backup"
-            exit 0
-          fi
-
-          # Check if we can reach the cluster
-          if ! kubectl cluster-info &> /dev/null; then
-            echo "Cannot reach cluster, skipping pre-destroy backup"
-            exit 0
-          fi
-
-          # Check if CNP is deployed
-          if ! kubectl get namespace cnpg-cluster &> /dev/null; then
-            echo "CNP namespace not found, skipping pre-destroy backup"
-            exit 0
-          fi
-
-          # Run backup using the pgdump image directly
-          BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
-          B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
-          B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
-
-          if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
-            echo "B2 credentials not found in secret, skipping pre-destroy backup"
-            exit 0
-          fi
-
-          kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \
-            -n cnpg-cluster --dry-run=client -o yaml | \
-            kubectl apply -f -
-
-          echo "Waiting for backup job to complete..."
-          kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
-          kubectl logs job/pgdump-manual -n cnpg-cluster || true
-          kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
-          EOF
-          echo "Pre-destroy backup step completed (failure is non-fatal)"
-
  destroy:
    name: Destroy Cluster
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
    if: github.event.inputs.confirm == 'destroy'
    environment: destroy
-    needs: pre-destroy-backup
    steps:
      - name: Checkout
        uses: actions/checkout@v4

@@ -119,17 +38,7 @@ jobs:
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
-
-      - name: Terraform Init
-        working-directory: terraform
-        run: |
-          terraform init \
-            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-            -backend-config="region=auto" \
-            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-            -backend-config="skip_requesting_account_id=true"
+          terraform_wrapper: false

      - name: Setup SSH Keys
        run: |

@@ -139,10 +48,30 @@ jobs:
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

-      - name: Install jq
+      - name: Terraform Init
+        working-directory: terraform
        run: |
-          apt-get update
-          apt-get install -y jq
+          terraform init \
+            -lockfile=readonly \
+            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
+            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
+            -backend-config="region=auto" \
+            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
+            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
+            -backend-config="skip_requesting_account_id=true"
+
+      - name: Save Proxmox target list
+        run: |
+          mkdir -p outputs
+          if ! terraform -chdir=terraform output -json proxmox_target_vms > outputs/proxmox_target_vms.json; then
+            terraform -chdir=terraform plan \
+              -refresh=false \
+              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
+              -out=cleanup.tfplan \
+              -no-color || true
+            printf '[]' > outputs/proxmox_target_vms.json
+          fi

      - name: Terraform Destroy
        id: destroy

@@ -152,7 +81,7 @@ jobs:
          for attempt in 1 2 3; do
            echo "Terraform destroy attempt ${attempt}/3"
            terraform destroy \
-              -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
+              -parallelism=2 \
              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
              -auto-approve

@@ -164,32 +93,21 @@ jobs:
            echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
            sleep 30
            terraform refresh \
-              -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
              -var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
          fi
        done
        exit "$rc"

-      - name: Hetzner destroy diagnostics
-        if: failure() && steps.destroy.outcome == 'failure'
-        env:
-          HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
-        run: |
-          set +e
-          echo "== Terraform state list =="
-          terraform -chdir=terraform state list || true
-
-          network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}')
-          if [ -z "$network_id" ]; then
-            network_id="11988935"
-          fi
-
-          echo "== Hetzner network =="
-          curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
-
-          echo "== Hetzner servers attached to network =="
-          curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
-
-          echo "== Hetzner load balancers attached to network =="
-          curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
+      - name: Verify Proxmox target VMs removed
+        if: success()
+        run: |
+          python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --targets-file outputs/proxmox_target_vms.json
+          if [ -f terraform/cleanup.tfplan ]; then
+            python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --terraform-dir terraform --plan cleanup.tfplan
+          fi
+
+      - name: Terraform state diagnostics
+        if: failure() && steps.destroy.outcome == 'failure'
+        run: |
+          terraform -chdir=terraform state list || true
||||
@@ -3,7 +3,6 @@
*.tfstate.*
*.tfstate.backup
.terraform/
-.terraform.lock.hcl
terraform.tfvars
crash.log
override.tf
||||
@@ -1,48 +1,57 @@
# AGENTS.md

-Repository guide for OpenCode sessions in this repo.
+Compact repo guidance for OpenCode sessions. Trust executable sources over docs when they conflict.

## Read First

-- Trust manifests and workflows over prose when they conflict.
-- Highest-value sources: `terraform/main.tf`, `terraform/variables.tf`, `ansible/site.yml`, `clusters/prod/flux-system/`, `infrastructure/addons/kustomization.yaml`, `.gitea/workflows/deploy.yml`, `.gitea/workflows/destroy.yml`, `README.md`, `STABLE_BASELINE.md`, `scripts/refresh-kubeconfig.sh`, `scripts/smoke-check-tailnet-services.sh`.
+- Highest-value sources: `.gitea/workflows/deploy.yml`, `.gitea/workflows/destroy.yml`, `terraform/main.tf`, `terraform/variables.tf`, `terraform/servers.tf`, `ansible/site.yml`, `ansible/inventory.tmpl`, `clusters/prod/flux-system/`, `infrastructure/addons/kustomization.yaml`.
+- `STABLE_BASELINE.md` still contains stale Rancher backup/restore references; current workflows and addon manifests do not deploy or restore `rancher-backup`.

-## Current Baseline
+## Baseline

-- HA private cluster: 3 control planes, 3 workers.
-- Tailscale is the private access path for Rancher and shared services.
-- Rancher, Grafana, and Prometheus are exposed through Tailscale; Flux UI / Weave GitOps is removed.
-- `apps/` is suspended by default.
-- Rancher stores state in embedded etcd; backup/restore uses `rancher-backup` to B2.
+- Proxmox HA K3s cluster: 3 control planes, 5 workers, VMIDs `200-202` and `210-214`, node `flex`, template VMID `9000`, datastore `Flash`.
+- API HA is kube-vip at `10.27.27.40`; control planes are `10.27.27.30-32`, workers are `10.27.27.41-45`.
+- SSH user is `ubuntu`; Ansible derives the flannel iface from `ansible_default_ipv4.interface` with `eth0` fallback, so do not hard-code `ens18`.
+- Storage is raw-manifest `nfs-subdir-external-provisioner` using `10.27.27.239:/TheFlash/k8s-nfs` and default StorageClass `flash-nfs`.
+- Tailscale is the private access path. Rancher, Grafana, and Prometheus are exposed only through Tailscale services.
+- `apps` is intentionally suspended in `clusters/prod/flux-system/kustomization-apps.yaml`.

-## Common Commands
+## Commands

-- Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars`
-- Ansible: `ansible-galaxy collection install -r ansible/requirements.yml`, `cd ansible && python3 generate_inventory.py`, `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`, `ansible-playbook ansible/site.yml`
-- Flux/Kustomize: `kubectl kustomize infrastructure/addons/<addon>`, `kubectl kustomize clusters/prod/flux-system`
-- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-public-ip>`
-- Tailnet smoke check: `ssh root@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh`
+- Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars`.
+- Ansible setup: `ansible-galaxy collection install -r ansible/requirements.yml`, then from `ansible/` run `python3 generate_inventory.py` and `ansible-playbook site.yml --syntax-check`.
+- Flux/Kustomize checks: `kubectl kustomize infrastructure/addons/<addon>`, `kubectl kustomize infrastructure/addons`, `kubectl kustomize clusters/prod/flux-system`.
+- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-ip>`; use this if local `kubectl` falls back to `localhost:8080` after rebuilds.
+- Tailnet smoke check from cp1: `ssh ubuntu@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh`.
+- Fast Grafana content iteration uses `.gitea/workflows/dashboards.yml` and `ansible/dashboards.yml`, not a full cluster rebuild.

-## Workflow Rules
+## Deploy Flow

-- Keep diffs small and validate only the directory you edited.
-- Update manifests and docs together when behavior changes.
-- Use `set -euo pipefail` in workflow shell blocks.
-- CI deploy order is Terraform -> Ansible -> Flux bootstrap -> Rancher restore -> health checks.
-- One object per Kubernetes YAML file; keep filenames kebab-case.
-- If `kubectl` points at `localhost:8080` after a rebuild, refresh kubeconfig from the primary control-plane IP.
+- Pushes to `main` run Gitea CI: Terraform fmt/init/validate/plan/apply, Proxmox cleanup/retry, Ansible bootstrap, Flux bootstrap, addon gates, Rancher gate, observability image seeding, health checks, tailnet smoke checks.
+- Deploy and destroy workflows share `concurrency.group: prod-cluster`; destroy only requires workflow input `confirm: destroy` and has no backup gate.
+- Keep `set -euo pipefail` in workflow shell blocks.
+- Terraform retry cleanup has hard-coded target VMIDs/names in `.gitea/workflows/deploy.yml`; update it when changing node counts, names, or VMIDs.
+- Fresh VMs have unreliable registry/chart egress, so critical images are prepared by `skopeo` on the runner and imported with `k3s ctr`; update the workflow archive lists when adding bootstrap-time images.
+- CI applies `clusters/prod/flux-system/gotk-components.yaml` directly and then patches Flux controller deployments inline; changes only in `gotk-controller-cp1-patches.yaml` do not affect CI bootstrap.

-## Repo-Specific Gotchas
+## GitOps Addons

-- `rancher-backup` uses a postRenderer to swap the broken hook image to `rancher/kubectl:v1.34.0`; do not put S3 config in HelmRelease values. Put it in the Backup CR.
-- Tailscale cleanup only runs before service proxies exist; it removes stale offline `rancher`/`grafana`/`prometheus`/`flux` devices, then must stop so live proxies are not deleted.
-- Keep the Tailscale operator on the stable Helm repo `https://pkgs.tailscale.com/helmcharts` at `1.96.5` unless you have a reason to change it.
-- Current private URLs:
-  - Rancher: `https://rancher.silverside-gopher.ts.net/`
-  - Grafana: `http://grafana.silverside-gopher.ts.net/`
-  - Prometheus: `http://prometheus.silverside-gopher.ts.net:9090/`
+- Vendored charts are intentional: `infrastructure/charts/{cert-manager,traefik,kube-prometheus-stack,tailscale-operator,rancher}`. Do not restore remote `HelmRepository` objects unless cluster-side chart fetch reliability is intentionally changed.
+- External Secrets and Loki/Promtail use Flux `OCIRepository`; Rancher, Tailscale, cert-manager, Traefik, and kube-prometheus-stack use `GitRepository` chart paths.
+- Use fully qualified `helmchart.source.toolkit.fluxcd.io/...` in scripts; K3s also has `helmcharts.helm.cattle.io`, so `helmchart/...` can target the wrong resource (see the check below).
+- `doppler-bootstrap` only creates the `external-secrets` namespace and Doppler token secret. The deploy workflow creates `ClusterSecretStore/doppler-hetznerterra` after ESO CRDs and webhook endpoints exist.
+- The checked-in `infrastructure/addons/external-secrets/clustersecretstore-doppler-hetznerterra.yaml` is not included by that addon kustomization; do not assume Flux applies it.
+- Keep Kubernetes manifests one object per file with kebab-case filenames.
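A quick way to see the ambiguity behind that bullet (plain `kubectl` checks; output depends on the live cluster):

```bash
# Flux's HelmChart CRD, fully qualified -- always unambiguous
kubectl get helmchart.source.toolkit.fluxcd.io -A

# K3s's built-in Helm controller registers a CRD with the same short name
kubectl get helmcharts.helm.cattle.io -A

# Short form -- may resolve to either API group depending on discovery order
kubectl get helmchart -A
```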

+## Gotchas
+
+- Rancher chart `2.13.3` requires Kubernetes `<1.35.0-0`; K3s `latest` can break Rancher. Role defaults pin `v1.34.6+k3s1`; do not reintroduce a generated-inventory `k3s_version=latest` override.
+- The repo no longer uses a cloud controller manager. `providerID`, Hetzner CCM/CSI, or Hetzner firewall/load-balancer logic is stale.
+- Tailscale cleanup must only remove stale offline reserved hostnames before live service proxies exist; do not delete active `rancher`, `grafana`, `prometheus`, or `flux` devices.
+- Proxmox endpoint should be the base URL, for example `https://100.105.0.115:8006/`; provider/workflow code strips `/api2/json` when needed.
+- Current private URLs: Rancher `https://rancher.silverside-gopher.ts.net/`, Grafana `http://grafana.silverside-gopher.ts.net/`, Prometheus `http://prometheus.silverside-gopher.ts.net:9090/`.

## Secrets

-- Runtime secrets live in Doppler + External Secrets.
-- Bootstrap and CI secrets stay in Gitea; never commit secrets, kubeconfigs, or private keys.
+- Runtime secrets are Doppler + External Secrets; Terraform/bootstrap/CI secrets stay in Gitea Actions secrets.
+- Never commit secrets, kubeconfigs, private keys, `terraform.tfvars`, or generated `outputs/` artifacts.
@@ -0,0 +1,287 @@
# App Repo Deployment Guide

This guide explains the recommended way to deploy an application to this cluster.

## Recommended Model

Use two repos:

- `HetznerTerra` (this repo): cluster, addons, shared infrastructure, Flux wiring
- `your-app-repo`: application source, Dockerfile, CI, Kubernetes manifests or Helm chart

Why:

- cluster lifecycle stays separate from app code
- app CI can build and tag images independently
- this repo remains the source of truth for what the cluster is allowed to deploy

## Current Cluster Assumptions

- Flux is already installed and reconciles this repo from `main`
- `clusters/prod/flux-system/kustomization-apps.yaml` points at `./apps`
- `apps` is suspended by default
- private access is through Tailscale
- runtime secrets should come from Doppler via External Secrets

## Deployment Options

### Option A: Separate app repo

Recommended for most real applications.

Flow:

1. App repo builds and pushes an image.
2. This repo defines a `GitRepository` pointing at the app repo.
3. This repo defines a `Kustomization` pointing at a path in the app repo.
4. Flux pulls the app repo and applies the manifests.

### Option B: In-repo app manifests

Only use this when the application is tiny or tightly coupled to the platform.

Flow:

1. Put Kubernetes manifests directly under `apps/` in this repo.
2. Unsuspend the top-level `apps` Kustomization.

This is simpler, but mixes platform and app changes together.

## App Repo Structure

Suggested layout:

```text
your-app-repo/
├── src/
├── Dockerfile
├── .gitea/workflows/
└── deploy/
    ├── base/
    │   ├── namespace.yaml
    │   ├── deployment.yaml
    │   ├── service.yaml
    │   ├── externalsecret.yaml
    │   └── kustomization.yaml
    └── prod/
        ├── kustomization.yaml
        └── patch-*.yaml
```

If you prefer Helm, replace `deploy/base` and `deploy/prod` with a chart path and point Flux at that instead.
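For the Helm variant, the attachment in this repo would be a `HelmRelease` instead of a path-based `Kustomization`. A minimal sketch, assuming the chart lives at `deploy/chart` in the app repo (the path and names are illustrative, not existing files):

```yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: my-app
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: my-app
  chart:
    spec:
      chart: ./deploy/chart  # chart directory inside the app repo (assumed layout)
      sourceRef:
        kind: GitRepository
        name: my-app
```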

## What the App Repo Should Own

- application source code
- image build pipeline
- image tag strategy
- Deployment / Service / Ingress or Tailscale-facing Service manifests
- app-specific `ExternalSecret` manifests
- app-specific namespace

## What This Repo Should Own

- cluster-level permission to deploy the app
- the `GitRepository` and top-level `Kustomization` that attach the app repo to the cluster
- whether the `apps` layer is suspended or active

## Recommended First App Integration

In this repo, add Flux objects under `apps/` that point to the app repo.

Example files to add:

- `apps/gitrepository-my-app.yaml`
- `apps/kustomization-my-app.yaml`
- update `apps/kustomization.yaml`

Example `apps/gitrepository-my-app.yaml`:

```yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: my-app
  namespace: flux-system
spec:
  interval: 1m
  ref:
    branch: main
  secretRef:
    name: flux-system
  url: ssh://git@<your-git-host>:<port>/<org>/<your-app-repo>.git
```

Example `apps/kustomization-my-app.yaml`:

```yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: my-app
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: my-app
  path: ./deploy/prod
  wait: true
  timeout: 5m
  dependsOn:
    - name: infrastructure
```

Then update `apps/kustomization.yaml`:

```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - gitrepository-my-app.yaml
  - kustomization-my-app.yaml
```
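After committing these files, you can trigger reconciliation immediately instead of waiting for the intervals (standard `flux` CLI commands, run from a machine with cluster access):

```bash
flux reconcile source git my-app -n flux-system
flux reconcile kustomization my-app -n flux-system
```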

## App Secrets

Recommended path:

1. Put runtime values in Doppler.
2. In the app manifests, create an `ExternalSecret` that reads from `doppler-hetznerterra`.
3. Reference the resulting Kubernetes Secret from the Deployment (see the snippet after the example below).

Example app-side `ExternalSecret`:

```yaml
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: my-app-env
  namespace: my-app
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: doppler-hetznerterra
    kind: ClusterSecretStore
  target:
    name: my-app-env
    creationPolicy: Owner
  data:
    - secretKey: DATABASE_URL
      remoteRef:
        key: MY_APP_DATABASE_URL
```
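On the Deployment side, step 3 is then just an `envFrom` reference to the generated Secret. A sketch with placeholder image and container names:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-app
  namespace: my-app
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: my-app
  template:
    metadata:
      labels:
        app.kubernetes.io/name: my-app
    spec:
      containers:
        - name: my-app
          image: registry.example.com/my-app:v0.1.0  # placeholder
          envFrom:
            - secretRef:
                name: my-app-env  # Secret materialized by the ExternalSecret above
```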

## Image Delivery

Recommended flow:

1. App repo CI builds a container image.
2. CI pushes it to a registry.
3. The app repo updates the Kubernetes image tag in `deploy/prod` (see the kustomize sketch below).
4. Flux notices the Git change and deploys it.

Keep the first version simple. Do not add image automation until the basic deploy path is proven.
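One low-tooling way to implement step 3 is the kustomize `images` transformer in `deploy/prod/kustomization.yaml`, so CI only rewrites a tag. A sketch with placeholder registry and tag:

```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../base
images:
  - name: registry.example.com/my-app  # must match the image name used in base
    newTag: v0.1.0                     # CI bumps this on each release
```

CI can bump it with `kustomize edit set image registry.example.com/my-app:v0.2.0` run from `deploy/prod`, then commit the change.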

## Exposing the App

Pick one:

### Private app over Tailscale

Best fit for this cluster right now.

Create a Service like the existing Rancher/Grafana/Prometheus pattern:

```yaml
apiVersion: v1
kind: Service
metadata:
  name: my-app-tailscale
  namespace: my-app
  annotations:
    tailscale.com/hostname: my-app
    tailscale.com/tags: "tag:prod"
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: my-app
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 3000
```

Use `http://my-app.<your-tailnet>` or your chosen hostname.

### Cluster-internal only

Create only a `ClusterIP` Service.

### Public ingress

Not recommended as the first app path in this repo. Get the private path working first.

## Enabling the Apps Layer

The cluster-wide `apps` Kustomization is suspended by default.

When you are ready to let Flux deploy app attachments from `apps/`, unsuspend it:

```bash
kubectl -n flux-system patch kustomization apps --type=merge -p '{"spec":{"suspend":false}}'
```
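The `flux` CLI equivalent, if you prefer it over a raw patch (same effect; note that Flux will re-apply whatever `suspend` value is committed in Git):

```bash
flux resume kustomization apps -n flux-system
```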

Or commit a change to `clusters/prod/flux-system/kustomization-apps.yaml` changing:

```yaml
suspend: true
```

to:

```yaml
suspend: false
```

## First Deploy Checklist

Before deploying the first app, make sure:

1. app image builds successfully
2. app repo contains valid `deploy/prod` manifests
3. this repo contains the `GitRepository` + `Kustomization` attachment objects
4. required Doppler secrets exist
5. `apps` is unsuspended if you are using the top-level `apps` layer

## Verification Commands

From a machine with cluster access:

```bash
kubectl -n flux-system get gitrepositories,kustomizations
kubectl get ns
kubectl -n my-app get deploy,svc,pods,externalsecret,secret
```

If private over Tailscale:

```bash
kubectl -n my-app get svc my-app-tailscale -o wide
```

## Minimal Recommendation

If you want the simplest, lowest-risk first deploy:

1. create a separate app repo
2. add `deploy/base` + `deploy/prod`
3. add a `GitRepository` + `Kustomization` in this repo under `apps/`
4. keep the app private with a Tailscale `LoadBalancer` Service
5. use Doppler + `ExternalSecret` for runtime config

That matches the current cluster design with the least surprise.
@@ -1,296 +1,268 @@
-# Hetzner Kubernetes Cluster
+# Proxmox Kubernetes Cluster

-Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible.
+Private HA K3s cluster on Proxmox, provisioned by Terraform, bootstrapped by Ansible, and reconciled by Flux.

## Architecture

-| Component | Details |
-|-----------|---------|
-| **Control Plane** | 3x CX23 (HA) |
-| **Workers** | 3x CX33 |
-| **K8s** | k3s (latest, HA) |
-| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
-| **Access** | SSH/API and private services restricted to Tailnet |
-| **Bootstrap** | Terraform + Ansible + Flux |
+| Component | Current Baseline |
+|-----------|------------------|
+| **Control plane** | 3 Proxmox VMs, VMIDs `200-202`, IPs `10.27.27.30-32`, 2 vCPU / 4 GiB / 32 GiB |
+| **Workers** | 5 Proxmox VMs, VMIDs `210-214`, IPs `10.27.27.41-45`, 4 vCPU / 8 GiB / 64 GiB |
+| **Kubernetes** | K3s `v1.34.6+k3s1`, HA embedded etcd, kube-vip API VIP `10.27.27.40` |
+| **Proxmox** | Node `flex`, template VMID `9000`, datastore `Flash`, bridge `vmbr0` |
+| **Storage** | Raw-manifest `nfs-subdir-external-provisioner`, `10.27.27.239:/TheFlash/k8s-nfs`, default StorageClass `flash-nfs` |
+| **GitOps** | Flux source `platform` on branch `main`; `apps` Kustomization is intentionally suspended |
+| **Private access** | Tailscale operator exposes Rancher, Grafana, and Prometheus; no public ingress baseline |
+| **Runtime secrets** | Doppler service token bootstraps External Secrets Operator |

+K3s is pinned because Rancher chart `2.13.3` requires Kubernetes `<1.35.0-0`.

## Prerequisites

-### 1. Hetzner Cloud API Token
+- Terraform `>= 1.0`.
+- Ansible with Python `jinja2` and `pyyaml`.
+- `kubectl` for local verification.
+- Proxmox API token for the `bpg/proxmox` provider.
+- S3-compatible bucket for Terraform state, currently Backblaze B2.
+- SSH key pair available to Terraform and Ansible, defaulting to `~/.ssh/infra` and `~/.ssh/infra.pub`.

-1. Go to [Hetzner Cloud Console](https://console.hetzner.com/)
-2. Select your project (or create a new one)
-3. Navigate to **Security** → **API Tokens**
-4. Click **Generate API Token**
-5. Set description: `k8s-cluster-terraform`
-6. Select permissions: **Read & Write**
-7. Click **Generate API Token**
-8. **Copy the token immediately** - it won't be shown again!
+Expected Proxmox inputs:

-### 2. Backblaze B2 Bucket (for Terraform State)
+| Setting | Value |
+|---------|-------|
+| Endpoint | `https://100.105.0.115:8006/` |
+| Node | `flex` |
+| Clone source | Template VMID `9000` (`ubuntu-2404-k8s-template`) |
+| Storage | `Flash` |

-1. Go to [Backblaze B2](https://secure.backblaze.com/b2_buckets.htm)
-2. Click **Create a Bucket**
-3. Set bucket name: `k8s-terraform-state` (must be globally unique)
-4. Choose **Private** access
-5. Click **Create Bucket**
-6. Create application key:
-   - Go to **App Keys** → **Add a New Application Key**
-   - Name: `terraform-state`
-   - Allow access to: `k8s-terraform-state` bucket only
-   - Type: **Read and Write**
-   - Copy **keyID** (access key) and **applicationKey** (secret key)
-7. Note your bucket's S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`)
+## Local Setup

-### 3. SSH Key Pair
-
-```bash
-ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s
-```
-
-### 4. Local Tools
-
-- [Terraform](https://terraform.io/downloads) >= 1.0
-- [Ansible](https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html) >= 2.9
-- Python 3 with `jinja2` and `pyyaml`
-
-## Setup
-
-### 1. Clone Repository
-
-```bash
-git clone <your-gitea-repo>/HetznerTerra.git
-cd HetznerTerra
-```
-
-### 2. Configure Variables
+Create local variables from the example:

```bash
cp terraform.tfvars.example terraform.tfvars
```

-Edit `terraform.tfvars`:
+Important defaults in `terraform.tfvars.example`:

```hcl
-hcloud_token = "your-hetzner-api-token"
+proxmox_endpoint         = "https://100.105.0.115:8006/"
+proxmox_api_token_id     = "terraform-prov@pve!k8s-cluster"
+proxmox_api_token_secret = "your-proxmox-api-token-secret"

-ssh_public_key  = "~/.ssh/hetzner_k8s.pub"
-ssh_private_key = "~/.ssh/hetzner_k8s"
+ssh_public_key  = "~/.ssh/infra.pub"
+ssh_private_key = "~/.ssh/infra"

s3_access_key = "your-backblaze-key-id"
s3_secret_key = "your-backblaze-application-key"
s3_endpoint   = "https://s3.eu-central-003.backblazeb2.com"
s3_bucket     = "k8s-terraform-state"

-tailscale_auth_key = "tskey-auth-..."
-tailscale_tailnet  = "yourtailnet.ts.net"
-
-restrict_api_ssh_to_tailnet = true
-tailnet_cidr                = "100.64.0.0/10"
-enable_nodeport_public      = false
-
-allowed_ssh_ips = []
-allowed_api_ips = []
+tailscale_tailnet = "yourtailnet.ts.net"
+kube_api_vip      = "10.27.27.40"
```

-### 3. Initialize Terraform
+Initialize Terraform with backend credentials:

```bash
-cd terraform
-
-# Create backend config file (or use CLI args)
-cat > backend.hcl << EOF
-endpoint = "https://s3.eu-central-003.backblazeb2.com"
-bucket = "k8s-terraform-state"
-access_key = "your-backblaze-key-id"
-secret_key = "your-backblaze-application-key"
-skip_requesting_account_id = true
-EOF
-
-terraform init -backend-config=backend.hcl
+terraform -chdir=terraform init \
+  -backend-config="endpoint=<s3-endpoint>" \
+  -backend-config="bucket=<s3-bucket>" \
+  -backend-config="region=auto" \
+  -backend-config="access_key=<s3-access-key>" \
+  -backend-config="secret_key=<s3-secret-key>" \
+  -backend-config="skip_requesting_account_id=true"
```

-### 4. Plan and Apply
+## Common Commands

+Terraform:

```bash
-terraform plan -var-file=../terraform.tfvars
-terraform apply -var-file=../terraform.tfvars
+terraform -chdir=terraform fmt -recursive
+terraform -chdir=terraform validate
+terraform -chdir=terraform plan -var-file=../terraform.tfvars
+terraform -chdir=terraform apply -var-file=../terraform.tfvars
```

-### 5. Generate Ansible Inventory
+Ansible setup:

```bash
-cd ../ansible
+ansible-galaxy collection install -r ansible/requirements.yml
+cd ansible
python3 generate_inventory.py
+ansible-playbook site.yml --syntax-check
```

-### 6. Bootstrap Cluster
+Manual Ansible bootstrap uses the same extra vars as the deploy workflow:

```bash
-ansible-playbook site.yml
+cd ansible
+ansible-playbook site.yml \
+  -e "tailscale_auth_key=$TAILSCALE_AUTH_KEY" \
+  -e "tailscale_tailnet=$TAILSCALE_TAILNET" \
+  -e "tailscale_oauth_client_id=$TAILSCALE_OAUTH_CLIENT_ID" \
+  -e "tailscale_oauth_client_secret=$TAILSCALE_OAUTH_CLIENT_SECRET" \
+  -e "doppler_hetznerterra_service_token=$DOPPLER_HETZNERTERRA_SERVICE_TOKEN" \
+  -e "tailscale_api_key=${TAILSCALE_API_KEY:-}" \
+  -e "grafana_admin_password=${GRAFANA_ADMIN_PASSWORD:-}" \
+  -e "cluster_name=k8s-cluster"
```

-### 7. Get Kubeconfig
+Flux/Kustomize verification:

+```bash
+kubectl kustomize infrastructure/addons/<addon>
+kubectl kustomize infrastructure/addons
+kubectl kustomize clusters/prod/flux-system
+```

+Refresh kubeconfig after rebuilds:

+```bash
+scripts/refresh-kubeconfig.sh 10.27.27.30
+export KUBECONFIG=$(pwd)/outputs/kubeconfig
+kubectl get nodes
+```

-Use `scripts/refresh-kubeconfig.sh <cp1-public-ip>` to refresh kubeconfig against the primary control-plane public IP after rebuilds.
+Run the tailnet smoke check from cp1:

+```bash
+ssh ubuntu@10.27.27.30 'bash -s' < scripts/smoke-check-tailnet-services.sh
+```

## Gitea CI/CD

-This repository includes Gitea workflows for:
+The supported full rebuild path is the Gitea deploy workflow.

-- **deploy**: End-to-end Terraform + Ansible + Flux bootstrap + restore + health checks
-- **destroy**: Cluster teardown with backup-aware cleanup
-- **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
+| Workflow | Trigger | Purpose |
+|----------|---------|---------|
+| `.gitea/workflows/deploy.yml` | PR to `main`, push to `main`, manual dispatch | PRs run Terraform plan; pushes run Terraform apply, Ansible bootstrap, Flux bootstrap, addon gates, health checks, and tailnet smoke checks |
+| `.gitea/workflows/destroy.yml` | Manual dispatch with `confirm: destroy` | Terraform destroy with retries; no Rancher backup gate |
+| `.gitea/workflows/dashboards.yml` | Grafana content changes or manual dispatch | Fast Grafana datasource/dashboard update through `ansible/dashboards.yml` |

-### Required Gitea Secrets
+Deploy and destroy share `concurrency.group: prod-cluster` so they do not run at the same time.

-Set these in your Gitea repository settings (**Settings** → **Secrets** → **Actions**):
+Deploy sequence on push to `main`:

+1. Terraform fmt/init/validate/plan/apply.
+2. Cleanup/retry around known transient Proxmox clone and disk-update failures.
+3. Generate Ansible inventory from Terraform outputs.
+4. Prepare critical image archives with `skopeo` on the runner.
+5. Run `ansible/site.yml` to bootstrap nodes, K3s, kube-vip, prerequisite secrets, and kubeconfig.
+6. Apply Flux CRDs/controllers and the `clusters/prod/flux-system` graph.
+7. Gate cert-manager, External Secrets, Tailscale, NFS, Rancher, and observability.
+8. Run post-deploy health checks and Tailscale service smoke checks.

+Required Gitea secrets:

| Secret | Description |
|--------|-------------|
-| `HCLOUD_TOKEN` | Hetzner Cloud API token |
-| `S3_ACCESS_KEY` | Backblaze B2 keyID |
-| `S3_SECRET_KEY` | Backblaze B2 applicationKey |
-| `S3_ENDPOINT` | Backblaze S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`) |
-| `S3_BUCKET` | S3 bucket name (e.g., `k8s-terraform-state`) |
+| `PROXMOX_ENDPOINT` | Proxmox API endpoint, for example `https://100.105.0.115:8006/` |
+| `PROXMOX_API_TOKEN_ID` | Proxmox API token ID |
+| `PROXMOX_API_TOKEN_SECRET` | Proxmox API token secret |
+| `S3_ACCESS_KEY` | S3/Backblaze access key for Terraform state |
+| `S3_SECRET_KEY` | S3/Backblaze secret key for Terraform state |
+| `S3_ENDPOINT` | S3 endpoint, for example `https://s3.eu-central-003.backblazeb2.com` |
+| `S3_BUCKET` | Terraform state bucket, for example `k8s-terraform-state` |
| `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
-| `TAILSCALE_TAILNET` | Tailnet domain (e.g., `yourtailnet.ts.net`) |
-| `TAILSCALE_OAUTH_CLIENT_ID` | Tailscale OAuth client ID for Kubernetes Operator |
-| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator |
-| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets |
-| `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) |
-| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
+| `TAILSCALE_TAILNET` | Tailnet domain, for example `silverside-gopher.ts.net` |
+| `TAILSCALE_OAUTH_CLIENT_ID` | Tailscale OAuth client ID for the Kubernetes operator |
+| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for the Kubernetes operator |
+| `TAILSCALE_API_KEY` | Optional API key used to delete stale offline reserved devices before service proxies exist |
+| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for runtime cluster secrets |
+| `GRAFANA_ADMIN_PASSWORD` | Optional Grafana admin password |
| `SSH_PUBLIC_KEY` | SSH public key content |
| `SSH_PRIVATE_KEY` | SSH private key content |

-## GitOps (Flux)
+## GitOps Graph

-This repo uses Flux for continuous reconciliation after Terraform + Ansible bootstrap.
+Flux entrypoint:

-### Stable private-only baseline
+```text
+clusters/prod/flux-system/
+├── gotk-components.yaml
+├── gitrepository-platform.yaml
+├── kustomization-infrastructure.yaml
+└── kustomization-apps.yaml   # suspend: true
+```

-The current default target is the HA private baseline:
+Active infrastructure addons from `infrastructure/addons/kustomization.yaml`:

-- `3` control plane nodes
-- `3` worker nodes
-- private Hetzner network only
-- Tailscale for operator and service access
-- Flux-managed platform addons with `apps` suspended by default
+- `addon-nfs-storage`
+- `addon-external-secrets`
+- `addon-cert-manager`
+- `addon-tailscale-operator`
+- `addon-tailscale-proxyclass`
+- `traefik` HelmRelease manifests applied directly by the top-level infrastructure Kustomization
+- `addon-observability`
+- `addon-observability-content`
+- `addon-rancher`
+- `addon-rancher-config`

-Detailed phase gates and success criteria live in `STABLE_BASELINE.md`.
+Chart/source strategy:

-This is the default until rebuilds are consistently green. High availability, public ingress, and app-layer expansion come later.
+- Vendored charts are intentional: `cert-manager`, `traefik`, `kube-prometheus-stack`, `tailscale-operator`, and `rancher` live under `infrastructure/charts/`.
+- External Secrets, Loki, and Promtail use Flux `OCIRepository` sources.
+- NFS storage is raw Kubernetes manifests, not a Helm chart.
+- Rancher backup/restore is not part of the current live graph.

-### Runtime secrets
+Doppler bootstrap details:

-Runtime cluster secrets are moving to Doppler + External Secrets Operator.
+- `ansible/roles/doppler-bootstrap` creates the `external-secrets` namespace and the Doppler token secret only.
+- The deploy workflow creates `ClusterSecretStore/doppler-hetznerterra` after ESO CRDs and webhook endpoints exist (sketched below).
+- The checked-in `infrastructure/addons/external-secrets/clustersecretstore-doppler-hetznerterra.yaml` is not included by the addon kustomization.

-- Doppler project: `hetznerterra`
-- Initial auth: service token via `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
-- First synced secrets:
-  - `GRAFANA_ADMIN_PASSWORD`
-
-Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed by Doppler.
+## Access URLs

+| Service | URL |
+|---------|-----|
+| Rancher | `https://rancher.silverside-gopher.ts.net/` |
+| Grafana | `http://grafana.silverside-gopher.ts.net/` |
+| Prometheus | `http://prometheus.silverside-gopher.ts.net:9090/` |

-### Repository layout
-
-- `clusters/prod/`: cluster entrypoint and Flux reconciliation objects
-- `clusters/prod/flux-system/`: `GitRepository` source and top-level `Kustomization` graph
-- `infrastructure/`: infrastructure addon reconciliation graph
-- `infrastructure/addons/*`: per-addon manifests for Flux-managed cluster addons
-- `apps/`: application workload layer (currently scaffolded)
-
-### Reconciliation graph
-
-- `infrastructure` (top-level)
-  - `addon-ccm`
-  - `addon-csi` depends on `addon-ccm`
-  - `addon-tailscale-operator`
-  - `addon-observability`
-  - `addon-observability-content` depends on `addon-observability`
-- `apps` depends on `infrastructure`
-
-### Bootstrap notes
-
-1. Install Flux controllers in `flux-system`.
-2. Create the Flux deploy key/secret named `flux-system` in `flux-system` namespace.
-3. Apply `clusters/prod/flux-system/` once to establish source + reconciliation graph.
-4. Bootstrap-only Ansible creates prerequisite secrets; Flux manages addon lifecycle after bootstrap.
-
-### Current addon status
-
-- Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
-- Active Flux addons for the current baseline: `addon-ccm`, `addon-csi`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`.
-- `apps` remains suspended until workload rollout is explicitly enabled.
-- Ansible is limited to cluster bootstrap, prerequisite secret creation, pre-proxy Tailscale cleanup, and kubeconfig finalization.
-- Weave GitOps / Flux UI is no longer deployed; use Rancher or the `flux` CLI for Flux operations.
-
-### Rancher access
-
-- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/`.
-- The public Hetzner load balancer path is not used for Rancher.
-- Rancher stores state in embedded etcd; no external database is used.
-
-### Stable baseline acceptance
-
-A rebuild is considered successful only when all of the following pass without manual intervention:
-
-- Terraform create succeeds for the default `3` control planes and `3` workers.
-- Ansible bootstrap succeeds end-to-end.
-- All nodes become `Ready`.
-- Flux core reconciliation is healthy.
-- External Secrets Operator is ready.
-- Tailscale operator is ready.
-- Tailnet smoke checks pass for Rancher, Grafana, and Prometheus.
-- Terraform destroy succeeds cleanly or succeeds after workflow retries.

## Observability Stack

Flux deploys a lightweight observability stack in the `observability` namespace:

- `kube-prometheus-stack` (Prometheus + Grafana)
- `loki`
- `promtail`

Grafana content is managed as code via ConfigMaps in `infrastructure/addons/observability-content/`.

Grafana and Prometheus are exposed through dedicated Tailscale LoadBalancer services when the Tailscale Kubernetes Operator is healthy.

### Access Grafana and Prometheus

Preferred private access:

- Grafana: `http://grafana.silverside-gopher.ts.net/`
- Prometheus: `http://prometheus.silverside-gopher.ts.net:9090/`

-Fallback (port-forward from a tailnet-connected machine):
-
-Run from a tailnet-connected machine:
+Fallback port-forward from a tailnet-connected machine:

```bash
export KUBECONFIG=$(pwd)/outputs/kubeconfig

kubectl -n observability port-forward svc/kube-prometheus-stack-grafana 3000:80
kubectl -n observability port-forward svc/kube-prometheus-stack-prometheus 9090:9090
```

-Then open:
-
-- Grafana: http://127.0.0.1:3000
-- Prometheus: http://127.0.0.1:9090
-
-Grafana user: `admin`
-Grafana password: value of `GRAFANA_ADMIN_PASSWORD` secret (or the generated value shown by Ansible output)
+Grafana user is `admin`; password comes from the `GRAFANA_ADMIN_PASSWORD` Doppler secret or the workflow-provided fallback.

+## Operations

+Scale workers by updating `terraform.tfvars` counts, IP lists, and VMID lists together. If node names or VMIDs change, also update the hard-coded retry cleanup target map in `.gitea/workflows/deploy.yml`.
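A hedged sketch of the kind of coordinated `terraform.tfvars` change that implies; the variable names are illustrative and `terraform/variables.tf` is authoritative:

```hcl
# Adding a sixth worker: count, VMID list, and IP list must move together.
worker_count = 6
worker_vmids = [210, 211, 212, 213, 214, 215]
worker_ips   = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45", "10.27.27.46"]
```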

-### Verify Tailscale exposure
+Upgrade K3s by changing the role defaults in `ansible/roles/k3s-server/defaults/main.yml` and `ansible/roles/k3s-agent/defaults/main.yml`. Check Rancher chart compatibility before moving to a Kubernetes minor outside `<1.35.0-0`.
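The pin itself is a one-line default; per the version note above, the variable is `k3s_version` (file contents beyond this line are assumed):

```yaml
# ansible/roles/k3s-server/defaults/main.yml (mirror the change in k3s-agent)
k3s_version: "v1.34.6+k3s1"
```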
|
||||
|
||||
Destroy through the Gitea `Destroy` workflow with `confirm: destroy`, or locally with:
|
||||
|
||||
```bash
|
||||
export KUBECONFIG=$(pwd)/outputs/kubeconfig
|
||||
terraform -chdir=terraform destroy -var-file=../terraform.tfvars
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
Check K3s from cp1:
|
||||
|
||||
```bash
|
||||
ssh ubuntu@10.27.27.30 'sudo k3s kubectl get nodes -o wide'
|
||||
ssh ubuntu@10.27.27.30 'sudo journalctl -u k3s -n 120 --no-pager'
|
||||
```
|
||||
|
||||
Check Flux and Rancher:
|
||||
|
||||
```bash
|
||||
kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
|
||||
kubectl -n flux-system describe helmrelease rancher
|
||||
kubectl -n cattle-system get pods,deploy -o wide
|
||||
```
|
||||
|
||||
Check Tailscale services:
|
||||
|
||||
```bash
|
||||
kubectl -n tailscale-system get pods
|
||||
kubectl -n cattle-system get svc rancher-tailscale
|
||||
kubectl -n observability get svc grafana-tailscale prometheus-tailscale
|
||||
@@ -299,131 +271,14 @@ kubectl -n observability describe svc grafana-tailscale | grep TailscaleProxyRea
|
||||
kubectl -n observability describe svc prometheus-tailscale | grep TailscaleProxyReady
|
||||
```
|
||||
|
||||
If `TailscaleProxyReady=False`, check:
|
||||
|
||||
```bash
|
||||
kubectl -n tailscale-system logs deployment/operator --tail=100
|
||||
```
|
||||
|
||||
Common cause: OAuth client missing tag/scopes permissions.
|
||||
|
||||
### Fast dashboard iteration workflow
|
||||
|
||||
Use the `Deploy Grafana Content` workflow when changing dashboard/data source templates.
|
||||
It avoids full cluster provisioning and only applies Grafana content resources:
|
||||
|
||||
- `ansible/roles/observability-content/templates/grafana-datasources.yaml.j2`
|
||||
- `ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2`
|
||||
- `ansible/dashboards.yml`
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
.
|
||||
├── terraform/
|
||||
│ ├── main.tf
|
||||
│ ├── variables.tf
|
||||
│ ├── network.tf
|
||||
│ ├── firewall.tf
|
||||
│ ├── ssh.tf
|
||||
│ ├── servers.tf
|
||||
│ ├── outputs.tf
|
||||
│ └── backend.tf
|
||||
├── ansible/
|
||||
│ ├── inventory.tmpl
|
||||
│ ├── generate_inventory.py
|
||||
│ ├── site.yml
|
||||
│ ├── roles/
|
||||
│ │ ├── common/
|
||||
│ │ ├── k3s-server/
|
||||
│ │ ├── k3s-agent/
|
||||
│ │ ├── addon-secrets-bootstrap/
|
||||
│ │ ├── observability-content/
|
||||
│ │ └── observability/
|
||||
│ └── ansible.cfg
|
||||
├── .gitea/
|
||||
│ └── workflows/
|
||||
│ ├── terraform.yml
|
||||
│ ├── ansible.yml
|
||||
│ └── dashboards.yml
|
||||
├── outputs/
|
||||
├── terraform.tfvars.example
|
||||
└── README.md
|
||||
```
|
||||
|
||||
## Firewall Rules
|
||||
|
||||
| Port | Source | Purpose |
|
||||
|------|--------|---------|
|
||||
| 22 | Tailnet CIDR | SSH |
|
||||
| 6443 | Tailnet CIDR + internal | Kubernetes API |
|
||||
| 41641/udp | Any | Tailscale WireGuard |
|
||||
| 9345 | 10.0.0.0/16 | k3s Supervisor (HA join) |
|
||||
| 2379 | 10.0.0.0/16 | etcd Client |
|
||||
| 2380 | 10.0.0.0/16 | etcd Peer |
|
||||
| 8472 | 10.0.0.0/16 | Flannel VXLAN |
|
||||
| 10250 | 10.0.0.0/16 | Kubelet |
|
||||
| 30000-32767 | Optional | NodePorts (disabled by default) |

## Operations

### Scale Workers

Edit `terraform.tfvars`:

```hcl
worker_count = 5
```

Then:

```bash
terraform apply
ansible-playbook site.yml
```

### Upgrade k3s

```bash
ansible-playbook site.yml -t upgrade
```

### Destroy Cluster

```bash
terraform destroy
```

## Troubleshooting

### Check k3s Logs

```bash
ssh root@<control-plane-ip> journalctl -u k3s -f
```

### Reset k3s

```bash
ansible-playbook site.yml -t reset
```

## Costs Breakdown

| Resource | Quantity | Unit Price | Monthly |
|----------|----------|------------|---------|
| CX23 (Control Plane) | 3 | €2.99 | €8.97 |
| CX33 (Workers) | 4 | €4.99 | €19.96 |
| Backblaze B2 | ~1 GB | Free (first 10GB) | €0.00 |
| **Total** | | | **€28.93/mo** |

If local `kubectl` falls back to `localhost:8080`, refresh `outputs/kubeconfig` with `scripts/refresh-kubeconfig.sh 10.27.27.30`.
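
The script itself is not part of this diff; a minimal sketch of what it presumably does, based on the `ubuntu` SSH user from the inventory and the kubeconfig rewrite in `site.yml`:

```bash
#!/usr/bin/env bash
# Sketch only: re-fetch the admin kubeconfig from a control plane and point it
# at that node instead of 127.0.0.1, mirroring the site.yml post-task.
set -euo pipefail
CP="${1:?usage: refresh-kubeconfig.sh <control-plane-ip>}"
ssh "ubuntu@${CP}" sudo cat /etc/rancher/k3s/k3s.yaml \
  | sed "s/127.0.0.1/${CP}/g" > outputs/kubeconfig
KUBECONFIG="$(pwd)/outputs/kubeconfig" kubectl get nodes
```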

## Security Notes

- Control plane has HA (3 nodes, can survive 1 failure)
- Consider adding Hetzner load balancer for API server
- Rotate API tokens regularly
- Use network policies in Kubernetes
- Enable audit logging for production
- Never commit `terraform.tfvars`, kubeconfigs, private keys, `outputs/`, or real secret values.
- Terraform/bootstrap/CI secrets stay in Gitea Actions secrets.
- Runtime cluster secrets are sourced from Doppler through External Secrets.
- This repo does not manage Proxmox/LAN firewalls or public ingress.

## License

+14
-7
@@ -1,6 +1,6 @@
# Gitea Secrets Setup

This document describes the secrets required for the HetznerTerra deployment workflow.
This document describes the secrets required for the Proxmox-based deployment workflow.

## Required Secrets

@@ -9,10 +9,17 @@ Add these secrets in your Gitea repository settings:

### Infrastructure Secrets

#### `HCLOUD_TOKEN`
- Hetzner Cloud API token
- Get from: https://console.hetzner.com/projects/{project-id}/security/api-tokens
- Permissions: Read & Write
#### `PROXMOX_ENDPOINT`
- Proxmox VE API endpoint
- Example: `https://100.105.0.115:8006/`

#### `PROXMOX_API_TOKEN_ID`
- Proxmox API token ID
- Example: `terraform-prov@pve!k8s-cluster`

#### `PROXMOX_API_TOKEN_SECRET`
- Proxmox API token secret
- Create with `pveum user token add terraform-prov@pve k8s-cluster`
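
A fuller one-time setup on the PVE host might look like the following; the `Administrator` role grant and the `--privsep 0` choice are assumptions, not something this repo pins down:

```bash
# Sketch: create the automation user, grant it a role, then mint the token.
# A narrower custom PVE role also works if you prefer least privilege.
pveum user add terraform-prov@pve
pveum aclmod / -user terraform-prov@pve -role Administrator
pveum user token add terraform-prov@pve k8s-cluster --privsep 0
```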

#### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
- Backblaze B2 credentials for Terraform state storage
@@ -31,7 +38,7 @@ Add these secrets in your Gitea repository settings:

#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
- SSH key pair for cluster access
- Generate with: `ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s`
- Generate with: `ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra`
- Private key content (include BEGIN/END lines)
- Public key content (full line starting with ssh-ed25519)

@@ -90,4 +97,4 @@ Check the workflow logs to verify all secrets are being used correctly.
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap
- Rotate Tailscale auth keys periodically
- Review OAuth client permissions regularly
- The workflow automatically opens SSH/API access only for the runner's IP during deployment
- CI expects direct SSH access to the Proxmox VMs and direct Proxmox API access

+12
-14
@@ -5,9 +5,9 @@ This document defines the current engineering target for this repository.
## Topology

- 3 control planes (HA etcd cluster)
- 3 workers
- Hetzner Load Balancer for Kubernetes API
- private Hetzner network
- 5 workers
- kube-vip API VIP (`10.27.27.40`)
- private Proxmox/LAN network (`10.27.27.0/24`)
- Tailscale operator access and service exposure
- Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`)
- Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`)
@@ -17,11 +17,10 @@ This document defines the current engineering target for this repository.
## In Scope

- Terraform infrastructure bootstrap
- Ansible k3s bootstrap with external cloud provider
- Ansible k3s bootstrap on Ubuntu cloud-init VMs
- **HA control plane (3 nodes with etcd quorum)**
- **Hetzner Load Balancer for Kubernetes API**
- **Hetzner CCM deployed via Ansible (before workers join)**
- **Hetzner CSI for persistent volumes (via Flux)**
- **kube-vip for Kubernetes API HA**
- **NFS-backed persistent volumes via `nfs-subdir-external-provisioner`**
- Flux core reconciliation
- External Secrets Operator with Doppler
- Tailscale private access and smoke-check validation
@@ -45,15 +44,14 @@ This document defines the current engineering target for this repository.

## Phase Gates

1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB).
2. Load Balancer is healthy with all 3 control plane targets.
3. Primary control plane bootstraps with `--cluster-init`.
4. Secondary control planes join via Load Balancer endpoint.
5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue).
6. Workers join successfully via Load Balancer and all nodes show proper `providerID`.
1. Terraform apply completes for HA topology (3 CP, 5 workers, 1 VIP).
2. Primary control plane bootstraps with `--cluster-init`.
3. kube-vip advertises `10.27.27.40:6443` from the control-plane set.
4. Secondary control planes join via the kube-vip endpoint.
5. Workers join successfully via the kube-vip endpoint.
7. etcd reports 3 healthy members.
8. Flux source and infrastructure reconciliation are healthy.
9. **CSI deploys and creates `hcloud-volumes` StorageClass**.
9. **NFS provisioner deploys and creates `flash-nfs` StorageClass**.
10. **PVC provisioning tested and working**.
11. External Secrets sync required secrets.
12. Tailscale private access works for Rancher, Grafana, and Prometheus.
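
A rough verification pass for these gates from the first control plane could look like this; `flux check`, the VIP address, and the `flash-nfs` StorageClass name come from the documents above, the rest is assumed convenience:

```bash
# Walk the phase gates top to bottom from control_plane[0].
kubectl get nodes -o wide                                     # CPs and workers Ready
kubectl --server=https://10.27.27.40:6443 get --raw=/readyz   # VIP serves the API
flux check                                                    # Flux reconciliation healthy
kubectl get storageclass flash-nfs                            # NFS provisioner registered
kubectl -n observability get svc grafana-tailscale prometheus-tailscale
```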
+2
-1
@@ -3,7 +3,8 @@ inventory = inventory.ini
host_key_checking = False
retry_files_enabled = False
roles_path = roles
stdout_callback = yaml
stdout_callback = default
result_format = yaml
interpreter_python = auto_silent

[privilege_escalation]

@@ -13,8 +13,7 @@ control_plane
workers

[cluster:vars]
ansible_user=root
ansible_user=ubuntu
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file={{ private_key_file }}
k3s_version=latest
kube_api_endpoint={{ kube_api_lb_ip }}

@@ -1,14 +1,4 @@
---
- name: Apply Hetzner cloud secret
  shell: >-
    kubectl -n kube-system create secret generic hcloud
    --from-literal=token='{{ hcloud_token }}'
    --from-literal=network='{{ cluster_name }}-network'
    --dry-run=client -o yaml | kubectl apply -f -
  changed_when: true
  no_log: true
  when: hcloud_token | default('') | length > 0

- name: Ensure Tailscale operator namespace exists
  command: >-
    kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }}

@@ -0,0 +1,12 @@
---
bootstrap_prepull_images:
  - docker.io/rancher/mirrored-pause:3.6
  - docker.io/rancher/mirrored-coredns-coredns:1.14.2
  - docker.io/rancher/mirrored-metrics-server:v0.8.1
  - docker.io/rancher/local-path-provisioner:v0.0.35
  - docker.io/rancher/mirrored-library-traefik:3.6.10
  - docker.io/rancher/klipper-helm:v0.9.14-build20260309
  - ghcr.io/fluxcd/source-controller:v1.8.0
  - ghcr.io/fluxcd/kustomize-controller:v1.8.1
  - ghcr.io/fluxcd/helm-controller:v1.5.1
  - ghcr.io/fluxcd/notification-controller:v1.8.1
@@ -0,0 +1,59 @@
---
- name: Check for runner-provided bootstrap image archives
  stat:
    path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
  delegate_to: localhost
  become: false
  register: bootstrap_image_archive_stats
  loop: "{{ bootstrap_prepull_images }}"

- name: Ensure remote bootstrap image archive directory exists
  file:
    path: /tmp/bootstrap-image-archives
    state: directory
    mode: "0755"

- name: Copy runner-provided bootstrap image archives
  copy:
    src: "{{ item.stat.path }}"
    dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar"
    mode: "0644"
  loop: "{{ bootstrap_image_archive_stats.results }}"
  loop_control:
    label: "{{ item.item }}"
  when: item.stat.exists

- name: Import or pull bootstrap images into containerd
  shell: |
    if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
      echo "already present"
      exit 0
    fi

    archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
    if [ -s "${archive}" ]; then
      for attempt in 1 2 3; do
        if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
          echo "imported image"
          exit 0
        fi

        sleep 10
      done
    fi

    for attempt in 1 2 3 4 5; do
      if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
        echo "pulled image"
        exit 0
      fi

      sleep 10
    done

    exit 1
  args:
    executable: /bin/bash
  register: bootstrap_image_pull
  loop: "{{ bootstrap_prepull_images }}"
  changed_when: "'imported image' in bootstrap_image_pull.stdout or 'pulled image' in bootstrap_image_pull.stdout"
@@ -1,82 +0,0 @@
---
- name: Check if hcloud secret exists
  command: kubectl -n kube-system get secret hcloud
  register: hcloud_secret_check
  changed_when: false
  failed_when: false

- name: Fail if hcloud secret is missing
  fail:
    msg: "hcloud secret not found in kube-system namespace. CCM requires it."
  when: hcloud_secret_check.rc != 0

- name: Check if helm is installed
  command: which helm
  register: helm_check
  changed_when: false
  failed_when: false

- name: Install helm
  when: helm_check.rc != 0
  block:
    - name: Download helm install script
      get_url:
        url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
        dest: /tmp/get-helm-3.sh
        mode: "0755"

    - name: Run helm install script
      command: /tmp/get-helm-3.sh
      args:
        creates: /usr/local/bin/helm

- name: Add Hetzner Helm repository
  kubernetes.core.helm_repository:
    name: hcloud
    repo_url: https://charts.hetzner.cloud
    kubeconfig: /etc/rancher/k3s/k3s.yaml
  environment:
    KUBECONFIG: /etc/rancher/k3s/k3s.yaml

- name: Deploy Hetzner Cloud Controller Manager
  kubernetes.core.helm:
    name: hcloud-cloud-controller-manager
    chart_ref: hcloud/hcloud-cloud-controller-manager
    release_namespace: kube-system
    create_namespace: true
    values:
      networking:
        enabled: true
      nodeSelector:
        kubernetes.io/hostname: "{{ inventory_hostname }}"
      additionalTolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
    kubeconfig: /etc/rancher/k3s/k3s.yaml
    wait: true
    wait_timeout: 300s
  environment:
    KUBECONFIG: /etc/rancher/k3s/k3s.yaml

- name: Wait for CCM to be ready
  command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s
  changed_when: false
  register: ccm_rollout
  until: ccm_rollout.rc == 0
  retries: 3
  delay: 10

- name: Pause to ensure CCM is fully ready to process new nodes
  pause:
    seconds: 10

- name: Verify CCM is removing uninitialized taints
  command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
  register: uninitialized_taints
  changed_when: false
  failed_when: false

- name: Display taint status
  debug:
    msg: "Nodes with uninitialized taint: {{ uninitialized_taints.stdout }}"
@@ -1,12 +1,32 @@
---
- name: Check if cloud-init is installed
  command: which cloud-init
  register: cloud_init_binary
  changed_when: false
  failed_when: false

- name: Wait for cloud-init to finish first-boot tasks
  command: cloud-init status --wait
  register: cloud_init_wait
  changed_when: false
  failed_when: >-
    cloud_init_wait.rc not in [0, 2] or
    (
      'status: done' not in cloud_init_wait.stdout and
      'status: disabled' not in cloud_init_wait.stdout
    )
  when: cloud_init_binary.rc == 0

- name: Update apt cache
  apt:
    update_cache: true
    cache_valid_time: 3600
    lock_timeout: 600

- name: Upgrade packages
  apt:
    upgrade: dist
    lock_timeout: 600
  when: common_upgrade_packages | default(false)

- name: Install required packages
@@ -19,18 +39,27 @@
      - lsb-release
      - software-properties-common
      - jq
      - nfs-common
      - htop
      - vim
    state: present
    lock_timeout: 600

- name: Check active swap
  command: swapon --noheadings
  register: active_swap
  changed_when: false
  failed_when: false

- name: Disable swap
  command: swapoff -a
  changed_when: true
  when: active_swap.stdout | trim | length > 0

- name: Remove swap from fstab
  mount:
    name: swap
    fstype: swap
  lineinfile:
    path: /etc/fstab
    regexp: '^\s*[^#]\S+\s+\S+\s+swap\s+.*$'
    state: absent

- name: Load br_netfilter module
@@ -66,6 +95,10 @@

- name: Install tailscale
  shell: curl -fsSL https://tailscale.com/install.sh | sh
  register: tailscale_install
  until: tailscale_install.rc == 0
  retries: 5
  delay: 15
  when:
    - tailscale_auth_key | length > 0
    - tailscale_binary.rc != 0
@@ -78,9 +111,22 @@
  failed_when: false
  when: tailscale_auth_key | length > 0

- name: Connect node to tailnet
  command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
- name: Parse tailscale connection state
  set_fact:
    tailscale_backend_state: "{{ (tailscale_status.stdout | from_json).BackendState | default('') }}"
  when:
    - tailscale_auth_key | length > 0
    - tailscale_status.rc != 0 or '"BackendState":"Running"' not in tailscale_status.stdout
    - tailscale_status.rc == 0
    - tailscale_status.stdout | length > 0

- name: Connect node to tailnet
  command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
  register: tailscale_up
  until: tailscale_up.rc == 0
  retries: 5
  delay: 15
  no_log: true
  when:
    - tailscale_auth_key | length > 0
    - tailscale_status.rc != 0 or (tailscale_backend_state | default('')) != 'Running'
  changed_when: true

@@ -15,36 +15,10 @@
    --from-literal=dopplerToken='{{ doppler_hetznerterra_service_token }}'
    --dry-run=client -o yaml | kubectl apply -f -
  changed_when: true

- name: Check for ClusterSecretStore CRD
  command: kubectl get crd clustersecretstores.external-secrets.io
  register: doppler_clustersecretstore_crd
  changed_when: false
  failed_when: false

- name: Apply Doppler ClusterSecretStore
  shell: |
    cat <<'EOF' | kubectl apply -f -
    apiVersion: external-secrets.io/v1
    kind: ClusterSecretStore
    metadata:
      name: doppler-hetznerterra
    spec:
      provider:
        doppler:
          auth:
            secretRef:
              dopplerToken:
                name: doppler-hetznerterra-service-token
                key: dopplerToken
                namespace: external-secrets
    EOF
  changed_when: true
  when: doppler_clustersecretstore_crd.rc == 0
  no_log: true

- name: Note pending Doppler ClusterSecretStore bootstrap
  debug:
    msg: >-
      Skipping Doppler ClusterSecretStore bootstrap because the External Secrets CRD
      is not available yet. Re-run after External Secrets is installed.
  when: doppler_clustersecretstore_crd.rc != 0
Doppler service token secret is bootstrapped. The deploy workflow creates the
ClusterSecretStore after External Secrets CRDs and webhook endpoints are ready.

@@ -1,6 +1,7 @@
---
k3s_version: latest
k3s_version: v1.34.6+k3s1
k3s_server_url: ""
k3s_token: ""
k3s_node_ip: ""
k3s_kubelet_cloud_provider_external: true
k3s_kubelet_cloud_provider_external: false
k3s_flannel_iface: "{{ ansible_default_ipv4.interface | default('eth0') }}"

@@ -1,19 +1,53 @@
---
- name: Check if k3s agent is already installed
- name: Check if k3s agent service exists
  stat:
    path: /usr/local/bin/k3s-agent
  register: k3s_agent_binary
    path: /etc/systemd/system/k3s-agent.service
  register: k3s_agent_service

- name: Check k3s agent service state
  command: systemctl is-active k3s-agent
  register: k3s_agent_service_state
  changed_when: false
  failed_when: false
  when: k3s_agent_service.stat.exists

- name: Check installed k3s version
  command: k3s --version
  register: installed_k3s_version
  changed_when: false
  failed_when: false
  when: k3s_agent_service.stat.exists

- name: Determine whether k3s agent install is needed
  set_fact:
    k3s_agent_install_needed: >-
      {{
        (not k3s_agent_service.stat.exists)
        or ((k3s_agent_service_state.stdout | default('')) != 'active')
        or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default('')))
      }}

- name: Download k3s install script
  get_url:
    url: https://get.k3s.io
    dest: /tmp/install-k3s.sh
    mode: "0755"
  when: not k3s_agent_binary.stat.exists
  register: k3s_agent_install_script
  until: k3s_agent_install_script is succeeded
  retries: 5
  delay: 10
  when: k3s_agent_install_needed

- name: Install k3s agent
  when: not k3s_agent_binary.stat.exists
  when: k3s_agent_install_needed
  block:
    - name: Wait for Kubernetes API endpoint before agent join
      wait_for:
        host: "{{ k3s_server_url | regex_replace('^https?://([^:/]+).*$', '\\1') }}"
        port: 6443
        state: started
        timeout: 180

    - name: Run k3s agent install
      environment:
        INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
@@ -22,32 +56,12 @@
      command: >-
        /tmp/install-k3s.sh agent
        --node-ip {{ k3s_node_ip }}
        --flannel-iface=enp7s0
        --flannel-iface={{ k3s_flannel_iface }}
        {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
      args:
        creates: /usr/local/bin/k3s-agent
  rescue:
    - name: Show k3s-agent service status after failed install
      command: systemctl status k3s-agent --no-pager
      register: k3s_agent_status_after_install
      changed_when: false
      failed_when: false

    - name: Show recent k3s-agent logs after failed install
      command: journalctl -u k3s-agent -n 120 --no-pager
      register: k3s_agent_journal_after_install
      changed_when: false
      failed_when: false

    - name: Fail with k3s-agent diagnostics
      fail:
        msg: |
          k3s agent install failed on {{ inventory_hostname }}.
          Service status:
          {{ k3s_agent_status_after_install.stdout | default('n/a') }}

          Recent logs:
          {{ k3s_agent_journal_after_install.stdout | default('n/a') }}
      register: k3s_agent_install
      until: k3s_agent_install.rc == 0
      retries: 3
      delay: 20

- name: Wait for k3s agent to be ready
  command: systemctl is-active k3s-agent
@@ -56,3 +70,34 @@
  retries: 30
  delay: 10
  changed_when: false

- name: Show k3s-agent service status on failure
  command: systemctl status k3s-agent --no-pager
  register: k3s_agent_status
  changed_when: false
  failed_when: false
  when: agent_status is failed

- name: Show recent k3s-agent logs on failure
  command: journalctl -u k3s-agent -n 120 --no-pager
  register: k3s_agent_journal
  changed_when: false
  failed_when: false
  when: agent_status is failed

- name: Fail with k3s-agent diagnostics
  fail:
    msg: |
      k3s agent failed to become ready on {{ inventory_hostname }}.
      Install stdout:
      {{ k3s_agent_install.stdout | default('n/a') }}

      Install stderr:
      {{ k3s_agent_install.stderr | default('n/a') }}

      Service status:
      {{ k3s_agent_status.stdout | default('n/a') }}

      Recent logs:
      {{ k3s_agent_journal.stdout | default('n/a') }}
  when: agent_status is failed

@@ -1,11 +1,12 @@
---
k3s_version: latest
k3s_version: v1.34.6+k3s1
k3s_token: ""
k3s_node_ip: ""
k3s_primary_public_ip: ""
k3s_disable_embedded_ccm: true
k3s_disable_embedded_ccm: false
k3s_disable_servicelb: true
k3s_kubelet_cloud_provider_external: true
k3s_kubelet_cloud_provider_external: false
k3s_flannel_iface: "{{ ansible_default_ipv4.interface | default('eth0') }}"
# Load Balancer endpoint for HA cluster joins (set in inventory)
kube_api_endpoint: ""
# Tailscale DNS names for control planes (to enable tailnet access)

@@ -11,9 +11,21 @@
  failed_when: false
  when: k3s_service.stat.exists

- name: Check installed k3s version
  command: k3s --version
  register: installed_k3s_version
  changed_when: false
  failed_when: false
  when: k3s_service.stat.exists

- name: Determine whether k3s install is needed
  set_fact:
    k3s_install_needed: "{{ (not k3s_service.stat.exists) or ((k3s_service_state.stdout | default('')) != 'active') }}"
    k3s_install_needed: >-
      {{
        (not k3s_service.stat.exists)
        or ((k3s_service_state.stdout | default('')) != 'active')
        or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default('')))
      }}

- name: Wait for API endpoint on 6443 (secondary only)
  wait_for:
@@ -50,6 +62,10 @@
    url: https://get.k3s.io
    dest: /tmp/install-k3s.sh
    mode: "0755"
  register: k3s_install_script
  until: k3s_install_script is succeeded
  retries: 5
  delay: 10
  when: k3s_install_needed

- name: Install k3s server (primary)
@@ -61,7 +77,7 @@
    --cluster-init
    --advertise-address={{ k3s_primary_ip }}
    --node-ip={{ k3s_node_ip }}
    --flannel-iface=enp7s0
    --flannel-iface={{ k3s_flannel_iface }}
    --tls-san={{ k3s_primary_ip }}
    --tls-san={{ k3s_primary_public_ip }}
    --tls-san={{ kube_api_endpoint }}
@@ -69,6 +85,10 @@
    {% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
    {% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
    {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
  register: primary_install
  until: primary_install.rc == 0
  retries: 3
  delay: 20
  when:
    - k3s_install_needed
    - k3s_primary | default(false)
@@ -87,40 +107,14 @@
    --server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
    --advertise-address={{ k3s_node_ip }}
    --node-ip={{ k3s_node_ip }}
    --flannel-iface=enp7s0
    --flannel-iface={{ k3s_flannel_iface }}
    {% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
    {% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
    {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
  register: secondary_install

  rescue:
    - name: Show k3s service status after failed secondary install
      command: systemctl status k3s --no-pager
      register: k3s_status_after_install
      changed_when: false
      failed_when: false

    - name: Show recent k3s logs after failed secondary install
      command: journalctl -u k3s -n 120 --no-pager
      register: k3s_journal_after_install
      changed_when: false
      failed_when: false

    - name: Fail with secondary install diagnostics
      fail:
        msg: |
          Secondary k3s install failed on {{ inventory_hostname }}.
          Install stdout:
          {{ secondary_install.stdout | default('n/a') }}

          Install stderr:
          {{ secondary_install.stderr | default('n/a') }}

          Service status:
          {{ k3s_status_after_install.stdout | default('n/a') }}

          Recent logs:
          {{ k3s_journal_after_install.stdout | default('n/a') }}
  until: secondary_install.rc == 0
  retries: 3
  delay: 20

- name: Wait for k3s to be ready
  command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}"

@@ -0,0 +1,7 @@
---
kube_vip_version: v1.1.2
kube_vip_interface: "{{ ansible_default_ipv4.interface | default('eth0') }}"
kube_vip_address: "{{ kube_api_endpoint }}"
kube_vip_prepull_images:
  - docker.io/rancher/mirrored-pause:3.6
  - ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
@@ -0,0 +1,102 @@
---
- name: Check for runner-provided kube-vip image archive
  stat:
    path: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar"
  delegate_to: localhost
  become: false
  register: kube_vip_bootstrap_archive

- name: Copy runner-provided kube-vip image archive
  copy:
    src: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar"
    dest: /tmp/kube-vip-bootstrap.tar
    mode: "0644"
  when: kube_vip_bootstrap_archive.stat.exists

- name: Import runner-provided kube-vip image archive
  command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
  changed_when: false
  when: kube_vip_bootstrap_archive.stat.exists

- name: Pre-pull kube-vip bootstrap images into containerd
  shell: |
    if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
      echo "already present"
      exit 0
    fi

    for attempt in 1 2 3; do
      if timeout 120s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
        echo "pulled image"
        exit 0
      fi

      sleep 10
    done

    exit 1
  args:
    executable: /bin/bash
  register: kube_vip_image_pull
  loop: "{{ kube_vip_prepull_images }}"
  changed_when: "'pulled image' in kube_vip_image_pull.stdout"

- name: Render kube-vip control plane manifest
  template:
    src: kube-vip-control-plane.yaml.j2
    dest: /tmp/kube-vip-control-plane.yaml
    mode: "0644"

- name: Apply kube-vip control plane manifest
  command: kubectl apply -f /tmp/kube-vip-control-plane.yaml
  register: kube_vip_apply
  until: kube_vip_apply.rc == 0
  retries: 3
  delay: 10
  changed_when: true

- name: Wait for local kube-vip pod to be ready
  shell: >-
    kubectl -n kube-system get pods
    -l app.kubernetes.io/name=kube-vip
    --field-selector spec.nodeName={{ inventory_hostname }}
    -o jsonpath='{.items[0].status.conditions[?(@.type=="Ready")].status}'
  register: kube_vip_pod_ready
  changed_when: false
  until: kube_vip_pod_ready.stdout == "True"
  retries: 30
  delay: 10

- name: Show kube-vip pod status on failure
  command: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide
  register: kube_vip_pods
  changed_when: false
  failed_when: false
  when: kube_vip_pod_ready is failed

- name: Describe kube-vip pod on failure
  shell: >-
    kubectl -n kube-system describe pod
    $(kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip --field-selector spec.nodeName={{ inventory_hostname }} -o jsonpath='{.items[0].metadata.name}')
  register: kube_vip_pod_describe
  changed_when: false
  failed_when: false
  when: kube_vip_pod_ready is failed

- name: Fail with kube-vip diagnostics
  fail:
    msg: |
      kube-vip failed to become ready on {{ inventory_hostname }}.
      Pods:
      {{ kube_vip_pods.stdout | default('n/a') }}

      Describe:
      {{ kube_vip_pod_describe.stdout | default('n/a') }}
  when: kube_vip_pod_ready is failed

- name: Wait for API VIP on 6443
  wait_for:
    host: "{{ kube_vip_address }}"
    port: 6443
    state: started
    timeout: 180
@@ -0,0 +1,110 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-vip
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: system:kube-vip-role
rules:
  - apiGroups: [""]
    resources: ["services/status"]
    verbs: ["update"]
  - apiGroups: [""]
    resources: ["services", "endpoints"]
    verbs: ["list", "get", "watch", "update"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["list", "get", "watch", "update", "patch"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["list", "get", "watch", "update", "create"]
  - apiGroups: ["discovery.k8s.io"]
    resources: ["endpointslices"]
    verbs: ["list", "get", "watch", "update"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: system:kube-vip-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:kube-vip-role
subjects:
  - kind: ServiceAccount
    name: kube-vip
    namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kube-vip
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-vip
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kube-vip
    spec:
      serviceAccountName: kube-vip
      hostNetwork: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/control-plane
                    operator: Exists
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
      containers:
        - name: kube-vip
          image: ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
          imagePullPolicy: IfNotPresent
          args:
            - manager
          env:
            - name: vip_arp
              value: "true"
            - name: port
              value: "6443"
            - name: vip_interface
              value: {{ kube_vip_interface | quote }}
            - name: vip_subnet
              value: "32"
            - name: cp_enable
              value: "true"
            - name: cp_namespace
              value: kube-system
            - name: vip_ddns
              value: "false"
            - name: vip_leaderelection
              value: "true"
            - name: vip_leaseduration
              value: "5"
            - name: vip_renewdeadline
              value: "3"
            - name: vip_retryperiod
              value: "1"
            - name: address
              value: {{ kube_vip_address | quote }}
          securityContext:
            capabilities:
              add:
                - NET_ADMIN
                - NET_RAW
                - SYS_TIME
@@ -105,6 +105,11 @@
  register: grafana_loki_labels
  changed_when: false
  failed_when: false
  until: >-
    grafana_loki_labels.rc != 0 or
    '"data":[]' not in (grafana_loki_labels.stdout | replace(' ', ''))
  retries: 30
  delay: 10
  when: loki_enabled

- name: Fail when Loki is reachable but has zero indexed labels

@@ -0,0 +1,6 @@
---
rancher_images_to_prepull:
  - docker.io/rancher/rancher:v2.13.3
  - docker.io/rancher/rancher-webhook:v0.9.3
  - docker.io/rancher/system-upgrade-controller:v0.17.0
  - docker.io/rancher/shell:v0.6.2
@@ -0,0 +1,59 @@
---
- name: Check for runner-provided Rancher image archives
  stat:
    path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
  delegate_to: localhost
  become: false
  register: rancher_image_archive_stats
  loop: "{{ rancher_images_to_prepull }}"

- name: Ensure remote Rancher image archive directory exists
  file:
    path: /tmp/bootstrap-image-archives
    state: directory
    mode: "0755"

- name: Copy runner-provided Rancher image archives
  copy:
    src: "{{ item.stat.path }}"
    dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar"
    mode: "0644"
  loop: "{{ rancher_image_archive_stats.results }}"
  loop_control:
    label: "{{ item.item }}"
  when: item.stat.exists

- name: Import or pull Rancher images into containerd
  shell: |
    if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
      echo "already present"
      exit 0
    fi

    archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
    if [ -s "${archive}" ]; then
      for attempt in 1 2 3; do
        if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
          echo "imported image"
          exit 0
        fi

        sleep 10
      done
    fi

    for attempt in 1 2 3 4 5; do
      if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
        echo "pulled image"
        exit 0
      fi

      sleep 10
    done

    exit 1
  args:
    executable: /bin/bash
  register: rancher_image_pull
  loop: "{{ rancher_images_to_prepull }}"
  changed_when: "'imported image' in rancher_image_pull.stdout or 'pulled image' in rancher_image_pull.stdout"
@@ -9,22 +9,26 @@
    Authorization: "Bearer {{ tailscale_api_key }}"
  return_content: true
  register: ts_devices
  until: ts_devices.status == 200
  retries: 5
  delay: 10

- name: Find stale devices matching reserved hostnames
  set_fact:
    stale_devices: >-
      {{ ts_devices.json.devices | default([])
        | selectattr('hostname', 'defined')
        | selectattr('hostname', 'in', tailscale_reserved_hostnames)
        | rejectattr('online', 'defined')
        | list
        +
        ts_devices.json.devices | default([])
        | selectattr('hostname', 'defined')
        | selectattr('hostname', 'in', tailscale_reserved_hostnames)
        | selectattr('online', 'defined')
        | rejectattr('online', 'equalto', true)
        | list }}
      {{ (ts_devices.json.devices | default([])
        | selectattr('hostname', 'defined')
        | selectattr('hostname', 'in', tailscale_reserved_hostnames)
        | selectattr('connectedToControl', 'defined')
        | rejectattr('connectedToControl', 'equalto', true)
        | list
        +
        ts_devices.json.devices | default([])
        | selectattr('hostname', 'defined')
        | selectattr('hostname', 'in', tailscale_reserved_hostnames)
        | selectattr('online', 'defined')
        | rejectattr('online', 'equalto', true)
        | list) | unique(attribute='id') | list }}

- name: Delete stale devices
  uri:
@@ -33,6 +37,10 @@
  headers:
    Authorization: "Bearer {{ tailscale_api_key }}"
  status_code: 200
  register: ts_delete_device
  until: ts_delete_device.status == 200
  retries: 3
  delay: 5
  loop: "{{ stale_devices }}"
  loop_control:
    label: "{{ item.name }} ({{ item.id }})"

+107
-4
@@ -1,14 +1,26 @@
---
- name: Clean up stale Tailscale cluster node devices
  hosts: localhost
  connection: local
  vars:
    tailscale_reserved_hostnames: "{{ groups['cluster'] | default([]) | list }}"

  roles:
    - tailscale-cleanup

- name: Bootstrap Kubernetes cluster
  hosts: cluster
  become: true
  gather_facts: true
  gather_facts: false

  pre_tasks:
    - name: Wait for SSH
      wait_for_connection:
        delay: 10
        timeout: 300
        timeout: 600

    - name: Gather facts after SSH is reachable
      setup:

  roles:
    - common
@@ -57,12 +69,24 @@
  roles:
    - addon-secrets-bootstrap

- name: Deploy Hetzner CCM (required for workers with external cloud provider)
- name: Deploy kube-vip for API HA
  hosts: control_plane[0]
  become: true

  roles:
    - ccm-deploy
    - kube-vip-deploy

- name: Wait for Kubernetes API VIP readiness
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Wait for Kubernetes readyz through the VIP
      command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
      register: api_readyz
      until: api_readyz.rc == 0
      retries: 30
      delay: 10
      changed_when: false

- name: Setup secondary control planes
  hosts: control_plane[1:]
@@ -80,6 +104,64 @@
  roles:
    - k3s-server

- name: Export kube-vip image from primary control plane
  hosts: control_plane[0]
  become: true

  tasks:
    - name: Export kube-vip image for secondary control planes
      command: >-
        /usr/local/bin/ctr -n k8s.io images export
        /tmp/kube-vip-bootstrap.tar
        ghcr.io/kube-vip/kube-vip:v1.1.2
      changed_when: false

    - name: Fetch kube-vip image archive
      fetch:
        src: /tmp/kube-vip-bootstrap.tar
        dest: ../outputs/kube-vip-bootstrap.tar
        flat: true

- name: Seed kube-vip image on secondary control planes
  hosts: control_plane[1:]
  become: true

  tasks:
    - name: Copy kube-vip image archive
      copy:
        src: ../outputs/kube-vip-bootstrap.tar
        dest: /tmp/kube-vip-bootstrap.tar
        mode: "0644"

    - name: Import kube-vip image into containerd
      command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
      register: kube_vip_secondary_import
      until: kube_vip_secondary_import.rc == 0
      retries: 3
      delay: 10
      changed_when: false

- name: Wait for all control plane nodes to be Ready
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Wait for control plane node readiness
      command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
      register: control_plane_ready
      until: control_plane_ready.rc == 0
      retries: 20
      delay: 15
      changed_when: false
      loop: "{{ groups['control_plane'] }}"

    - name: Wait for Kubernetes readyz before worker joins
      command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
      register: api_readyz_before_workers
      until: api_readyz_before_workers.rc == 0
      retries: 30
      delay: 10
      changed_when: false

- name: Setup workers
  hosts: workers
  become: true
@@ -93,6 +175,21 @@
  roles:
    - k3s-agent

- name: Pre-pull bootstrap control-plane images
  hosts: control_plane[0]
  become: true

  roles:
    - bootstrap-image-prepull

- name: Pre-pull Rancher bootstrap images
  hosts: workers
  become: true

  roles:
    - role: rancher-image-prepull
      when: rancher_image_prepull_enabled | default(false) | bool

- name: Deploy observability stack
  hosts: control_plane[0]
  become: true
@@ -148,10 +245,16 @@
  hosts: localhost
  connection: local
  tasks:
    - name: Check whether kubeconfig was fetched
      stat:
        path: ../outputs/kubeconfig
      register: kubeconfig_file

    - name: Update kubeconfig server address
      command: |
        sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig
      changed_when: true
      when: kubeconfig_file.stat.exists

    - name: Display success message
      debug:

@@ -8,6 +8,10 @@ spec:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
@@ -19,6 +23,10 @@ spec:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
@@ -30,6 +38,10 @@ spec:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
@@ -41,3 +53,7 @@ spec:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule

@@ -1,36 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: hcloud-cloud-controller-manager
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: kube-system
  chart:
    spec:
      chart: hcloud-cloud-controller-manager
      version: 1.30.1
      sourceRef:
        kind: HelmRepository
        name: hcloud
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    selectorLabels:
      app: hcloud-cloud-controller-manager
    args:
      secure-port: "0"
    networking:
      enabled: true
    nodeSelector:
      kubernetes.io/hostname: k8s-cluster-cp-1
    additionalTolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: hcloud
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.hetzner.cloud
@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrepository-hcloud.yaml
  - helmrelease-hcloud-ccm.yaml
@@ -5,14 +5,14 @@ metadata:
  namespace: flux-system
spec:
  interval: 10m
  timeout: 15m
  targetNamespace: cert-manager
  chart:
    spec:
      chart: cert-manager
      version: "v1.17.2"
      chart: ./infrastructure/charts/cert-manager
      sourceRef:
        kind: HelmRepository
        name: jetstack
        kind: GitRepository
        name: platform
        namespace: flux-system
  install:
    createNamespace: true

@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: jetstack
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.jetstack.io
@@ -2,5 +2,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-cert-manager.yaml
  - helmrelease-cert-manager.yaml

@@ -1,36 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: hcloud-csi
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: kube-system
  chart:
    spec:
      chart: hcloud-csi
      version: 2.20.0
      sourceRef:
        kind: HelmRepository
        name: hcloud
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    controller:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
    hcloudVolumeDefaultLocation: nbg1
    storageClasses:
      - name: hcloud-volumes
        defaultStorageClass: true
        reclaimPolicy: Delete
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: hcloud
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.hetzner.cloud
+1
-2
@@ -1,5 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - backup-recurring.yaml
  - restore-from-b2.yaml
  - clustersecretstore-doppler-hetznerterra.yaml
@@ -6,14 +6,10 @@ metadata:
spec:
  interval: 10m
  targetNamespace: external-secrets
  chart:
    spec:
      chart: external-secrets
      version: 2.1.0
      sourceRef:
        kind: HelmRepository
        name: external-secrets
        namespace: flux-system
  chartRef:
    kind: OCIRepository
    name: external-secrets
    namespace: flux-system
  install:
    createNamespace: true
    remediation:
@@ -23,13 +19,25 @@ spec:
      retries: 3
  values:
    installCRDs: true
    image:
      repository: oci.external-secrets.io/external-secrets/external-secrets
      tag: v2.1.0
      pullPolicy: IfNotPresent
    nodeSelector:
      kubernetes.io/hostname: k8s-cluster-cp-1
    webhook:
      failurePolicy: Ignore
      image:
        repository: oci.external-secrets.io/external-secrets/external-secrets
        tag: v2.1.0
        pullPolicy: IfNotPresent
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
    certController:
      image:
        repository: oci.external-secrets.io/external-secrets/external-secrets
        tag: v2.1.0
        pullPolicy: IfNotPresent
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
    serviceMonitor:

@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: external-secrets
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.external-secrets.io
@@ -2,5 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-external-secrets.yaml
  - ocirepository-external-secrets.yaml
  - helmrelease-external-secrets.yaml

@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
  name: external-secrets
  namespace: flux-system
spec:
  interval: 10m
  url: oci://ghcr.io/external-secrets/charts/external-secrets
  ref:
    tag: 2.1.0
  layerSelector:
    mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
    operation: copy
@@ -1,15 +0,0 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-ccm
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/ccm
  wait: true
  timeout: 10m
  suspend: false
@@ -11,5 +11,5 @@ spec:
    name: platform
  path: ./infrastructure/addons/cert-manager
  wait: true
  timeout: 10m
  timeout: 20m
  suspend: false

@@ -1,17 +0,0 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-csi
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/csi
  dependsOn:
    - name: addon-ccm
  wait: true
  timeout: 10m
  suspend: false
@@ -0,0 +1,21 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-external-secrets-store
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/external-secrets-store
  dependsOn:
    - name: addon-external-secrets
  wait: false
  healthChecks:
    - apiVersion: external-secrets.io/v1
      kind: ClusterSecretStore
      name: doppler-hetznerterra
  timeout: 5m
  suspend: false
@@ -10,6 +10,19 @@ spec:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/external-secrets
  wait: true
  timeout: 5m
  wait: false
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: external-secrets
      namespace: flux-system
    - apiVersion: apps/v1
      kind: Deployment
      name: external-secrets-external-secrets
      namespace: external-secrets
    - apiVersion: apps/v1
      kind: Deployment
      name: external-secrets-external-secrets-webhook
      namespace: external-secrets
  timeout: 10m
  suspend: false

+7
-5
@@ -1,7 +1,7 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-backup
  name: addon-nfs-storage
  namespace: flux-system
spec:
  interval: 10m
@@ -9,10 +9,12 @@ spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-backup
  path: ./infrastructure/addons/nfs-storage
  wait: true
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: nfs-subdir-external-provisioner
      namespace: kube-system
  timeout: 10m
  suspend: false
  dependsOn:
    - name: addon-external-secrets
    - name: addon-rancher
@@ -0,0 +1,26 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability-secrets
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability-secrets
  dependsOn:
    - name: addon-external-secrets-store
  wait: false
  healthChecks:
    - apiVersion: external-secrets.io/v1
      kind: ExternalSecret
      name: grafana-admin
      namespace: observability
    - apiVersion: v1
      kind: Secret
      name: grafana-admin-credentials
      namespace: observability
  timeout: 5m
  suspend: false
@@ -11,9 +11,23 @@ spec:
    name: platform
  path: ./infrastructure/addons/observability
  dependsOn:
    - name: addon-external-secrets
    - name: addon-observability-secrets
    - name: addon-nfs-storage
    - name: addon-tailscale-operator
    - name: addon-tailscale-proxyclass
  wait: true
  timeout: 5m
  wait: false
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: kube-prometheus-stack
      namespace: flux-system
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: loki
      namespace: flux-system
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: promtail
      namespace: flux-system
  timeout: 30m
  suspend: false

@@ -1,16 +0,0 @@
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: addon-rancher-backup-config
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: platform
|
||||
path: ./infrastructure/addons/rancher-backup-config
|
||||
timeout: 5m
|
||||
suspend: false
|
||||
dependsOn:
|
||||
- name: addon-rancher-backup
|
||||
@@ -13,5 +13,5 @@ spec:
|
||||
dependsOn:
|
||||
- name: addon-rancher
|
||||
wait: true
|
||||
timeout: 5m
|
||||
timeout: 10m
|
||||
suspend: false
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: addon-rancher-secrets
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: platform
|
||||
path: ./infrastructure/addons/rancher-secrets
|
||||
dependsOn:
|
||||
- name: addon-external-secrets-store
|
||||
wait: false
|
||||
healthChecks:
|
||||
- apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
name: rancher-bootstrap-password
|
||||
namespace: flux-system
|
||||
- apiVersion: v1
|
||||
kind: Secret
|
||||
name: rancher-bootstrap-password
|
||||
namespace: flux-system
|
||||
- apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
name: rancher-bootstrap-password
|
||||
namespace: cattle-system
|
||||
- apiVersion: v1
|
||||
kind: Secret
|
||||
name: rancher-bootstrap-password
|
||||
namespace: cattle-system
|
||||
timeout: 5m
|
||||
suspend: false
|
||||
@@ -10,11 +10,32 @@ spec:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher
  wait: true
  timeout: 15m
  timeout: 30m
  suspend: false
  dependsOn:
    - name: addon-tailscale-operator
    - name: addon-tailscale-proxyclass
    - name: addon-external-secrets
    - name: addon-rancher-secrets
    - name: addon-cert-manager
  wait: false
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: rancher
      namespace: flux-system
    - apiVersion: apps/v1
      kind: Deployment
      name: cattle-system-rancher
      namespace: cattle-system
    - apiVersion: apps/v1
      kind: Deployment
      name: rancher-webhook
      namespace: cattle-system
    - apiVersion: cert-manager.io/v1
      kind: Issuer
      name: cattle-system-rancher
      namespace: cattle-system
    - apiVersion: cert-manager.io/v1
      kind: Certificate
      name: tls-rancher-ingress
      namespace: cattle-system
@@ -10,6 +10,6 @@ spec:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/tailscale-operator
  wait: true
  timeout: 5m
  wait: false
  timeout: 10m
  suspend: false

@@ -1,16 +1,16 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - kustomization-ccm.yaml
  - kustomization-csi.yaml
  - kustomization-nfs-storage.yaml
  - kustomization-external-secrets.yaml
  - kustomization-external-secrets-store.yaml
  - kustomization-cert-manager.yaml
  - kustomization-tailscale-operator.yaml
  - kustomization-tailscale-proxyclass.yaml
  - traefik
  - kustomization-observability-secrets.yaml
  - kustomization-observability.yaml
  - kustomization-observability-content.yaml
  - kustomization-rancher-secrets.yaml
  - kustomization-rancher.yaml
  - kustomization-rancher-config.yaml
  - kustomization-rancher-backup.yaml
  - kustomization-rancher-backup-config.yaml
@@ -0,0 +1,20 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nfs-subdir-external-provisioner-runner
rules:
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["persistentvolumes"]
    verbs: ["get", "list", "watch", "create", "delete"]
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "update", "patch"]

@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: run-nfs-subdir-external-provisioner
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nfs-subdir-external-provisioner-runner
subjects:
  - kind: ServiceAccount
    name: nfs-subdir-external-provisioner
    namespace: kube-system
@@ -0,0 +1,41 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nfs-subdir-external-provisioner
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nfs-subdir-external-provisioner
  template:
    metadata:
      labels:
        app: nfs-subdir-external-provisioner
    spec:
      serviceAccountName: nfs-subdir-external-provisioner
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
        - name: nfs-subdir-external-provisioner
          image: registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2
          imagePullPolicy: IfNotPresent
          env:
            - name: PROVISIONER_NAME
              value: flash-nfs
            - name: NFS_SERVER
              value: 10.27.27.239
            - name: NFS_PATH
              value: /TheFlash/k8s-nfs
          volumeMounts:
            - name: nfs-subdir-external-provisioner-root
              mountPath: /persistentvolumes
      volumes:
        - name: nfs-subdir-external-provisioner-root
          nfs:
            server: 10.27.27.239
            path: /TheFlash/k8s-nfs
@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - serviceaccount-nfs-subdir-external-provisioner.yaml
  - clusterrole-nfs-subdir-external-provisioner.yaml
  - clusterrolebinding-nfs-subdir-external-provisioner.yaml
  - role-nfs-subdir-external-provisioner.yaml
  - rolebinding-nfs-subdir-external-provisioner.yaml
  - storageclass-flash-nfs.yaml
  - deployment-nfs-subdir-external-provisioner.yaml

@@ -0,0 +1,9 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: leader-locking-nfs-subdir-external-provisioner
  namespace: kube-system
rules:
  - apiGroups: [""]
    resources: ["endpoints"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]

@@ -0,0 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: leader-locking-nfs-subdir-external-provisioner
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: leader-locking-nfs-subdir-external-provisioner
subjects:
  - kind: ServiceAccount
    name: nfs-subdir-external-provisioner
    namespace: kube-system

@@ -0,0 +1,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfs-subdir-external-provisioner
  namespace: kube-system

@@ -0,0 +1,12 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: flash-nfs
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
provisioner: flash-nfs
parameters:
  archiveOnDelete: "true"
reclaimPolicy: Delete
allowVolumeExpansion: true
volumeBindingMode: Immediate
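Editor's note: the StorageClass's `provisioner: flash-nfs` must match the `PROVISIONER_NAME` value set on the provisioner Deployment above, and the default-class annotation means claims that omit `storageClassName` land on this NFS share. A minimal sketch of a PersistentVolumeClaim against this class; the claim name, namespace, and size are illustrative and are not part of this changeset:

# Hypothetical example (not in this diff): a claim provisioned by flash-nfs.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: example-claim          # illustrative name
  namespace: default           # illustrative namespace
spec:
  accessModes:
    - ReadWriteMany            # NFS-backed volumes can be shared across nodes
  storageClassName: flash-nfs  # optional here, since flash-nfs is the default class
  resources:
    requests:
      storage: 1Gi             # illustrative size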
@@ -1,5 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrepository-hcloud.yaml
  - helmrelease-hcloud-csi.yaml
  - namespace.yaml
  - grafana-admin-externalsecret.yaml

@@ -5,14 +5,14 @@ metadata:
  namespace: flux-system
spec:
  interval: 10m
  timeout: 15m
  targetNamespace: observability
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 68.4.4
      chart: ./infrastructure/charts/kube-prometheus-stack
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        kind: GitRepository
        name: platform
        namespace: flux-system
  install:
    createNamespace: true

@@ -21,6 +21,7 @@ spec:
  upgrade:
    remediation:
      retries: 3
      strategy: uninstall
  values:
    grafana:
      enabled: true
@@ -6,14 +6,10 @@ metadata:
spec:
  interval: 10m
  targetNamespace: observability
  chart:
    spec:
      chart: loki
      version: 6.10.0
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  chartRef:
    kind: OCIRepository
    name: loki
    namespace: flux-system
  install:
    createNamespace: true
    remediation:

@@ -50,7 +46,7 @@ spec:
      replicas: 1
      persistence:
        size: 10Gi
        storageClass: local-path
        storageClass: flash-nfs
    resources:
      requests:
        cpu: 100m

@@ -87,11 +83,11 @@ spec:
    test:
      enabled: false
    chunksCache:
      enabled: true
      allocatedMemory: 128
      enabled: false
    resultsCache:
      enabled: true
      allocatedMemory: 128
      enabled: false
    lokiCanary:
      enabled: false
    monitoring:
      selfMonitoring:
        enabled: false
@@ -5,15 +5,12 @@ metadata:
  namespace: flux-system
spec:
  interval: 10m
  timeout: 20m
  targetNamespace: observability
  chart:
    spec:
      chart: promtail
      version: 6.16.6
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  chartRef:
    kind: OCIRepository
    name: promtail
    namespace: flux-system
  install:
    createNamespace: true
    remediation:

@@ -22,6 +19,8 @@ spec:
    remediation:
      retries: 3
  values:
    image:
      pullPolicy: IfNotPresent
    config:
      clients:
        - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push
        - url: http://observability-loki.observability.svc.cluster.local:3100/loki/api/v1/push
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: grafana
  namespace: flux-system
spec:
  interval: 1h
  url: https://grafana.github.io/helm-charts

@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: prometheus-community
  namespace: flux-system
spec:
  interval: 1h
  url: https://prometheus-community.github.io/helm-charts
@@ -1,10 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - grafana-admin-externalsecret.yaml
  - helmrepository-prometheus-community.yaml
  - helmrepository-grafana.yaml
  - ocirepository-loki.yaml
  - ocirepository-promtail.yaml
  - helmrelease-kube-prometheus-stack.yaml
  - helmrelease-loki.yaml
  - helmrelease-promtail.yaml

@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
  name: loki
  namespace: flux-system
spec:
  interval: 10m
  url: oci://ghcr.io/grafana/helm-charts/loki
  ref:
    tag: 6.46.0
  layerSelector:
    mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
    operation: copy

@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
  name: promtail
  namespace: flux-system
spec:
  interval: 10m
  url: oci://ghcr.io/grafana/helm-charts/promtail
  ref:
    tag: 6.16.6
  layerSelector:
    mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
    operation: copy
@@ -1,17 +0,0 @@
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
  name: rancher-b2-recurring
  namespace: cattle-resources-system
spec:
  resourceSetName: rancher-resource-set-full
  storageLocation:
    s3:
      credentialSecretName: rancher-b2-creds
      credentialSecretNamespace: cattle-resources-system
      bucketName: HetznerTerra
      folder: rancher-backups
      endpoint: s3.us-east-005.backblazeb2.com
      region: us-east-005
  schedule: "0 3 * * *"
  retentionCount: 7

@@ -1,19 +0,0 @@
# Uncomment and set backupFilename to restore from a specific backup on rebuild.
# Find the latest backup filename in B2: rancher-backups/ folder.
# After restore succeeds, Rancher will have all users/settings from the backup.
#
# apiVersion: resources.cattle.io/v1
# kind: Restore
# metadata:
#   name: restore-from-b2
#   namespace: cattle-resources-system
# spec:
#   backupFilename: rancher-b2-manual-test-0a416444-2c8a-4d34-8a07-d9e406750374-2026-03-30T00-08-02Z.tar.gz
#   storageLocation:
#     s3:
#       credentialSecretName: rancher-b2-creds
#       credentialSecretNamespace: cattle-resources-system
#       bucketName: HetznerTerra
#       folder: rancher-backups
#       endpoint: s3.us-east-005.backblazeb2.com
#       region: us-east-005

@@ -1,25 +0,0 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: rancher-b2-creds
  namespace: cattle-resources-system
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: doppler-hetznerterra
    kind: ClusterSecretStore
  target:
    name: rancher-b2-creds
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        accessKey: "{{ .B2_ACCOUNT_ID }}"
        secretKey: "{{ .B2_APPLICATION_KEY }}"
  data:
    - secretKey: B2_ACCOUNT_ID
      remoteRef:
        key: B2_ACCOUNT_ID
    - secretKey: B2_APPLICATION_KEY
      remoteRef:
        key: B2_APPLICATION_KEY
@@ -1,23 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup-crd
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cattle-resources-system
  chart:
    spec:
      chart: rancher-backup-crd
      version: "106.0.2+up8.1.0"
      sourceRef:
        kind: HelmRepository
        name: rancher-charts
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3

@@ -1,42 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cattle-resources-system
  dependsOn:
    - name: rancher-backup-crd
  chart:
    spec:
      chart: rancher-backup
      version: "106.0.2+up8.1.0"
      sourceRef:
        kind: HelmRepository
        name: rancher-charts
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    image:
      repository: rancher/backup-restore-operator
    kubectl:
      image:
        repository: rancher/kubectl
        tag: "v1.34.0"
  postRenderers:
    - kustomize:
        patches:
          - target:
              kind: Job
              name: rancher-backup-patch-sa
            patch: |
              - op: replace
                path: /spec/template/spec/containers/0/image
                value: rancher/kubectl:v1.34.0

@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: rancher-charts
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.rancher.io

@@ -1,8 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-rancher-backup.yaml
  - helmrelease-rancher-backup-crd.yaml
  - helmrelease-rancher-backup.yaml
  - b2-credentials-externalsecret.yaml

@@ -1,4 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cattle-resources-system
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - rancher-bootstrap-password-flux-externalsecret.yaml
  - rancher-bootstrap-password-externalsecret.yaml

@@ -5,14 +5,14 @@ metadata:
  namespace: flux-system
spec:
  interval: 10m
  timeout: 15m
  targetNamespace: cattle-system
  chart:
    spec:
      chart: rancher
      version: "2.13.3"
      chart: ./infrastructure/charts/rancher
      sourceRef:
        kind: HelmRepository
        name: rancher-stable
        kind: GitRepository
        name: platform
        namespace: flux-system
  install:
    createNamespace: true

@@ -23,10 +23,18 @@ spec:
      retries: 3
  values:
    hostname: rancher.silverside-gopher.ts.net
    systemDefaultRegistry: registry.rancher.com
    replicas: 1
    extraEnv:
      - name: CATTLE_PROMETHEUS_METRICS
        value: "true"
      - name: CATTLE_FEATURES
        value: "managed-system-upgrade-controller=false"
    webhook:
      image:
        repository: rancher/rancher-webhook
        tag: v0.9.3
        imagePullPolicy: IfNotPresent
    resources:
      requests:
        cpu: 500m

@@ -34,6 +42,10 @@ spec:
      limits:
        cpu: 1000m
        memory: 1Gi
    startupProbe:
      timeoutSeconds: 5
      periodSeconds: 10
      failureThreshold: 60
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: rancher-stable
  namespace: flux-system
spec:
  interval: 1h
  url: https://releases.rancher.com/server-charts/stable

@@ -1,9 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-rancher.yaml
  - helmrelease-rancher.yaml
  - rancher-bootstrap-password-flux-externalsecret.yaml
  - rancher-bootstrap-password-externalsecret.yaml
  - rancher-tailscale-service.yaml
@@ -8,11 +8,10 @@ spec:
  targetNamespace: tailscale-system
  chart:
    spec:
      chart: tailscale-operator
      version: 1.96.5
      chart: ./infrastructure/charts/tailscale-operator
      sourceRef:
        kind: HelmRepository
        name: tailscale
        kind: GitRepository
        name: platform
        namespace: flux-system
  install:
    createNamespace: true

@@ -28,6 +27,10 @@ spec:
    operatorConfig:
      defaultTags:
        - tag:k8s
      image:
        repository: ghcr.io/tailscale/k8s-operator
        tag: v1.96.5
        pullPolicy: IfNotPresent
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:

@@ -37,3 +40,6 @@ spec:
    proxyConfig:
      defaultTags: tag:k8s
      defaultProxyClass: infra-stable
      image:
        repository: ghcr.io/tailscale/tailscale
        tag: v1.96.5

@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: tailscale
  namespace: flux-system
spec:
  interval: 1h
  url: https://pkgs.tailscale.com/helmcharts

@@ -2,5 +2,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-tailscale.yaml
  - helmrelease-tailscale-operator.yaml
@@ -8,11 +8,10 @@ spec:
  targetNamespace: kube-system
  chart:
    spec:
      chart: traefik
      version: "39.0.0"
      chart: ./infrastructure/charts/traefik
      sourceRef:
        kind: HelmRepository
        name: traefik
        kind: GitRepository
        name: platform
        namespace: flux-system
  install:
    createNamespace: true

@@ -1,9 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: traefik
  namespace: flux-system
spec:
  interval: 10m
  url: https://traefik.github.io/charts
  provider: generic

@@ -1,5 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrepository-traefik.yaml
  - helmrelease-traefik.yaml
@@ -0,0 +1,26 @@
annotations:
  artifacthub.io/category: security
  artifacthub.io/license: Apache-2.0
  artifacthub.io/prerelease: "false"
  artifacthub.io/signKey: |
    fingerprint: 1020CF3C033D4F35BAE1C19E1226061C665DF13E
    url: https://cert-manager.io/public-keys/cert-manager-keyring-2021-09-20-1020CF3C033D4F35BAE1C19E1226061C665DF13E.gpg
apiVersion: v2
appVersion: v1.17.2
description: A Helm chart for cert-manager
home: https://cert-manager.io
icon: https://raw.githubusercontent.com/cert-manager/community/4d35a69437d21b76322157e6284be4cd64e6d2b7/logo/logo-small.png
keywords:
  - cert-manager
  - kube-lego
  - letsencrypt
  - tls
kubeVersion: '>= 1.22.0-0'
maintainers:
  - email: cert-manager-maintainers@googlegroups.com
    name: cert-manager-maintainers
    url: https://cert-manager.io
name: cert-manager
sources:
  - https://github.com/cert-manager/cert-manager
version: v1.17.2
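Editor's note: this vendored Chart.yaml completes the pattern applied throughout the changeset: upstream HelmRepository sources are deleted and each HelmRelease pulls its chart from a directory in the platform GitRepository. The cert-manager HelmRelease itself is not shown in this diff; assuming it follows the same shape as the rancher, traefik, and tailscale-operator releases above, it would look roughly like the sketch below. Every field here is inferred from that pattern, not taken from the diff.

apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: cert-manager            # assumed name, matching the addon-cert-manager dependency seen earlier
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cert-manager # assumed target namespace
  chart:
    spec:
      chart: ./infrastructure/charts/cert-manager
      sourceRef:
        kind: GitRepository
        name: platform
        namespace: flux-system
  install:
    createNamespace: true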
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff