Compare commits
320 Commits
v0.1.0-stable...main
| SHA1 | Author | Date | |
|---|---|---|---|
| e9327b0c61 | |||
| cf49f8bf03 | |||
| d57e8c8fe8 | |||
| 93a2a42917 | |||
| 5cf68771dd | |||
| 6d6e3e8371 | |||
| 353a408dac | |||
| b3612083ad | |||
| 8c0dbd997d | |||
| 3a975a323c | |||
| d126de4dc4 | |||
| a33a993867 | |||
| f52e657f9f | |||
| f49b08f50c | |||
| 327bb860b7 | |||
| fd5451a5ef | |||
| 7333cb2780 | |||
| feecf97cd5 | |||
| b5bcec2663 | |||
| 0ad56405ee | |||
| d050e8962a | |||
| d925eeac3f | |||
| 2bde45e106 | |||
| 50752ca4b0 | |||
| a2ed9555c0 | |||
| 14462dd870 | |||
| 0625eee297 | |||
| 2dc4ab6329 | |||
| bbec0dfff4 | |||
| 6de826e030 | |||
| bdba2b7af2 | |||
| 499a3462e7 | |||
| daf6ccd0e4 | |||
| a6a630000a | |||
| ff9e58d44f | |||
| 8b94e4dd06 | |||
| 547a29e000 | |||
| 760f0482d4 | |||
| 440e268e4f | |||
| 24851f5a9b | |||
| ded8efe7fb | |||
| c10646d228 | |||
| 50d97209e6 | |||
| 46b2ff7d19 | |||
| a4f1d179e9 | |||
| 9879de5a86 | |||
| 195e9bce25 | |||
| 4796606432 | |||
| b1eab6a0fa | |||
| f3c96b65d2 | |||
| c7a375758f | |||
| d0be48b65c | |||
| 40647318b4 | |||
| cdb26904d2 | |||
| 3c06e046c2 | |||
| 17f1815e7f | |||
| 66e86e55ea | |||
| 43df412243 | |||
| 383ef9e9ac | |||
| 18abc5073b | |||
| f8da2594ca | |||
| e0359f0097 | |||
| 003333a061 | |||
| a6071c504b | |||
| 08123457f1 | |||
| 757d88ed52 | |||
| 15defc686f | |||
| abb7578328 | |||
| bc87a7ca43 | |||
| 045880bdd6 | |||
| bfcf57bcc5 | |||
| 7e3ebec95b | |||
| 0c31c3b1d5 | |||
| 5523feb563 | |||
| cafa2fa0b3 | |||
| a7fd4c0b97 | |||
| e56a3a6c38 | |||
| 7b2eca07ab | |||
| 347ca041ba | |||
| 3f52bad854 | |||
| c89c31adea | |||
| 68b293efe4 | |||
| 1f465cc0c1 | |||
| 6e22bd26b3 | |||
| 869880c152 | |||
| 31e95eb227 | |||
| 12675417bd | |||
| 8e081ddfda | |||
| 4b7517c9c5 | |||
| f9bc53723f | |||
| ee6417c18e | |||
| 1156dc0203 | |||
| 4151027e01 | |||
| 9269e9df1b | |||
| d9374bc209 | |||
| c570a476b5 | |||
| a7f11ccf94 | |||
| a7d540ca65 | |||
| 098bd98876 | |||
| 55d7b8201e | |||
| 9c0523e880 | |||
| 8372d562ad | |||
| 1bb11dfe3a | |||
| 624cd5aab6 | |||
| 71bdc6a709 | |||
| 714f20417b | |||
| c32bec34bc | |||
| 6519a7673d | |||
| d1c31cdb91 | |||
| b3e88712bd | |||
| 06366ee5e6 | |||
| 9a2d213114 | |||
| 9482a0f551 | |||
| 5c53b8e06e | |||
| b1dae28aa5 | |||
| 6c6b9d20ca | |||
| c3a2f25c94 | |||
| 7385c2263e | |||
| 60f466ab98 | |||
| b20356e9fe | |||
| 2ba6b6a896 | |||
| 9126de1423 | |||
| 4532b9ed74 | |||
| 68dbd2e5b7 | |||
| ceefcc3b29 | |||
| 0d339b3163 | |||
| 30ccf13c82 | |||
| 75e3604f30 | |||
| e4235a6e58 | |||
| ea2d534171 | |||
| a1b9fe6aa6 | |||
| 33765657ec | |||
| b8f64fa952 | |||
| 569d741751 | |||
| 89e53d9ec9 | |||
| 5a2551f40a | |||
| 8c7b62c024 | |||
| a1f07f863a | |||
| 2c3a49c2e0 | |||
| a7ce3dcc1a | |||
| 0ab9418458 | |||
| c251672618 | |||
| 89364e8f37 | |||
| 20d7a6f777 | |||
| 22ce5fd6f4 | |||
| afb1782d38 | |||
| 48870433bf | |||
| f2c506b350 | |||
| efdf13976a | |||
| 5269884408 | |||
| 6e5b0518be | |||
| 905d069e91 | |||
| 25ba4b7115 | |||
| 6a593fd559 | |||
| 936f54a1b5 | |||
| c9df11e65f | |||
| a3c238fda9 | |||
| a15fa50302 | |||
| 0f4f0b09fb | |||
| 4c002a870c | |||
| 43d11ac7e6 | |||
| 8c5edcf0a1 | |||
| a81da0d178 | |||
| 2a72527c79 | |||
| 7cb3b84ecb | |||
| d4930235fa | |||
| ee8dc4b451 | |||
| 144d40e7ac | |||
| cc14e32572 | |||
| a207a5a7fd | |||
| 4e1772c175 | |||
| ff70b12084 | |||
| a3963c56e6 | |||
| 612435c42c | |||
| ac42f671a2 | |||
| dbe7ec0468 | |||
| 816ac8b3c0 | |||
| 6f7998639f | |||
| 7a14f89ad1 | |||
| 786901c5d7 | |||
| 46f3d1130b | |||
| 2fe5a626d4 | |||
| 2ef68c8087 | |||
| e2cae18f5f | |||
| e0c1e41ee9 | |||
| 63533de901 | |||
| 1b39710f63 | |||
| 8c034323dc | |||
| 5fa2b411ee | |||
| 3ea28e525f | |||
| 4b95ba113d | |||
| 13627bf81f | |||
| ef3fb2489a | |||
| 7097495d72 | |||
| 9d601dc77c | |||
| f36445d99a | |||
| 89c2c99963 | |||
| 4a35cfb549 | |||
| 3d50bfc534 | |||
| ab2f287bfb | |||
| dcb2675b67 | |||
| b40bec7e0e | |||
| efe0c0cfd5 | |||
| c61d9f9c1d | |||
| 60ceac4624 | |||
| 47b384a337 | |||
| ecf17113fb | |||
| 4ffbcfa312 | |||
| 8745bcda47 | |||
| e47ec2a3e7 | |||
| 45c899d2bd | |||
| 0e52d8f159 | |||
| 4726db2b5b | |||
| 90d105e5ea | |||
| 952a80a742 | |||
| 4965017b86 | |||
| b2b9c38b91 | |||
| ff31cb4e74 | |||
| 8b4a445b37 | |||
| e447795395 | |||
| 31b82c9371 | |||
| cadfedacf1 | |||
| 561cd67b0c | |||
| 4eebbca648 | |||
| 7b5d794dfc | |||
| 8643bbfc12 | |||
| 84f446c2e6 | |||
| d446e86ece | |||
| 90c7f565e0 | |||
| 989848fa89 | |||
| 56e5807474 | |||
| df0511148c | |||
| 894e6275b1 | |||
| a01cf435d4 | |||
| 84f77c4a68 | |||
| 2e4196688c | |||
| 8d1f9f4944 | |||
| d4fd43e2f5 | |||
| 48a80c362c | |||
| fcf7f139ff | |||
| 7139ae322d | |||
| 528a8dc210 | |||
| 349f75729a | |||
| 522626a52b | |||
| 5bd4c41c2d | |||
| 3e41f71b1b | |||
| 9d2f30de32 | |||
| 08a3031276 | |||
| e3ce91db62 | |||
| bed8e4afc8 | |||
| 2d4de6cff8 | |||
| 4a83d981c8 | |||
| d188a51ef6 | |||
| 646ef16258 | |||
| 6f2e056b98 | |||
| e10a70475f | |||
| f95e0051a5 | |||
| 7c15ac5846 | |||
| 4c104f74e8 | |||
| be04602bfb | |||
| 06c1356f1e | |||
| 86fb5d5b90 | |||
| 8b403cd1d6 | |||
| 480a079dc8 | |||
| ff8e32daf5 | |||
| eb1ad0bea7 | |||
| 9ff9d1e633 | |||
| 6177b581e4 | |||
| b1e21c4a4b | |||
| 2f166ed9e7 | |||
| 1c39274df7 | |||
| 28eaa36ec4 | |||
| 02fa71c0aa | |||
| 2bbf05cdca | |||
| 213c1fb4e4 | |||
| 414ac73c25 | |||
| 542d7a6be5 | |||
| 210b617cc9 | |||
| 3686249e31 | |||
| f56d1447c1 | |||
| 63247b79a6 | |||
| f6e159406a | |||
| 0ae1c9395c | |||
| 272c5ddc6e | |||
| eb6bf3862a | |||
| 5a3f7550fe | |||
| a0ed6523ec | |||
| 4f61a840c7 | |||
| d876430703 | |||
| 56b6216257 | |||
| 91fe2e658c | |||
| 13cec1aa28 | |||
| bc133e65d3 | |||
| df4fdb5496 | |||
| cec7c42efb | |||
| ee692620b5 | |||
| a6d327fa1f | |||
| fe6cb39eaf | |||
| feaefd28a1 | |||
| 80ab59e22d | |||
| 6c0282e9d5 | |||
| 45aa616741 | |||
| b595c1738a | |||
| 1c4dfd7fae | |||
| 6b9fc1f6b8 | |||
| 2b5cad9d15 | |||
| 71a1495fbc | |||
| fe3814e0e3 | |||
| 5ab3c7a0ac | |||
| 9bc708ea4b | |||
| c0a4275f15 | |||
| 3dcf71a84f | |||
| 124fe94d0e | |||
| 2d3f63424a | |||
| 2a583d1bba | |||
| 27711e0661 | |||
| 10ee303995 | |||
| 558f34e2b1 | |||
| 58fabf23f8 | |||
| b30977a158 |
@@ -0,0 +1,88 @@

```yaml
name: Deploy Grafana Content

on:
  push:
    branches:
      - main
    paths:
      - "ansible/dashboards.yml"
      - "ansible/roles/observability-content/**"
  workflow_dispatch:

concurrency:
  group: prod-cluster
  cancel-in-progress: false

env:
  TF_VERSION: "1.14.9"
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"

jobs:
  dashboards:
    name: Grafana Content
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
          terraform_wrapper: false

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -lockfile=readonly \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install ansible==8.7.0 kubernetes==26.1.0 jinja2==3.1.5 pyyaml==6.0.2

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py

      - name: Apply dashboards and datasources
        working-directory: ansible
        run: |
          ansible-playbook dashboards.yml \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Verify Grafana content resources
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_datasource=1"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_dashboard=1"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
```
+865 −97 · File diff suppressed because it is too large
```diff
@@ -8,19 +8,26 @@ on:
         required: true
         default: ''
 
 concurrency:
   group: prod-cluster
   cancel-in-progress: false
 
 env:
-  TF_VERSION: "1.7.0"
-  TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
+  TF_VERSION: "1.14.9"
+  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
+  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
+  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
+  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
+  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
+  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
+  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
+  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
+  TF_VAR_proxmox_insecure: "true"
 
 jobs:
   destroy:
     name: Destroy Cluster
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     if: github.event.inputs.confirm == 'destroy'
     environment: destroy
     steps:
@@ -31,17 +38,7 @@ jobs:
         uses: hashicorp/setup-terraform@v3
         with:
           terraform_version: ${{ env.TF_VERSION }}
-
-      - name: Terraform Init
-        working-directory: terraform
-        run: |
-          terraform init \
-            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-            -backend-config="region=auto" \
-            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-            -backend-config="skip_requesting_account_id=true"
+          terraform_wrapper: false
 
       - name: Setup SSH Keys
         run: |
@@ -51,11 +48,66 @@ jobs:
           echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
           chmod 644 ~/.ssh/id_ed25519.pub
 
-      - name: Terraform Destroy
+      - name: Terraform Init
         working-directory: terraform
         run: |
-          terraform destroy \
-            -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-            -auto-approve
+          terraform init \
+            -lockfile=readonly \
+            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
+            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
+            -backend-config="region=auto" \
+            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
+            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
+            -backend-config="skip_requesting_account_id=true"
+
+      - name: Save Proxmox target list
+        run: |
+          mkdir -p outputs
+          if ! terraform -chdir=terraform output -json proxmox_target_vms > outputs/proxmox_target_vms.json; then
+            terraform -chdir=terraform plan \
+              -refresh=false \
+              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
+              -out=cleanup.tfplan \
+              -no-color || true
+            printf '[]' > outputs/proxmox_target_vms.json
+          fi
+
+      - name: Terraform Destroy
+        id: destroy
+        working-directory: terraform
+        run: |
+          set +e
+          for attempt in 1 2 3; do
+            echo "Terraform destroy attempt ${attempt}/3"
+            terraform destroy \
+              -parallelism=2 \
+              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
+              -auto-approve
+            rc=$?
+            if [ "$rc" -eq 0 ]; then
+              exit 0
+            fi
+            if [ "$attempt" -lt 3 ]; then
+              echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
+              sleep 30
+              terraform refresh \
+                -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+                -var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
+            fi
+          done
+          exit "$rc"
+
+      - name: Verify Proxmox target VMs removed
+        if: success()
+        run: |
+          python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --targets-file outputs/proxmox_target_vms.json
+          if [ -f terraform/cleanup.tfplan ]; then
+            python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --terraform-dir terraform --plan cleanup.tfplan
+          fi
+
+      - name: Terraform state diagnostics
+        if: failure() && steps.destroy.outcome == 'failure'
+        run: |
+          terraform -chdir=terraform state list || true
```
```diff
@@ -3,7 +3,6 @@
 *.tfstate.*
 *.tfstate.backup
 .terraform/
-.terraform.lock.hcl
 terraform.tfvars
 crash.log
 override.tf
```
@@ -0,0 +1,57 @@

# AGENTS.md

Compact repo guidance for OpenCode sessions. Trust executable sources over docs when they conflict.

## Read First

- Highest-value sources: `.gitea/workflows/deploy.yml`, `.gitea/workflows/destroy.yml`, `terraform/main.tf`, `terraform/variables.tf`, `terraform/servers.tf`, `ansible/site.yml`, `ansible/inventory.tmpl`, `clusters/prod/flux-system/`, `infrastructure/addons/kustomization.yaml`.
- `STABLE_BASELINE.md` still contains stale Rancher backup/restore references; current workflows and addon manifests do not deploy or restore `rancher-backup`.

## Baseline

- Proxmox HA K3s cluster: 3 control planes, 5 workers, VMIDs `200-202` and `210-214`, node `flex`, template VMID `9000`, datastore `Flash`.
- API HA is kube-vip at `10.27.27.40`; control planes are `10.27.27.30-32`, workers are `10.27.27.41-45`.
- SSH user is `ubuntu`; Ansible derives the flannel iface from `ansible_default_ipv4.interface` with `eth0` fallback, so do not hard-code `ens18`.
- Storage is raw-manifest `nfs-subdir-external-provisioner` using `10.27.27.239:/TheFlash/k8s-nfs` and default StorageClass `flash-nfs`.
- Tailscale is the private access path. Rancher, Grafana, and Prometheus are exposed only through Tailscale services.
- `apps` is intentionally suspended in `clusters/prod/flux-system/kustomization-apps.yaml`.

## Commands

- Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars`.
- Ansible setup: `ansible-galaxy collection install -r ansible/requirements.yml`, then from `ansible/` run `python3 generate_inventory.py` and `ansible-playbook site.yml --syntax-check`.
- Flux/Kustomize checks: `kubectl kustomize infrastructure/addons/<addon>`, `kubectl kustomize infrastructure/addons`, `kubectl kustomize clusters/prod/flux-system`.
- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-ip>`; use this if local `kubectl` falls back to `localhost:8080` after rebuilds.
- Tailnet smoke check from cp1: `ssh ubuntu@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh`.
- Fast Grafana content iteration uses `.gitea/workflows/dashboards.yml` and `ansible/dashboards.yml`, not a full cluster rebuild.

## Deploy Flow

- Pushes to `main` run Gitea CI: Terraform fmt/init/validate/plan/apply, Proxmox cleanup/retry, Ansible bootstrap, Flux bootstrap, addon gates, Rancher gate, observability image seeding, health checks, tailnet smoke checks.
- Deploy and destroy workflows share `concurrency.group: prod-cluster`; destroy only requires workflow input `confirm: destroy` and has no backup gate.
- Keep `set -euo pipefail` in workflow shell blocks.
- Terraform retry cleanup has hard-coded target VMIDs/names in `.gitea/workflows/deploy.yml`; update it when changing node counts, names, or VMIDs.
- Fresh VMs have unreliable registry/chart egress, so critical images are prepared by `skopeo` on the runner and imported with `k3s ctr`; update the workflow archive lists when adding bootstrap-time images.
- CI applies `clusters/prod/flux-system/gotk-components.yaml` directly and then patches Flux controller deployments inline; changes only in `gotk-controller-cp1-patches.yaml` do not affect CI bootstrap.

## GitOps Addons

- Vendored charts are intentional: `infrastructure/charts/{cert-manager,traefik,kube-prometheus-stack,tailscale-operator,rancher}`. Do not restore remote `HelmRepository` objects unless cluster-side chart fetch reliability is intentionally changed.
- External Secrets and Loki/Promtail use Flux `OCIRepository`; Rancher, Tailscale, cert-manager, Traefik, and kube-prometheus-stack use `GitRepository` chart paths.
- Use fully qualified `helmchart.source.toolkit.fluxcd.io/...` in scripts; K3s also has `helmcharts.helm.cattle.io`, so `helmchart/...` can target the wrong resource (see the example after this list).
- `doppler-bootstrap` only creates the `external-secrets` namespace and Doppler token secret. The deploy workflow creates `ClusterSecretStore/doppler-hetznerterra` after ESO CRDs and webhook endpoints exist.
- The checked-in `infrastructure/addons/external-secrets/clustersecretstore-doppler-hetznerterra.yaml` is not included by that addon kustomization; do not assume Flux applies it.
- Keep Kubernetes manifests one object per file with kebab-case filenames.
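As a concrete illustration of the fully-qualified name rule above (a sketch; object names on the live cluster will differ):

```bash
# Unambiguous: lists only Flux HelmChart objects (group source.toolkit.fluxcd.io).
kubectl get helmcharts.source.toolkit.fluxcd.io -A

# Ambiguous on K3s: the short form may resolve to helmcharts.helm.cattle.io instead.
kubectl get helmchart -A
```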
## Gotchas

- Rancher chart `2.13.3` requires Kubernetes `<1.35.0-0`; K3s `latest` can break Rancher. Role defaults pin `v1.34.6+k3s1`; do not reintroduce a generated-inventory `k3s_version=latest` override.
- The repo no longer uses a cloud controller manager. Any `providerID`, Hetzner CCM/CSI, or Hetzner firewall/load-balancer logic is stale.
- Tailscale cleanup must only remove stale offline reserved hostnames before live service proxies exist; do not delete active `rancher`, `grafana`, `prometheus`, or `flux` devices.
- Proxmox endpoint should be the base URL, for example `https://100.105.0.115:8006/`; provider/workflow code strips `/api2/json` when needed.
- Current private URLs: Rancher `https://rancher.silverside-gopher.ts.net/`, Grafana `http://grafana.silverside-gopher.ts.net/`, Prometheus `http://prometheus.silverside-gopher.ts.net:9090/`.

## Secrets

- Runtime secrets are Doppler + External Secrets; Terraform/bootstrap/CI secrets stay in Gitea Actions secrets.
- Never commit secrets, kubeconfigs, private keys, `terraform.tfvars`, or generated `outputs/` artifacts.
@@ -0,0 +1,287 @@

# App Repo Deployment Guide

This guide explains the recommended way to deploy an application to this cluster.

## Recommended Model

Use two repos:

- `HetznerTerra` (this repo): cluster, addons, shared infrastructure, Flux wiring
- `your-app-repo`: application source, Dockerfile, CI, Kubernetes manifests or Helm chart

Why:

- cluster lifecycle stays separate from app code
- app CI can build and tag images independently
- this repo remains the source of truth for what the cluster is allowed to deploy

## Current Cluster Assumptions

- Flux is already installed and reconciles this repo from `main`
- `clusters/prod/flux-system/kustomization-apps.yaml` points at `./apps`
- `apps` is suspended by default
- private access is through Tailscale
- runtime secrets should come from Doppler via External Secrets

## Deployment Options

### Option A: Separate app repo

Recommended for most real applications.

Flow:

1. App repo builds and pushes an image.
2. This repo defines a `GitRepository` pointing at the app repo.
3. This repo defines a `Kustomization` pointing at a path in the app repo.
4. Flux pulls the app repo and applies the manifests.

### Option B: In-repo app manifests

Only use this when the application is tiny or tightly coupled to the platform.

Flow:

1. Put Kubernetes manifests directly under `apps/` in this repo.
2. Unsuspend the top-level `apps` Kustomization.

This is simpler, but mixes platform and app changes together.

## App Repo Structure

Suggested layout:

```text
your-app-repo/
├── src/
├── Dockerfile
├── .gitea/workflows/
└── deploy/
    ├── base/
    │   ├── namespace.yaml
    │   ├── deployment.yaml
    │   ├── service.yaml
    │   ├── externalsecret.yaml
    │   └── kustomization.yaml
    └── prod/
        ├── kustomization.yaml
        └── patch-*.yaml
```

If you prefer Helm, replace `deploy/base` and `deploy/prod` with a chart path and point Flux at that instead.
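For orientation, a minimal `deploy/prod/kustomization.yaml` for the layout above could look like the sketch below; the patch filename, image name, and tag are placeholders, not values taken from this repo.

```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../base
patches:
  - path: patch-deployment.yaml   # hypothetical prod-only overrides (replicas, resources, env)
images:
  - name: my-app        # must match the image name used in base/deployment.yaml
    newTag: "1.0.0"     # app CI bumps this tag on each release
```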
## What the App Repo Should Own

- application source code
- image build pipeline
- image tag strategy
- Deployment / Service / Ingress or Tailscale-facing Service manifests
- app-specific `ExternalSecret` manifests
- app-specific namespace

## What This Repo Should Own

- cluster-level permission to deploy the app
- the `GitRepository` and top-level `Kustomization` that attach the app repo to the cluster
- whether the `apps` layer is suspended or active

## Recommended First App Integration

In this repo, add Flux objects under `apps/` that point to the app repo.

Example files to add:

- `apps/gitrepository-my-app.yaml`
- `apps/kustomization-my-app.yaml`
- update `apps/kustomization.yaml`

Example `apps/gitrepository-my-app.yaml`:

```yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: my-app
  namespace: flux-system
spec:
  interval: 1m
  ref:
    branch: main
  secretRef:
    name: flux-system
  url: ssh://git@<your-git-host>:<port>/<org>/<your-app-repo>.git
```

Example `apps/kustomization-my-app.yaml`:

```yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: my-app
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: my-app
  path: ./deploy/prod
  wait: true
  timeout: 5m
  dependsOn:
    - name: infrastructure
```

Then update `apps/kustomization.yaml`:

```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - gitrepository-my-app.yaml
  - kustomization-my-app.yaml
```

## App Secrets

Recommended path:

1. Put runtime values in Doppler.
2. In the app manifests, create an `ExternalSecret` that reads from `doppler-hetznerterra`.
3. Reference the resulting Kubernetes Secret from the Deployment.

Example app-side `ExternalSecret`:

```yaml
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: my-app-env
  namespace: my-app
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: doppler-hetznerterra
    kind: ClusterSecretStore
  target:
    name: my-app-env
    creationPolicy: Owner
  data:
    - secretKey: DATABASE_URL
      remoteRef:
        key: MY_APP_DATABASE_URL
```

## Image Delivery

Recommended flow:

1. App repo CI builds a container image.
2. CI pushes it to a registry.
3. The app repo updates the Kubernetes image tag in `deploy/prod`.
4. Flux notices the Git change and deploys it.

Keep the first version simple. Do not add image automation until the basic deploy path is proven.
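A rough sketch of the app-repo CI for steps 1 and 2, assuming a Gitea Actions runner with Docker available; the registry host, organisation, and secret names are placeholders for your own setup, not values defined by this repo.

```yaml
name: Build and Push Image

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build and push image
        run: |
          # <registry> and <org> are placeholders; REGISTRY_USER/REGISTRY_PASSWORD are assumed repo secrets
          docker login <registry> -u "${{ secrets.REGISTRY_USER }}" -p "${{ secrets.REGISTRY_PASSWORD }}"
          docker build -t <registry>/<org>/my-app:${GITHUB_SHA::10} .
          docker push <registry>/<org>/my-app:${GITHUB_SHA::10}
```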
## Exposing the App

Pick one:

### Private app over Tailscale

Best fit for this cluster right now.

Create a Service like the existing Rancher/Grafana/Prometheus pattern:

```yaml
apiVersion: v1
kind: Service
metadata:
  name: my-app-tailscale
  namespace: my-app
  annotations:
    tailscale.com/hostname: my-app
    tailscale.com/tags: "tag:prod"
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: my-app
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 3000
```

Use `http://my-app.<your-tailnet>` or your chosen hostname.

### Cluster-internal only

Create only a `ClusterIP` Service.

### Public ingress

Not recommended as the first app path in this repo. Get the private path working first.

## Enabling the Apps Layer

The cluster-wide `apps` Kustomization is suspended by default.

When you are ready to let Flux deploy app attachments from `apps/`, unsuspend it:

```bash
kubectl -n flux-system patch kustomization apps --type=merge -p '{"spec":{"suspend":false}}'
```

Or commit a change to `clusters/prod/flux-system/kustomization-apps.yaml` changing:

```yaml
suspend: true
```

to:

```yaml
suspend: false
```

## First Deploy Checklist

Before deploying the first app, make sure:

1. app image builds successfully
2. app repo contains valid `deploy/prod` manifests
3. this repo contains the `GitRepository` + `Kustomization` attachment objects
4. required Doppler secrets exist
5. `apps` is unsuspended if you are using the top-level `apps` layer

## Verification Commands

From a machine with cluster access:

```bash
kubectl -n flux-system get gitrepositories,kustomizations
kubectl get ns
kubectl -n my-app get deploy,svc,pods,externalsecret,secret
```

If private over Tailscale:

```bash
kubectl -n my-app get svc my-app-tailscale -o wide
```

## Minimal Recommendation

If you want the simplest, lowest-risk first deploy:

1. create a separate app repo
2. add `deploy/base` + `deploy/prod`
3. add a `GitRepository` + `Kustomization` in this repo under `apps/`
4. keep the app private with a Tailscale `LoadBalancer` Service
5. use Doppler + `ExternalSecret` for runtime config

That matches the current cluster design with the least surprise.
````diff
@@ -1,281 +1,284 @@
-# Hetzner Kubernetes Cluster
+# Proxmox Kubernetes Cluster
 
-Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible.
+Private HA K3s cluster on Proxmox, provisioned by Terraform, bootstrapped by Ansible, and reconciled by Flux.
 
 ## Architecture
 
-| Component | Details |
-|-----------|---------|
-| **Control Plane** | 3x CX23 (HA) |
-| **Workers** | 4x CX33 |
-| **Total Cost** | €28.93/mo |
-| **K8s** | k3s (latest, HA) |
-| **Addons** | Hetzner CCM + CSI |
-| **Access** | SSH/API restricted to Tailnet |
-| **Bootstrap** | Terraform + Ansible |
+| Component | Current Baseline |
+|-----------|------------------|
+| **Control plane** | 3 Proxmox VMs, VMIDs `200-202`, IPs `10.27.27.30-32`, 2 vCPU / 4 GiB / 32 GiB |
+| **Workers** | 5 Proxmox VMs, VMIDs `210-214`, IPs `10.27.27.41-45`, 4 vCPU / 8 GiB / 64 GiB |
+| **Kubernetes** | K3s `v1.34.6+k3s1`, HA embedded etcd, kube-vip API VIP `10.27.27.40` |
+| **Proxmox** | Node `flex`, template VMID `9000`, datastore `Flash`, bridge `vmbr0` |
+| **Storage** | Raw-manifest `nfs-subdir-external-provisioner`, `10.27.27.239:/TheFlash/k8s-nfs`, default StorageClass `flash-nfs` |
+| **GitOps** | Flux source `platform` on branch `main`; `apps` Kustomization is intentionally suspended |
+| **Private access** | Tailscale operator exposes Rancher, Grafana, and Prometheus; no public ingress baseline |
+| **Runtime secrets** | Doppler service token bootstraps External Secrets Operator |
 
-### Cluster Resources
-- 22 vCPU total (6 CP + 16 workers)
-- 44 GB RAM total (12 CP + 32 workers)
-- 440 GB SSD storage
-- 140 TB bandwidth allocation
+K3s is pinned because Rancher chart `2.13.3` requires Kubernetes `<1.35.0-0`.
 
 ## Prerequisites
 
-### 1. Hetzner Cloud API Token
+- Terraform `>= 1.0`.
+- Ansible with Python `jinja2` and `pyyaml`.
+- `kubectl` for local verification.
+- Proxmox API token for the `bpg/proxmox` provider.
+- S3-compatible bucket for Terraform state, currently Backblaze B2.
+- SSH key pair available to Terraform and Ansible, defaulting to `~/.ssh/infra` and `~/.ssh/infra.pub`.
 
-1. Go to [Hetzner Cloud Console](https://console.hetzner.com/)
-2. Select your project (or create a new one)
-3. Navigate to **Security** → **API Tokens**
-4. Click **Generate API Token**
-5. Set description: `k8s-cluster-terraform`
-6. Select permissions: **Read & Write**
-7. Click **Generate API Token**
-8. **Copy the token immediately** - it won't be shown again!
+Expected Proxmox inputs:
 
-### 2. Backblaze B2 Bucket (for Terraform State)
+| Setting | Value |
+|---------|-------|
+| Endpoint | `https://100.105.0.115:8006/` |
+| Node | `flex` |
+| Clone source | Template VMID `9000` (`ubuntu-2404-k8s-template`) |
+| Storage | `Flash` |
 
-1. Go to [Backblaze B2](https://secure.backblaze.com/b2_buckets.htm)
-2. Click **Create a Bucket**
-3. Set bucket name: `k8s-terraform-state` (must be globally unique)
-4. Choose **Private** access
-5. Click **Create Bucket**
-6. Create application key:
-   - Go to **App Keys** → **Add a New Application Key**
-   - Name: `terraform-state`
-   - Allow access to: `k8s-terraform-state` bucket only
-   - Type: **Read and Write**
-   - Copy **keyID** (access key) and **applicationKey** (secret key)
-7. Note your bucket's S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`)
+## Local Setup
 
-### 3. SSH Key Pair
-
-```bash
-ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s
-```
-
-### 4. Local Tools
-
-- [Terraform](https://terraform.io/downloads) >= 1.0
-- [Ansible](https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html) >= 2.9
-- Python 3 with `jinja2` and `pyyaml`
-
-## Setup
-
-### 1. Clone Repository
-
 ```bash
 git clone <your-gitea-repo>/HetznerTerra.git
 cd HetznerTerra
 ```
 
-### 2. Configure Variables
+Create local variables from the example:
 
 ```bash
 cp terraform.tfvars.example terraform.tfvars
 ```
 
-Edit `terraform.tfvars`:
+Important defaults in `terraform.tfvars.example`:
 
 ```hcl
-hcloud_token = "your-hetzner-api-token"
+proxmox_endpoint = "https://100.105.0.115:8006/"
+proxmox_api_token_id = "terraform-prov@pve!k8s-cluster"
+proxmox_api_token_secret = "your-proxmox-api-token-secret"
 
-ssh_public_key = "~/.ssh/hetzner_k8s.pub"
-ssh_private_key = "~/.ssh/hetzner_k8s"
+ssh_public_key = "~/.ssh/infra.pub"
+ssh_private_key = "~/.ssh/infra"
 
 s3_access_key = "your-backblaze-key-id"
 s3_secret_key = "your-backblaze-application-key"
 s3_endpoint = "https://s3.eu-central-003.backblazeb2.com"
 s3_bucket = "k8s-terraform-state"
 
 tailscale_auth_key = "tskey-auth-..."
-tailscale_tailnet = "yourtailnet.ts.net"
 
-restrict_api_ssh_to_tailnet = true
-tailnet_cidr = "100.64.0.0/10"
-enable_nodeport_public = false
-
-allowed_ssh_ips = []
-allowed_api_ips = []
+tailscale_tailnet = "yourtailnet.ts.net"
+kube_api_vip = "10.27.27.40"
 ```
 
-### 3. Initialize Terraform
+Initialize Terraform with backend credentials:
 
 ```bash
-cd terraform
-
-# Create backend config file (or use CLI args)
-cat > backend.hcl << EOF
-endpoint = "https://s3.eu-central-003.backblazeb2.com"
-bucket = "k8s-terraform-state"
-access_key = "your-backblaze-key-id"
-secret_key = "your-backblaze-application-key"
-skip_requesting_account_id = true
-EOF
-
-terraform init -backend-config=backend.hcl
+terraform -chdir=terraform init \
+  -backend-config="endpoint=<s3-endpoint>" \
+  -backend-config="bucket=<s3-bucket>" \
+  -backend-config="region=auto" \
+  -backend-config="access_key=<s3-access-key>" \
+  -backend-config="secret_key=<s3-secret-key>" \
+  -backend-config="skip_requesting_account_id=true"
 ```
 
-### 4. Plan and Apply
+## Common Commands
+
+Terraform:
 
 ```bash
-terraform plan -var-file=../terraform.tfvars
-terraform apply -var-file=../terraform.tfvars
+terraform -chdir=terraform fmt -recursive
+terraform -chdir=terraform validate
+terraform -chdir=terraform plan -var-file=../terraform.tfvars
+terraform -chdir=terraform apply -var-file=../terraform.tfvars
 ```
 
-### 5. Generate Ansible Inventory
+Ansible setup:
 
 ```bash
-cd ../ansible
+ansible-galaxy collection install -r ansible/requirements.yml
+cd ansible
 python3 generate_inventory.py
+ansible-playbook site.yml --syntax-check
 ```
 
-### 6. Bootstrap Cluster
+Manual Ansible bootstrap uses the same extra vars as the deploy workflow:
 
 ```bash
-ansible-playbook site.yml
+cd ansible
+ansible-playbook site.yml \
+  -e "tailscale_auth_key=$TAILSCALE_AUTH_KEY" \
+  -e "tailscale_tailnet=$TAILSCALE_TAILNET" \
+  -e "tailscale_oauth_client_id=$TAILSCALE_OAUTH_CLIENT_ID" \
+  -e "tailscale_oauth_client_secret=$TAILSCALE_OAUTH_CLIENT_SECRET" \
+  -e "doppler_hetznerterra_service_token=$DOPPLER_HETZNERTERRA_SERVICE_TOKEN" \
+  -e "tailscale_api_key=${TAILSCALE_API_KEY:-}" \
+  -e "grafana_admin_password=${GRAFANA_ADMIN_PASSWORD:-}" \
+  -e "cluster_name=k8s-cluster"
 ```
 
-### 7. Get Kubeconfig
+Flux/Kustomize verification:
+
+```bash
+kubectl kustomize infrastructure/addons/<addon>
+kubectl kustomize infrastructure/addons
+kubectl kustomize clusters/prod/flux-system
+```
+
+Refresh kubeconfig after rebuilds:
 
 ```bash
+scripts/refresh-kubeconfig.sh 10.27.27.30
 export KUBECONFIG=$(pwd)/outputs/kubeconfig
 kubectl get nodes
 ```
 
+Kubeconfig endpoint is rewritten to the primary control-plane tailnet hostname (`k8s-cluster-cp-1.<your-tailnet>`).
+Run the tailnet smoke check from cp1:
+
+```bash
+ssh ubuntu@10.27.27.30 'bash -s' < scripts/smoke-check-tailnet-services.sh
+```
+
 ## Gitea CI/CD
 
-This repository includes Gitea workflows for:
+The supported full rebuild path is the Gitea deploy workflow.
 
-- **terraform-plan**: Runs on PRs, shows planned changes
-- **terraform-apply**: Runs on main branch after merge
-- **ansible-deploy**: Runs after terraform apply
+| Workflow | Trigger | Purpose |
+|----------|---------|---------|
+| `.gitea/workflows/deploy.yml` | PR to `main`, push to `main`, manual dispatch | PRs run Terraform plan; pushes run Terraform apply, Ansible bootstrap, Flux bootstrap, addon gates, health checks, and tailnet smoke checks |
+| `.gitea/workflows/destroy.yml` | Manual dispatch with `confirm: destroy` | Terraform destroy with retries; no Rancher backup gate |
+| `.gitea/workflows/dashboards.yml` | Grafana content changes or manual dispatch | Fast Grafana datasource/dashboard update through `ansible/dashboards.yml` |
 
-### Required Gitea Secrets
+Deploy and destroy share `concurrency.group: prod-cluster` so they do not run at the same time.
 
-Set these in your Gitea repository settings (**Settings** → **Secrets** → **Actions**):
+Deploy sequence on push to `main`:
 
+1. Terraform fmt/init/validate/plan/apply.
+2. Cleanup/retry around known transient Proxmox clone and disk-update failures.
+3. Generate Ansible inventory from Terraform outputs.
+4. Prepare critical image archives with `skopeo` on the runner.
+5. Run `ansible/site.yml` to bootstrap nodes, K3s, kube-vip, prerequisite secrets, and kubeconfig.
+6. Apply Flux CRDs/controllers and the `clusters/prod/flux-system` graph.
+7. Gate cert-manager, External Secrets, Tailscale, NFS, Rancher, and observability.
+8. Run post-deploy health checks and Tailscale service smoke checks.
+
+Required Gitea secrets:
+
 | Secret | Description |
 |--------|-------------|
-| `HCLOUD_TOKEN` | Hetzner Cloud API token |
-| `S3_ACCESS_KEY` | Backblaze B2 keyID |
-| `S3_SECRET_KEY` | Backblaze B2 applicationKey |
-| `S3_ENDPOINT` | Backblaze S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`) |
-| `S3_BUCKET` | S3 bucket name (e.g., `k8s-terraform-state`) |
+| `PROXMOX_ENDPOINT` | Proxmox API endpoint, for example `https://100.105.0.115:8006/` |
+| `PROXMOX_API_TOKEN_ID` | Proxmox API token ID |
+| `PROXMOX_API_TOKEN_SECRET` | Proxmox API token secret |
+| `S3_ACCESS_KEY` | S3/Backblaze access key for Terraform state |
+| `S3_SECRET_KEY` | S3/Backblaze secret key for Terraform state |
+| `S3_ENDPOINT` | S3 endpoint, for example `https://s3.eu-central-003.backblazeb2.com` |
+| `S3_BUCKET` | Terraform state bucket, for example `k8s-terraform-state` |
 | `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
-| `TAILSCALE_TAILNET` | Tailnet domain (e.g., `yourtailnet.ts.net`) |
-| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
+| `TAILSCALE_TAILNET` | Tailnet domain, for example `silverside-gopher.ts.net` |
+| `TAILSCALE_OAUTH_CLIENT_ID` | Tailscale OAuth client ID for the Kubernetes operator |
+| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for the Kubernetes operator |
+| `TAILSCALE_API_KEY` | Optional API key used to delete stale offline reserved devices before service proxies exist |
+| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for runtime cluster secrets |
+| `GRAFANA_ADMIN_PASSWORD` | Optional Grafana admin password |
 | `SSH_PUBLIC_KEY` | SSH public key content |
 | `SSH_PRIVATE_KEY` | SSH private key content |
 
-## File Structure
+## GitOps Graph
 
-```
-.
-├── terraform/
-│   ├── main.tf
-│   ├── variables.tf
-│   ├── network.tf
-│   ├── firewall.tf
-│   ├── ssh.tf
-│   ├── servers.tf
-│   ├── outputs.tf
-│   └── backend.tf
-├── ansible/
-│   ├── inventory.tmpl
-│   ├── generate_inventory.py
-│   ├── site.yml
-│   ├── roles/
-│   │   ├── common/
-│   │   ├── k3s-server/
-│   │   ├── k3s-agent/
-│   │   ├── ccm/
-│   │   └── csi/
-│   └── ansible.cfg
-├── .gitea/
-│   └── workflows/
-│       ├── terraform.yml
-│       └── ansible.yml
-├── outputs/
-├── terraform.tfvars.example
-└── README.md
+Flux entrypoint:
 
+```text
+clusters/prod/flux-system/
+├── gotk-components.yaml
+├── gitrepository-platform.yaml
+├── kustomization-infrastructure.yaml
+└── kustomization-apps.yaml   # suspend: true
 ```
 
-## Firewall Rules
+Active infrastructure addons from `infrastructure/addons/kustomization.yaml`:
 
-| Port | Source | Purpose |
-|------|--------|---------|
-| 22 | Tailnet CIDR | SSH |
-| 6443 | Tailnet CIDR + internal | Kubernetes API |
-| 41641/udp | Any | Tailscale WireGuard |
-| 9345 | 10.0.0.0/16 | k3s Supervisor (HA join) |
-| 2379 | 10.0.0.0/16 | etcd Client |
-| 2380 | 10.0.0.0/16 | etcd Peer |
-| 8472 | 10.0.0.0/16 | Flannel VXLAN |
-| 10250 | 10.0.0.0/16 | Kubelet |
-| 30000-32767 | Optional | NodePorts (disabled by default) |
+- `addon-nfs-storage`
+- `addon-external-secrets`
+- `addon-cert-manager`
+- `addon-tailscale-operator`
+- `addon-tailscale-proxyclass`
+- `traefik` HelmRelease manifests applied directly by the top-level infrastructure Kustomization
+- `addon-observability`
+- `addon-observability-content`
+- `addon-rancher`
+- `addon-rancher-config`
 
+Chart/source strategy:
+
+- Vendored charts are intentional: `cert-manager`, `traefik`, `kube-prometheus-stack`, `tailscale-operator`, and `rancher` live under `infrastructure/charts/`.
+- External Secrets, Loki, and Promtail use Flux `OCIRepository` sources.
+- NFS storage is raw Kubernetes manifests, not a Helm chart.
+- Rancher backup/restore is not part of the current live graph.
+
+Doppler bootstrap details:
+
+- `ansible/roles/doppler-bootstrap` creates the `external-secrets` namespace and the Doppler token secret only.
+- The deploy workflow creates `ClusterSecretStore/doppler-hetznerterra` after ESO CRDs and webhook endpoints exist.
+- The checked-in `infrastructure/addons/external-secrets/clustersecretstore-doppler-hetznerterra.yaml` is not included by the addon kustomization.
+
+## Access URLs
+
+| Service | URL |
+|---------|-----|
+| Rancher | `https://rancher.silverside-gopher.ts.net/` |
+| Grafana | `http://grafana.silverside-gopher.ts.net/` |
+| Prometheus | `http://prometheus.silverside-gopher.ts.net:9090/` |
+
+Fallback port-forward from a tailnet-connected machine:
+
+```bash
+export KUBECONFIG=$(pwd)/outputs/kubeconfig
+kubectl -n observability port-forward svc/kube-prometheus-stack-grafana 3000:80
+kubectl -n observability port-forward svc/kube-prometheus-stack-prometheus 9090:9090
+```
+
+Grafana user is `admin`; password comes from the `GRAFANA_ADMIN_PASSWORD` Doppler secret or the workflow-provided fallback.
+
 ## Operations
 
-### Scale Workers
+Scale workers by updating `terraform.tfvars` counts, IP lists, and VMID lists together. If node names or VMIDs change, also update the hard-coded retry cleanup target map in `.gitea/workflows/deploy.yml`.
 
-Edit `terraform.tfvars`:
+Upgrade K3s by changing the role defaults in `ansible/roles/k3s-server/defaults/main.yml` and `ansible/roles/k3s-agent/defaults/main.yml`. Check Rancher chart compatibility before moving to a Kubernetes minor outside `<1.35.0-0`.
 
-```hcl
-worker_count = 5
-```
-
-Then:
-
-```bash
-terraform apply
-ansible-playbook site.yml
-```
-
-### Upgrade k3s
-
-```bash
-ansible-playbook site.yml -t upgrade
-```
-
-### Destroy Cluster
+Destroy through the Gitea `Destroy` workflow with `confirm: destroy`, or locally with:
 
 ```bash
-terraform destroy
+terraform -chdir=terraform destroy -var-file=../terraform.tfvars
 ```
 
 ## Troubleshooting
 
-### Check k3s Logs
+Check K3s from cp1:
 
 ```bash
-ssh root@<control-plane-ip> journalctl -u k3s -f
+ssh ubuntu@10.27.27.30 'sudo k3s kubectl get nodes -o wide'
+ssh ubuntu@10.27.27.30 'sudo journalctl -u k3s -n 120 --no-pager'
 ```
 
-### Reset k3s
+Check Flux and Rancher:
 
 ```bash
-ansible-playbook site.yml -t reset
+kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
+kubectl -n flux-system describe helmrelease rancher
+kubectl -n cattle-system get pods,deploy -o wide
 ```
 
-## Costs Breakdown
+Check Tailscale services:
 
-| Resource | Quantity | Unit Price | Monthly |
-|----------|----------|------------|---------|
-| CX23 (Control Plane) | 3 | €2.99 | €8.97 |
-| CX33 (Workers) | 4 | €4.99 | €19.96 |
-| Backblaze B2 | ~1 GB | Free (first 10GB) | €0.00 |
-| **Total** | | | **€28.93/mo** |
+```bash
+kubectl -n tailscale-system get pods
+kubectl -n cattle-system get svc rancher-tailscale
+kubectl -n observability get svc grafana-tailscale prometheus-tailscale
+kubectl -n cattle-system describe svc rancher-tailscale | grep TailscaleProxyReady
+kubectl -n observability describe svc grafana-tailscale | grep TailscaleProxyReady
+kubectl -n observability describe svc prometheus-tailscale | grep TailscaleProxyReady
+```
 
+If local `kubectl` falls back to `localhost:8080`, refresh `outputs/kubeconfig` with `scripts/refresh-kubeconfig.sh 10.27.27.30`.
+
 ## Security Notes
 
-- Control plane has HA (3 nodes, can survive 1 failure)
-- Consider adding Hetzner load balancer for API server
-- Rotate API tokens regularly
-- Use network policies in Kubernetes
-- Enable audit logging for production
+- Never commit `terraform.tfvars`, kubeconfigs, private keys, `outputs/`, or real secret values.
+- Terraform/bootstrap/CI secrets stay in Gitea Actions secrets.
+- Runtime cluster secrets are sourced from Doppler through External Secrets.
+- This repo does not manage Proxmox/LAN firewalls or public ingress.
 
 ## License
````
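To make the scaling note in the Operations section above concrete: worker count, IP, and VMID changes move together in `terraform.tfvars`. The exact variable names below are assumptions based on the documented defaults, so confirm them against `terraform/variables.tf` before use.

```hcl
# Hypothetical example: growing from 5 to 6 workers.
# The count, IP list, and VMID list must stay in sync, and the hard-coded
# retry-cleanup target map in .gitea/workflows/deploy.yml must be updated to match.
worker_count = 6
worker_ips   = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45", "10.27.27.46"]
worker_vmids = [210, 211, 212, 213, 214, 215]
```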
@@ -0,0 +1,100 @@

# Gitea Secrets Setup

This document describes the secrets required for the Proxmox-based deployment workflow.

## Required Secrets

Add these secrets in your Gitea repository settings:
**Settings → Secrets → Actions**

### Infrastructure Secrets

#### `PROXMOX_ENDPOINT`
- Proxmox VE API endpoint
- Example: `https://100.105.0.115:8006/`

#### `PROXMOX_API_TOKEN_ID`
- Proxmox API token ID
- Example: `terraform-prov@pve!k8s-cluster`

#### `PROXMOX_API_TOKEN_SECRET`
- Proxmox API token secret
- Create with `pveum user token add terraform-prov@pve k8s-cluster`

#### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
- Backblaze B2 credentials for Terraform state storage
- Get from: https://secure.backblaze.com/b2_buckets.htm
- Create application key with access to your terraform state bucket

#### `S3_ENDPOINT`
- Backblaze B2 S3 endpoint
- Example: `https://s3.eu-central-003.backblazeb2.com`

#### `S3_BUCKET`
- Backblaze B2 bucket name for Terraform state
- Example: `k8s-terraform-state`

### SSH Secrets

#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
- SSH key pair for cluster access
- Generate with: `ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra`
- Private key content (include BEGIN/END lines)
- Public key content (full line starting with `ssh-ed25519`)

### Tailscale Secrets

#### `TAILSCALE_AUTH_KEY`
- Tailscale auth key for node registration
- Get from: https://login.tailscale.com/admin/settings/keys
- Type: Reusable, Ephemeral
- Scope: `devices:core:write`

#### `TAILSCALE_TAILNET`
- Your Tailscale network name
- Example: `tail7ec33.ts.net` or your custom domain

#### `TAILSCALE_OAUTH_CLIENT_ID` & `TAILSCALE_OAUTH_CLIENT_SECRET`
- OAuth credentials for Tailscale Kubernetes Operator
- Get from: https://login.tailscale.com/admin/settings/oauth
- Create OAuth client with scope: `devices:core:write`

### Application Secrets

#### `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
- Doppler service token for the `hetznerterra` project runtime secrets
- Used by External Secrets Operator bootstrap
- Recommended scope: `hetznerterra` project, `prod` config only

#### `GRAFANA_ADMIN_PASSWORD`
- Transitional fallback only while migrating observability secrets to Doppler
- In steady state, store this in Doppler as `GRAFANA_ADMIN_PASSWORD`

## Setting Up Secrets

1. Go to your Gitea repository
2. Navigate to **Settings → Secrets → Actions**
3. Click **Add Secret**
4. Enter the secret name (exact match from above)
5. Paste the secret value
6. Click **Add Secret**
7. Repeat for all secrets

## Verification

After adding all secrets, trigger a workflow run:

```bash
git commit --allow-empty -m "ci: trigger workflow with new secrets"
git push
```

Check the workflow logs to verify all secrets are being used correctly.

## Security Notes

- Never commit secrets to the repository
- Use strong, unique passwords for Grafana and other services
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap
- Rotate Tailscale auth keys periodically
- Review OAuth client permissions regularly
- CI expects direct SSH access to the Proxmox VMs and direct Proxmox API access
@@ -0,0 +1,73 @@

# Stable Private-Only Baseline

This document defines the current engineering target for this repository.

## Topology

- 3 control planes (HA etcd cluster)
- 5 workers
- kube-vip API VIP (`10.27.27.40`)
- private Proxmox/LAN network (`10.27.27.0/24`)
- Tailscale operator access and service exposure
- Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`)
- Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`)
- Prometheus exposed through Tailscale (`prometheus.silverside-gopher.ts.net:9090`)
- `apps` Kustomization suspended by default

## In Scope

- Terraform infrastructure bootstrap
- Ansible k3s bootstrap on Ubuntu cloud-init VMs
- **HA control plane (3 nodes with etcd quorum)**
- **kube-vip for Kubernetes API HA**
- **NFS-backed persistent volumes via `nfs-subdir-external-provisioner`**
- Flux core reconciliation
- External Secrets Operator with Doppler
- Tailscale private access and smoke-check validation
- cert-manager
- Rancher and rancher-backup
- Rancher backup/restore validation
- Observability stack (Grafana, Prometheus, Loki, Promtail)
- Persistent volume provisioning validated

## Deferred for Later Phases

- app workloads in `apps/`

## Out of Scope

- public ingress or DNS
- public TLS
- app workloads
- cross-region / multi-cluster disaster recovery strategy
- upgrade strategy

## Phase Gates

1. Terraform apply completes for HA topology (3 CP, 5 workers, 1 VIP).
2. Primary control plane bootstraps with `--cluster-init`.
3. kube-vip advertises `10.27.27.40:6443` from the control-plane set.
4. Secondary control planes join via the kube-vip endpoint.
5. Workers join successfully via the kube-vip endpoint.
6. etcd reports 3 healthy members.
7. Flux source and infrastructure reconciliation are healthy.
8. **NFS provisioner deploys and creates `flash-nfs` StorageClass**.
9. **PVC provisioning tested and working** (see the sketch after this list).
10. External Secrets sync required secrets.
11. Tailscale private access works for Rancher, Grafana, and Prometheus.
12. CI smoke checks pass for Tailscale DNS resolution, `tailscale ping`, and HTTP reachability.
13. A fresh Rancher backup can be created and restored successfully.
14. Terraform destroy succeeds cleanly or via workflow retry.
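A minimal sketch for exercising the PVC gate above; the claim name and namespace are arbitrary, and `flash-nfs` is the documented default StorageClass.

```bash
# Create a small test claim, wait for it to bind, then clean up.
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-provisioning-test
  namespace: default
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: flash-nfs
  resources:
    requests:
      storage: 1Gi
EOF

kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/pvc-provisioning-test --timeout=120s
kubectl delete pvc pvc-provisioning-test
```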
## Success Criteria

Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes, no manual `kubectl` patching, and no manual Tailscale proxy recreation.

## Validated Drills

- 2026-04-18: live Rancher backup/restore drill succeeded on the current cluster.
  - A fresh one-time backup was created, restored back onto the same cluster, and post-restore validation confirmed:
    - all nodes remained `Ready`
    - Flux infrastructure stayed healthy
    - Rancher backup/restore resources reported `Completed`
    - Rancher, Grafana, and Prometheus remained reachable through the Tailscale smoke checks
+2 −1

```diff
@@ -3,7 +3,8 @@ inventory = inventory.ini
 host_key_checking = False
 retry_files_enabled = False
 roles_path = roles
-stdout_callback = yaml
+stdout_callback = default
+result_format = yaml
 interpreter_python = auto_silent
 
 [privilege_escalation]
```
@@ -0,0 +1,7 @@

```yaml
---
- name: Provision Grafana dashboards and datasources
  hosts: control_plane[0]
  become: true

  roles:
    - observability-content
```
```diff
@@ -32,6 +32,7 @@ def main():
     worker_names = outputs["worker_names"]["value"]
     worker_ips = outputs["worker_ips"]["value"]
     worker_private_ips = outputs["worker_private_ips"]["value"]
+    kube_api_lb_ip = outputs.get("kube_api_lb_ip", {}).get("value", control_plane_ips[0])
 
     control_planes = [
         {
@@ -59,6 +60,7 @@ def main():
         "control_planes": control_planes,
         "workers": workers,
         "private_key_file": outputs["ssh_private_key_path"]["value"],
+        "kube_api_lb_ip": kube_api_lb_ip,
     }
 
     env = Environment(loader=FileSystemLoader("."))
```
```diff
@@ -13,7 +13,7 @@ control_plane
 workers
 
 [cluster:vars]
-ansible_user=root
+ansible_user=ubuntu
 ansible_python_interpreter=/usr/bin/python3
 ansible_ssh_private_key_file={{ private_key_file }}
-k3s_version=latest
+kube_api_endpoint={{ kube_api_lb_ip }}
```
```diff
@@ -3,3 +3,5 @@ collections:
     version: ">=2.4.0"
   - name: community.general
     version: ">=8.0.0"
+  - name: community.network
+    version: ">=5.0.0"
```
@@ -0,0 +1,31 @@

```yaml
---
- name: Ensure Tailscale operator namespace exists
  command: >-
    kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }}
    --dry-run=client -o yaml
  register: tailscale_namespace_manifest
  changed_when: false
  when:
    - tailscale_oauth_client_id | default('') | length > 0
    - tailscale_oauth_client_secret | default('') | length > 0

- name: Apply Tailscale operator namespace
  command: kubectl apply -f -
  args:
    stdin: "{{ tailscale_namespace_manifest.stdout }}"
  changed_when: true
  when:
    - tailscale_oauth_client_id | default('') | length > 0
    - tailscale_oauth_client_secret | default('') | length > 0

- name: Apply Tailscale operator OAuth secret
  shell: >-
    kubectl -n {{ tailscale_operator_namespace | default('tailscale-system') }} create secret generic operator-oauth
    --from-literal=client_id='{{ tailscale_oauth_client_id }}'
    --from-literal=client_secret='{{ tailscale_oauth_client_secret }}'
    --dry-run=client -o yaml | kubectl apply -f -
  changed_when: true
  no_log: true
  when:
    - tailscale_oauth_client_id | default('') | length > 0
    - tailscale_oauth_client_secret | default('') | length > 0
```
@@ -0,0 +1,12 @@
|
||||
---
|
||||
bootstrap_prepull_images:
|
||||
- docker.io/rancher/mirrored-pause:3.6
|
||||
- docker.io/rancher/mirrored-coredns-coredns:1.14.2
|
||||
- docker.io/rancher/mirrored-metrics-server:v0.8.1
|
||||
- docker.io/rancher/local-path-provisioner:v0.0.35
|
||||
- docker.io/rancher/mirrored-library-traefik:3.6.10
|
||||
- docker.io/rancher/klipper-helm:v0.9.14-build20260309
|
||||
- ghcr.io/fluxcd/source-controller:v1.8.0
|
||||
- ghcr.io/fluxcd/kustomize-controller:v1.8.1
|
||||
- ghcr.io/fluxcd/helm-controller:v1.5.1
|
||||
- ghcr.io/fluxcd/notification-controller:v1.8.1
|
||||
@@ -0,0 +1,59 @@
|
||||
---
|
||||
- name: Check for runner-provided bootstrap image archives
|
||||
stat:
|
||||
path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
register: bootstrap_image_archive_stats
|
||||
loop: "{{ bootstrap_prepull_images }}"
|
||||
|
||||
- name: Ensure remote bootstrap image archive directory exists
|
||||
file:
|
||||
path: /tmp/bootstrap-image-archives
|
||||
state: directory
|
||||
mode: "0755"
|
||||
|
||||
- name: Copy runner-provided bootstrap image archives
|
||||
copy:
|
||||
src: "{{ item.stat.path }}"
|
||||
dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar"
|
||||
mode: "0644"
|
||||
loop: "{{ bootstrap_image_archive_stats.results }}"
|
||||
loop_control:
|
||||
label: "{{ item.item }}"
|
||||
when: item.stat.exists
|
||||
|
||||
- name: Import or pull bootstrap images into containerd
|
||||
shell: |
|
||||
if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
|
||||
echo "already present"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
|
||||
if [ -s "${archive}" ]; then
|
||||
for attempt in 1 2 3; do
|
||||
if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
|
||||
echo "imported image"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
sleep 10
|
||||
done
|
||||
fi
|
||||
|
||||
for attempt in 1 2 3 4 5; do
|
||||
if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
|
||||
echo "pulled image"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
sleep 10
|
||||
done
|
||||
|
||||
exit 1
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: bootstrap_image_pull
|
||||
loop: "{{ bootstrap_prepull_images }}"
|
||||
changed_when: "'imported image' in bootstrap_image_pull.stdout or 'pulled image' in bootstrap_image_pull.stdout"
|
||||
@@ -1,4 +0,0 @@
|
||||
---
|
||||
hcloud_token: ""
|
||||
cluster_name: "k8s-cluster"
|
||||
hcloud_lb_location: "nbg1"
|
||||
@@ -1,88 +0,0 @@
|
||||
---
|
||||
- name: Check if Hetzner CCM is already deployed
|
||||
command: kubectl -n kube-system get deployment hcloud-cloud-controller-manager
|
||||
register: ccm_namespace
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Create Hetzner cloud secret
|
||||
shell: |
|
||||
kubectl -n kube-system create secret generic hcloud \
|
||||
--from-literal=token='{{ hcloud_token }}' \
|
||||
--from-literal=network='{{ cluster_name }}-network' \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
no_log: true
|
||||
when: hcloud_token is defined
|
||||
changed_when: true
|
||||
|
||||
- name: Deploy Hetzner CCM
|
||||
command: kubectl apply -f https://raw.githubusercontent.com/hetznercloud/hcloud-cloud-controller-manager/main/deploy/ccm-networks.yaml
|
||||
changed_when: true
|
||||
|
||||
- name: Detect CCM workload kind
|
||||
shell: |
|
||||
if kubectl -n kube-system get deployment hcloud-cloud-controller-manager >/dev/null 2>&1; then
|
||||
echo deployment
|
||||
elif kubectl -n kube-system get daemonset hcloud-cloud-controller-manager >/dev/null 2>&1; then
|
||||
echo daemonset
|
||||
else
|
||||
echo missing
|
||||
fi
|
||||
register: ccm_workload_kind
|
||||
changed_when: false
|
||||
|
||||
- name: Wait for CCM deployment rollout
|
||||
command: kubectl rollout status deployment/hcloud-cloud-controller-manager -n kube-system
|
||||
register: ccm_rollout_deploy
|
||||
until: ccm_rollout_deploy.rc == 0
|
||||
changed_when: false
|
||||
retries: 30
|
||||
delay: 10
|
||||
when: ccm_workload_kind.stdout == "deployment"
|
||||
|
||||
- name: Wait for CCM daemonset rollout
|
||||
command: kubectl rollout status daemonset/hcloud-cloud-controller-manager -n kube-system
|
||||
register: ccm_rollout_ds
|
||||
until: ccm_rollout_ds.rc == 0
|
||||
changed_when: false
|
||||
retries: 30
|
||||
delay: 10
|
||||
when: ccm_workload_kind.stdout == "daemonset"
|
||||
|
||||
- name: Set default Hetzner load balancer location for Traefik service
|
||||
command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite
|
||||
register: traefik_annotation
|
||||
changed_when: true
|
||||
failed_when: false
|
||||
|
||||
- name: Show Traefik service when annotation patch fails
|
||||
command: kubectl -n kube-system get service traefik -o yaml
|
||||
register: traefik_service_dump
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: traefik_annotation.rc != 0
|
||||
|
||||
- name: Fail when Traefik load balancer annotation cannot be set
|
||||
fail:
|
||||
msg: |
|
||||
Failed to set Hetzner load balancer location annotation on kube-system/traefik service.
|
||||
Command output:
|
||||
{{ traefik_annotation.stderr | default(traefik_annotation.stdout) }}
|
||||
|
||||
Service dump:
|
||||
{{ traefik_service_dump.stdout | default('n/a') }}
|
||||
when: traefik_annotation.rc != 0
|
||||
|
||||
- name: Show CCM namespace objects when workload missing
|
||||
command: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true
|
||||
register: ccm_ns_objects
|
||||
changed_when: false
|
||||
when: ccm_workload_kind.stdout == "missing"
|
||||
|
||||
- name: Fail when CCM workload is missing
|
||||
fail:
|
||||
msg: |
|
||||
hcloud-cloud-controller-manager workload not found after applying manifest.
|
||||
Namespace objects:
|
||||
{{ ccm_ns_objects.stdout | default('n/a') }}
|
||||
when: ccm_workload_kind.stdout == "missing"
|
||||
@@ -1,12 +1,32 @@
|
||||
---
|
||||
- name: Check if cloud-init is installed
|
||||
command: which cloud-init
|
||||
register: cloud_init_binary
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Wait for cloud-init to finish first-boot tasks
|
||||
command: cloud-init status --wait
|
||||
register: cloud_init_wait
|
||||
changed_when: false
|
||||
failed_when: >-
|
||||
cloud_init_wait.rc not in [0, 2] or
|
||||
(
|
||||
'status: done' not in cloud_init_wait.stdout and
|
||||
'status: disabled' not in cloud_init_wait.stdout
|
||||
)
|
||||
when: cloud_init_binary.rc == 0
|
||||
|
||||
- name: Update apt cache
|
||||
apt:
|
||||
update_cache: true
|
||||
cache_valid_time: 3600
|
||||
lock_timeout: 600
|
||||
|
||||
- name: Upgrade packages
|
||||
apt:
|
||||
upgrade: dist
|
||||
lock_timeout: 600
|
||||
when: common_upgrade_packages | default(false)
|
||||
|
||||
- name: Install required packages
|
||||
@@ -19,18 +39,27 @@
|
||||
- lsb-release
|
||||
- software-properties-common
|
||||
- jq
|
||||
- nfs-common
|
||||
- htop
|
||||
- vim
|
||||
state: present
|
||||
lock_timeout: 600
|
||||
|
||||
- name: Check active swap
|
||||
command: swapon --noheadings
|
||||
register: active_swap
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Disable swap
|
||||
command: swapoff -a
|
||||
changed_when: true
|
||||
when: active_swap.stdout | trim | length > 0
|
||||
|
||||
- name: Remove swap from fstab
|
||||
mount:
|
||||
name: swap
|
||||
fstype: swap
|
||||
lineinfile:
|
||||
path: /etc/fstab
|
||||
regexp: '^\s*[^#]\S+\s+\S+\s+swap\s+.*$'
|
||||
state: absent
|
||||
|
||||
- name: Load br_netfilter module
|
||||
@@ -66,6 +95,10 @@
|
||||
|
||||
- name: Install tailscale
|
||||
shell: curl -fsSL https://tailscale.com/install.sh | sh
|
||||
register: tailscale_install
|
||||
until: tailscale_install.rc == 0
|
||||
retries: 5
|
||||
delay: 15
|
||||
when:
|
||||
- tailscale_auth_key | length > 0
|
||||
- tailscale_binary.rc != 0
|
||||
@@ -78,9 +111,22 @@
|
||||
failed_when: false
|
||||
when: tailscale_auth_key | length > 0
|
||||
|
||||
- name: Connect node to tailnet
|
||||
command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
|
||||
- name: Parse tailscale connection state
|
||||
set_fact:
|
||||
tailscale_backend_state: "{{ (tailscale_status.stdout | from_json).BackendState | default('') }}"
|
||||
when:
|
||||
- tailscale_auth_key | length > 0
|
||||
- tailscale_status.rc != 0 or '"BackendState":"Running"' not in tailscale_status.stdout
|
||||
- tailscale_status.rc == 0
|
||||
- tailscale_status.stdout | length > 0
|
||||
|
||||
- name: Connect node to tailnet
|
||||
command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
|
||||
register: tailscale_up
|
||||
until: tailscale_up.rc == 0
|
||||
retries: 5
|
||||
delay: 15
|
||||
no_log: true
|
||||
when:
|
||||
- tailscale_auth_key | length > 0
|
||||
- tailscale_status.rc != 0 or (tailscale_backend_state | default('')) != 'Running'
|
||||
changed_when: true
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
---
|
||||
hcloud_token: ""
|
||||
cluster_name: "k8s-cluster"
|
||||
csi_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/main/deploy/kubernetes/hcloud-csi.yml"
|
||||
csi_rollout_timeout_seconds: 30
|
||||
csi_rollout_retries: 8
|
||||
csi_rollout_delay_seconds: 5
|
||||
csi_failure_log_tail_lines: 120
|
||||
csi_smoke_test_enabled: true
|
||||
csi_smoke_test_storage_class: "csi-smoke-hcloud-immediate"
|
||||
csi_smoke_test_base_storage_class: "hcloud-volumes"
|
||||
csi_smoke_test_size: "1Gi"
|
||||
csi_smoke_test_pvc_timeout_seconds: 300
|
||||
csi_smoke_test_job_timeout_seconds: 300
|
||||
csi_smoke_test_required: false
|
||||
@@ -1,425 +0,0 @@
|
||||
---
|
||||
- name: Create Hetzner CSI secret
|
||||
shell: |
|
||||
kubectl -n kube-system create secret generic hcloud \
|
||||
--from-literal=token='{{ hcloud_token }}' \
|
||||
--from-literal=network='{{ cluster_name }}-network' \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
no_log: true
|
||||
when: hcloud_token is defined
|
||||
changed_when: true
|
||||
|
||||
- name: Deploy Hetzner CSI
|
||||
command: kubectl apply -f {{ csi_manifest_url }}
|
||||
changed_when: true
|
||||
|
||||
- name: Ensure CSI controller endpoint is set for sidecars
|
||||
command: kubectl -n kube-system set env deployment/hcloud-csi-controller CSI_ENDPOINT=unix:///run/csi/socket
|
||||
changed_when: true
|
||||
|
||||
- name: Ensure CSI node endpoint is set for sidecars
|
||||
command: kubectl -n kube-system set env daemonset/hcloud-csi-node CSI_ENDPOINT=unix:///run/csi/socket
|
||||
changed_when: true
|
||||
|
||||
- name: Restart CSI controller to pick up current secret
|
||||
command: kubectl -n kube-system rollout restart deployment/hcloud-csi-controller
|
||||
changed_when: true
|
||||
|
||||
- name: Wait for CSI controller deployment generation
|
||||
command: kubectl -n kube-system rollout status deployment/hcloud-csi-controller --timeout=30s
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Wait for CSI controller rollout
|
||||
command: kubectl rollout status deployment/hcloud-csi-controller -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
|
||||
register: csi_controller_rollout
|
||||
until: csi_controller_rollout.rc == 0
|
||||
retries: "{{ csi_rollout_retries | int }}"
|
||||
delay: "{{ csi_rollout_delay_seconds | int }}"
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Show CSI controller status on failure
|
||||
command: kubectl -n kube-system get deployment hcloud-csi-controller -o wide
|
||||
register: csi_controller_deploy_status
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Show CSI controller pods on failure
|
||||
command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide
|
||||
register: csi_controller_pods_status
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Describe CSI controller deployment on failure
|
||||
command: kubectl -n kube-system describe deployment hcloud-csi-controller
|
||||
register: csi_controller_deploy_describe
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Describe CSI controller pod on failure
|
||||
shell: |
|
||||
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
|
||||
if [ -n "$pod" ]; then
|
||||
kubectl -n kube-system describe pod "$pod"
|
||||
fi
|
||||
register: csi_controller_pod_describe
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Show CSI driver logs on failure
|
||||
command: kubectl -n kube-system logs deployment/hcloud-csi-controller -c hcloud-csi-driver --tail={{ csi_failure_log_tail_lines }}
|
||||
register: csi_driver_logs
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Show CSI driver previous logs on failure
|
||||
shell: |
|
||||
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
|
||||
if [ -n "$pod" ]; then
|
||||
kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
|
||||
fi
|
||||
register: csi_driver_previous_logs
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Show sidecar previous logs on failure
|
||||
shell: |
|
||||
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
|
||||
if [ -n "$pod" ]; then
|
||||
for container in csi-attacher csi-resizer csi-provisioner; do
|
||||
echo "===== $container ====="
|
||||
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
|
||||
done
|
||||
fi
|
||||
register: csi_sidecar_previous_logs
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Show recent kube-system events on failure
|
||||
command: kubectl -n kube-system get events --sort-by=.lastTimestamp
|
||||
register: csi_recent_events
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Fail with CSI controller diagnostics
|
||||
fail:
|
||||
msg: |
|
||||
CSI controller rollout failed.
|
||||
Deployment status:
|
||||
{{ csi_controller_deploy_status.stdout | default('n/a') }}
|
||||
|
||||
Pods status:
|
||||
{{ csi_controller_pods_status.stdout | default('n/a') }}
|
||||
|
||||
Deployment describe:
|
||||
{{ csi_controller_deploy_describe.stdout | default('n/a') }}
|
||||
|
||||
Pod describe:
|
||||
{{ csi_controller_pod_describe.stdout | default('n/a') }}
|
||||
|
||||
hcloud-csi-driver logs:
|
||||
{{ csi_driver_logs.stdout | default('n/a') }}
|
||||
|
||||
hcloud-csi-driver previous logs:
|
||||
{{ csi_driver_previous_logs.stdout | default('n/a') }}
|
||||
|
||||
Sidecar previous logs:
|
||||
{{ csi_sidecar_previous_logs.stdout | default('n/a') }}
|
||||
|
||||
Recent kube-system events:
|
||||
{{ csi_recent_events.stdout | default('n/a') }}
|
||||
when: csi_controller_rollout.rc != 0
|
||||
|
||||
- name: Wait for CSI node daemonset rollout
|
||||
command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
|
||||
register: csi_node_rollout
|
||||
until: csi_node_rollout.rc == 0
|
||||
retries: "{{ csi_rollout_retries | int }}"
|
||||
delay: "{{ csi_rollout_delay_seconds | int }}"
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Fail when CSI node daemonset rollout does not complete
|
||||
fail:
|
||||
msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}"
|
||||
when: csi_node_rollout.rc != 0
|
||||
|
||||
- name: Generate CSI smoke test run identifier
|
||||
set_fact:
|
||||
csi_smoke_test_run_id: "{{ lookup('pipe', 'date +%s') }}"
|
||||
when: csi_smoke_test_enabled | bool
|
||||
|
||||
- name: Generate unique CSI smoke test resource names
|
||||
set_fact:
|
||||
csi_smoke_test_pvc_name: "csi-smoke-pvc-{{ csi_smoke_test_run_id }}"
|
||||
csi_smoke_test_job_name: "csi-smoke-job-{{ csi_smoke_test_run_id }}"
|
||||
when: csi_smoke_test_enabled | bool
|
||||
|
||||
- name: Cleanup stale CSI smoke test resources before apply
|
||||
shell: |
|
||||
kubectl -n kube-system delete job,pvc -l app.kubernetes.io/name=csi-smoke --ignore-not-found --wait=true
|
||||
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when: csi_smoke_test_enabled | bool
|
||||
|
||||
- name: Apply CSI smoke test resources
|
||||
shell: |
|
||||
kubectl apply -f - <<'EOF'
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: {{ csi_smoke_test_storage_class }}
|
||||
provisioner: csi.hetzner.cloud
|
||||
reclaimPolicy: Delete
|
||||
volumeBindingMode: Immediate
|
||||
allowVolumeExpansion: true
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ csi_smoke_test_pvc_name }}
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app.kubernetes.io/name: csi-smoke
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ csi_smoke_test_size }}
|
||||
storageClassName: {{ csi_smoke_test_storage_class }}
|
||||
---
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: {{ csi_smoke_test_job_name }}
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app.kubernetes.io/name: csi-smoke
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
containers:
|
||||
- name: write-and-read
|
||||
image: busybox:1.36
|
||||
command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"]
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ csi_smoke_test_pvc_name }}
|
||||
EOF
|
||||
changed_when: true
|
||||
when: csi_smoke_test_enabled | bool
|
||||
|
||||
- name: Wait for CSI smoke PVC to bind
|
||||
command: kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/{{ csi_smoke_test_pvc_name }} --timeout={{ csi_smoke_test_pvc_timeout_seconds }}s
|
||||
register: csi_smoke_pvc_wait
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when: csi_smoke_test_enabled | bool
|
||||
|
||||
- name: Wait for CSI smoke Job completion
|
||||
command: kubectl -n kube-system wait --for=condition=complete job/{{ csi_smoke_test_job_name }} --timeout={{ csi_smoke_test_job_timeout_seconds }}s
|
||||
register: csi_smoke_job_wait
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc == 0
|
||||
|
||||
- name: Show CSI smoke job logs
|
||||
command: kubectl -n kube-system logs job/{{ csi_smoke_test_job_name }}
|
||||
register: csi_smoke_job_logs
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when: csi_smoke_test_enabled | bool
|
||||
|
||||
- name: Show CSI smoke PVC on failure
|
||||
command: kubectl -n kube-system get pvc {{ csi_smoke_test_pvc_name }} -o wide
|
||||
register: csi_smoke_pvc_status
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
|
||||
- name: Show CSI smoke Job on failure
|
||||
command: kubectl -n kube-system get job {{ csi_smoke_test_job_name }} -o wide
|
||||
register: csi_smoke_job_status
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
|
||||
- name: Show CSI smoke pods on failure
|
||||
command: kubectl -n kube-system get pod -l job-name={{ csi_smoke_test_job_name }} -o wide
|
||||
register: csi_smoke_pod_status
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
|
||||
- name: Describe CSI smoke PVC on failure
|
||||
command: kubectl -n kube-system describe pvc {{ csi_smoke_test_pvc_name }}
|
||||
register: csi_smoke_pvc_describe
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
|
||||
- name: Show storage classes on failure
|
||||
command: kubectl get storageclass
|
||||
register: csi_storageclasses
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
|
||||
- name: Get CSI controller pod name on smoke failure
|
||||
shell: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}'
|
||||
register: csi_controller_pod_name
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
|
||||
- name: Describe CSI controller pod on smoke failure
|
||||
command: kubectl -n kube-system describe pod {{ csi_controller_pod_name.stdout }}
|
||||
register: csi_controller_pod_smoke_describe
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
- csi_controller_pod_name.stdout | length > 0
|
||||
|
||||
- name: Show CSI controller container logs on smoke failure
|
||||
shell: |
|
||||
pod="{{ csi_controller_pod_name.stdout }}"
|
||||
for container in hcloud-csi-driver csi-provisioner csi-attacher csi-resizer liveness-probe; do
|
||||
echo "===== ${container}: current ====="
|
||||
kubectl -n kube-system logs "$pod" -c "$container" --tail={{ csi_failure_log_tail_lines }} || true
|
||||
echo "===== ${container}: previous ====="
|
||||
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
|
||||
done
|
||||
register: csi_controller_container_logs
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
- csi_controller_pod_name.stdout | length > 0
|
||||
|
||||
- name: Show CSI driver and node driver objects on smoke failure
|
||||
shell: |
|
||||
echo "===== CSIDriver ====="
|
||||
kubectl get csidriver csi.hetzner.cloud -o yaml || true
|
||||
echo "===== CSINode ====="
|
||||
kubectl get csinode -o wide || true
|
||||
register: csi_driver_objects
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
|
||||
- name: Show CSI smoke pod describe on failure
|
||||
shell: |
|
||||
pod="$(kubectl -n kube-system get pods -l job-name={{ csi_smoke_test_job_name }} -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
|
||||
if [ -n "$pod" ]; then
|
||||
kubectl -n kube-system describe pod "$pod"
|
||||
fi
|
||||
register: csi_smoke_pod_describe
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
|
||||
- name: Fail when CSI smoke test fails
|
||||
fail:
|
||||
msg: |
|
||||
CSI smoke test failed.
|
||||
PVC wait:
|
||||
stdout: {{ csi_smoke_pvc_wait.stdout | default('') }}
|
||||
stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
|
||||
|
||||
Job wait:
|
||||
stdout: {{ csi_smoke_job_wait.stdout | default('') }}
|
||||
stderr: {{ csi_smoke_job_wait.stderr | default('') }}
|
||||
|
||||
PVC:
|
||||
{{ csi_smoke_pvc_status.stdout | default(csi_smoke_pvc_status.stderr | default('n/a')) }}
|
||||
|
||||
Job:
|
||||
{{ csi_smoke_job_status.stdout | default(csi_smoke_job_status.stderr | default('n/a')) }}
|
||||
|
||||
Pod list:
|
||||
{{ csi_smoke_pod_status.stdout | default(csi_smoke_pod_status.stderr | default('n/a')) }}
|
||||
|
||||
PVC describe:
|
||||
{{ csi_smoke_pvc_describe.stdout | default(csi_smoke_pvc_describe.stderr | default('n/a')) }}
|
||||
|
||||
Storage classes:
|
||||
{{ csi_storageclasses.stdout | default(csi_storageclasses.stderr | default('n/a')) }}
|
||||
|
||||
CSI controller pod:
|
||||
{{ csi_controller_pod_name.stdout | default('n/a') }}
|
||||
|
||||
CSI controller pod describe:
|
||||
{{ csi_controller_pod_smoke_describe.stdout | default(csi_controller_pod_smoke_describe.stderr | default('n/a')) }}
|
||||
|
||||
CSI controller container logs:
|
||||
{{ csi_controller_container_logs.stdout | default(csi_controller_container_logs.stderr | default('n/a')) }}
|
||||
|
||||
CSI driver objects:
|
||||
{{ csi_driver_objects.stdout | default(csi_driver_objects.stderr | default('n/a')) }}
|
||||
|
||||
Pod describe:
|
||||
{{ csi_smoke_pod_describe.stdout | default('n/a') }}
|
||||
|
||||
Job logs:
|
||||
{{ csi_smoke_job_logs.stdout | default('n/a') }}
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
- csi_smoke_test_required | bool
|
||||
|
||||
- name: Warn when CSI smoke test fails but is non-blocking
|
||||
debug:
|
||||
msg: |
|
||||
CSI smoke test failed but csi_smoke_test_required is false, so deployment will continue.
|
||||
PVC wait stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
|
||||
Job wait stderr: {{ csi_smoke_job_wait.stderr | default('') }}
|
||||
when:
|
||||
- csi_smoke_test_enabled | bool
|
||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
||||
- not (csi_smoke_test_required | bool)
|
||||
|
||||
- name: Cleanup CSI smoke test resources
|
||||
shell: |
|
||||
kubectl -n kube-system delete job {{ csi_smoke_test_job_name }} pvc {{ csi_smoke_test_pvc_name }} --ignore-not-found
|
||||
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
when: csi_smoke_test_enabled | bool
|
||||
@@ -0,0 +1,24 @@
|
||||
---
|
||||
- name: Ensure Doppler service token is provided
|
||||
assert:
|
||||
that:
|
||||
- doppler_hetznerterra_service_token | length > 0
|
||||
fail_msg: doppler_hetznerterra_service_token must be provided for External Secrets bootstrap.
|
||||
|
||||
- name: Ensure external-secrets namespace exists
|
||||
shell: kubectl create namespace external-secrets --dry-run=client -o yaml | kubectl apply -f -
|
||||
changed_when: true
|
||||
|
||||
- name: Apply Doppler service token secret
|
||||
shell: >-
|
||||
kubectl -n external-secrets create secret generic doppler-hetznerterra-service-token
|
||||
--from-literal=dopplerToken='{{ doppler_hetznerterra_service_token }}'
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
changed_when: true
|
||||
no_log: true
|
||||
|
||||
- name: Note pending Doppler ClusterSecretStore bootstrap
|
||||
debug:
|
||||
msg: >-
|
||||
Doppler service token secret is bootstrapped. The deploy workflow creates the
|
||||
ClusterSecretStore after External Secrets CRDs and webhook endpoints are ready.
|
||||
@@ -1,5 +1,7 @@
|
||||
---
|
||||
k3s_version: latest
|
||||
k3s_version: v1.34.6+k3s1
|
||||
k3s_server_url: ""
|
||||
k3s_token: ""
|
||||
k3s_node_ip: ""
|
||||
k3s_kubelet_cloud_provider_external: false
|
||||
k3s_flannel_iface: "{{ ansible_default_ipv4.interface | default('eth0') }}"
|
||||
|
||||
@@ -1,25 +1,67 @@
|
||||
---
|
||||
- name: Check if k3s agent is already installed
|
||||
- name: Check if k3s agent service exists
|
||||
stat:
|
||||
path: /usr/local/bin/k3s-agent
|
||||
register: k3s_agent_binary
|
||||
path: /etc/systemd/system/k3s-agent.service
|
||||
register: k3s_agent_service
|
||||
|
||||
- name: Check k3s agent service state
|
||||
command: systemctl is-active k3s-agent
|
||||
register: k3s_agent_service_state
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: k3s_agent_service.stat.exists
|
||||
|
||||
- name: Check installed k3s version
|
||||
command: k3s --version
|
||||
register: installed_k3s_version
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: k3s_agent_service.stat.exists
|
||||
|
||||
- name: Determine whether k3s agent install is needed
|
||||
set_fact:
|
||||
k3s_agent_install_needed: >-
|
||||
{{
|
||||
(not k3s_agent_service.stat.exists)
|
||||
or ((k3s_agent_service_state.stdout | default('')) != 'active')
|
||||
or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default('')))
|
||||
}}
|
||||
|
||||
- name: Download k3s install script
|
||||
get_url:
|
||||
url: https://get.k3s.io
|
||||
dest: /tmp/install-k3s.sh
|
||||
mode: "0755"
|
||||
when: not k3s_agent_binary.stat.exists
|
||||
register: k3s_agent_install_script
|
||||
until: k3s_agent_install_script is succeeded
|
||||
retries: 5
|
||||
delay: 10
|
||||
when: k3s_agent_install_needed
|
||||
|
||||
- name: Install k3s agent
|
||||
environment:
|
||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||
K3S_URL: "{{ k3s_server_url }}"
|
||||
K3S_TOKEN: "{{ k3s_token }}"
|
||||
command: /tmp/install-k3s.sh agent --node-ip {{ k3s_node_ip }}
|
||||
args:
|
||||
creates: /usr/local/bin/k3s-agent
|
||||
when: not k3s_agent_binary.stat.exists
|
||||
when: k3s_agent_install_needed
|
||||
block:
|
||||
- name: Wait for Kubernetes API endpoint before agent join
|
||||
wait_for:
|
||||
host: "{{ k3s_server_url | regex_replace('^https?://([^:/]+).*$', '\\1') }}"
|
||||
port: 6443
|
||||
state: started
|
||||
timeout: 180
|
||||
|
||||
- name: Run k3s agent install
|
||||
environment:
|
||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||
K3S_URL: "{{ k3s_server_url }}"
|
||||
K3S_TOKEN: "{{ k3s_token }}"
|
||||
command: >-
|
||||
/tmp/install-k3s.sh agent
|
||||
--node-ip {{ k3s_node_ip }}
|
||||
--flannel-iface={{ k3s_flannel_iface }}
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
register: k3s_agent_install
|
||||
until: k3s_agent_install.rc == 0
|
||||
retries: 3
|
||||
delay: 20
|
||||
|
||||
- name: Wait for k3s agent to be ready
|
||||
command: systemctl is-active k3s-agent
|
||||
@@ -28,3 +70,34 @@
|
||||
retries: 30
|
||||
delay: 10
|
||||
changed_when: false
|
||||
|
||||
- name: Show k3s-agent service status on failure
|
||||
command: systemctl status k3s-agent --no-pager
|
||||
register: k3s_agent_status
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: agent_status is failed
|
||||
|
||||
- name: Show recent k3s-agent logs on failure
|
||||
command: journalctl -u k3s-agent -n 120 --no-pager
|
||||
register: k3s_agent_journal
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: agent_status is failed
|
||||
|
||||
- name: Fail with k3s-agent diagnostics
|
||||
fail:
|
||||
msg: |
|
||||
k3s agent failed to become ready on {{ inventory_hostname }}.
|
||||
Install stdout:
|
||||
{{ k3s_agent_install.stdout | default('n/a') }}
|
||||
|
||||
Install stderr:
|
||||
{{ k3s_agent_install.stderr | default('n/a') }}
|
||||
|
||||
Service status:
|
||||
{{ k3s_agent_status.stdout | default('n/a') }}
|
||||
|
||||
Recent logs:
|
||||
{{ k3s_agent_journal.stdout | default('n/a') }}
|
||||
when: agent_status is failed
|
||||
|
||||
@@ -1,5 +1,17 @@
|
||||
---
|
||||
k3s_version: latest
|
||||
k3s_version: v1.34.6+k3s1
|
||||
k3s_token: ""
|
||||
k3s_node_ip: ""
|
||||
k3s_primary_public_ip: ""
|
||||
k3s_disable_embedded_ccm: false
|
||||
k3s_disable_servicelb: true
|
||||
k3s_kubelet_cloud_provider_external: false
|
||||
k3s_flannel_iface: "{{ ansible_default_ipv4.interface | default('eth0') }}"
|
||||
# Load Balancer endpoint for HA cluster joins (set in inventory)
|
||||
kube_api_endpoint: ""
|
||||
# Tailscale DNS names for control planes (to enable tailnet access)
|
||||
# Using DNS names instead of IPs since Tailscale IPs change on rebuild
|
||||
tailscale_control_plane_names:
|
||||
- "k8s-cluster-cp-1.silverside-gopher.ts.net"
|
||||
- "k8s-cluster-cp-2.silverside-gopher.ts.net"
|
||||
- "k8s-cluster-cp-3.silverside-gopher.ts.net"
|
||||
|
||||
@@ -11,13 +11,25 @@
|
||||
failed_when: false
|
||||
when: k3s_service.stat.exists
|
||||
|
||||
- name: Check installed k3s version
|
||||
command: k3s --version
|
||||
register: installed_k3s_version
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: k3s_service.stat.exists
|
||||
|
||||
- name: Determine whether k3s install is needed
|
||||
set_fact:
|
||||
k3s_install_needed: "{{ (not k3s_service.stat.exists) or ((k3s_service_state.stdout | default('')) != 'active') }}"
|
||||
k3s_install_needed: >-
|
||||
{{
|
||||
(not k3s_service.stat.exists)
|
||||
or ((k3s_service_state.stdout | default('')) != 'active')
|
||||
or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default('')))
|
||||
}}
|
||||
|
||||
- name: Wait for primary API on 6443 (secondary only)
|
||||
- name: Wait for API endpoint on 6443 (secondary only)
|
||||
wait_for:
|
||||
host: "{{ k3s_primary_ip }}"
|
||||
host: "{{ k3s_join_endpoint | default(k3s_primary_ip) }}"
|
||||
port: 6443
|
||||
state: started
|
||||
timeout: 120
|
||||
@@ -28,41 +40,56 @@
|
||||
stat:
|
||||
path: /usr/local/bin/k3s-uninstall.sh
|
||||
register: k3s_uninstall_script
|
||||
when:
|
||||
- not (k3s_primary | default(false))
|
||||
- k3s_install_needed
|
||||
when: k3s_install_needed
|
||||
|
||||
- name: Reset broken secondary k3s install before rejoin
|
||||
- name: Reset broken k3s install before reinstall
|
||||
command: /usr/local/bin/k3s-uninstall.sh
|
||||
when:
|
||||
- not (k3s_primary | default(false))
|
||||
- k3s_install_needed
|
||||
- k3s_uninstall_script.stat.exists
|
||||
|
||||
- name: Remove stale k3s data on secondary
|
||||
- name: Remove stale k3s data
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- /etc/rancher/k3s
|
||||
- /var/lib/rancher/k3s
|
||||
when:
|
||||
- not (k3s_primary | default(false))
|
||||
- k3s_install_needed
|
||||
when: k3s_install_needed
|
||||
|
||||
- name: Download k3s install script
|
||||
get_url:
|
||||
url: https://get.k3s.io
|
||||
dest: /tmp/install-k3s.sh
|
||||
mode: "0755"
|
||||
register: k3s_install_script
|
||||
until: k3s_install_script is succeeded
|
||||
retries: 5
|
||||
delay: 10
|
||||
when: k3s_install_needed
|
||||
|
||||
- name: Install k3s server (primary)
|
||||
environment:
|
||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||
K3S_TOKEN: "{{ k3s_token }}"
|
||||
command: /tmp/install-k3s.sh server --cluster-init --advertise-address={{ k3s_primary_ip }} --node-ip={{ k3s_node_ip }} --tls-san={{ k3s_primary_ip }} --tls-san={{ k3s_primary_public_ip }}
|
||||
when:
|
||||
command: >-
|
||||
/tmp/install-k3s.sh server
|
||||
--cluster-init
|
||||
--advertise-address={{ k3s_primary_ip }}
|
||||
--node-ip={{ k3s_node_ip }}
|
||||
--flannel-iface={{ k3s_flannel_iface }}
|
||||
--tls-san={{ k3s_primary_ip }}
|
||||
--tls-san={{ k3s_primary_public_ip }}
|
||||
--tls-san={{ kube_api_endpoint }}
|
||||
{% for name in tailscale_control_plane_names %}--tls-san={{ name }} {% endfor %}
|
||||
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
|
||||
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
register: primary_install
|
||||
until: primary_install.rc == 0
|
||||
retries: 3
|
||||
delay: 20
|
||||
when:
|
||||
- k3s_install_needed
|
||||
- k3s_primary | default(false)
|
||||
|
||||
@@ -75,37 +102,19 @@
|
||||
environment:
|
||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||
K3S_TOKEN: "{{ k3s_token }}"
|
||||
command: /tmp/install-k3s.sh server --server https://{{ k3s_primary_ip }}:6443 --advertise-address={{ k3s_node_ip }} --node-ip={{ k3s_node_ip }}
|
||||
command: >-
|
||||
/tmp/install-k3s.sh server
|
||||
--server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
|
||||
--advertise-address={{ k3s_node_ip }}
|
||||
--node-ip={{ k3s_node_ip }}
|
||||
--flannel-iface={{ k3s_flannel_iface }}
|
||||
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
|
||||
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
register: secondary_install
|
||||
|
||||
rescue:
|
||||
- name: Show k3s service status after failed secondary install
|
||||
command: systemctl status k3s --no-pager
|
||||
register: k3s_status_after_install
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Show recent k3s logs after failed secondary install
|
||||
command: journalctl -u k3s -n 120 --no-pager
|
||||
register: k3s_journal_after_install
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Fail with secondary install diagnostics
|
||||
fail:
|
||||
msg: |
|
||||
Secondary k3s install failed on {{ inventory_hostname }}.
|
||||
Install stdout:
|
||||
{{ secondary_install.stdout | default('n/a') }}
|
||||
|
||||
Install stderr:
|
||||
{{ secondary_install.stderr | default('n/a') }}
|
||||
|
||||
Service status:
|
||||
{{ k3s_status_after_install.stdout | default('n/a') }}
|
||||
|
||||
Recent logs:
|
||||
{{ k3s_journal_after_install.stdout | default('n/a') }}
|
||||
until: secondary_install.rc == 0
|
||||
retries: 3
|
||||
delay: 20
|
||||
|
||||
- name: Wait for k3s to be ready
|
||||
command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}"
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
---
|
||||
kube_vip_version: v1.1.2
|
||||
kube_vip_interface: "{{ ansible_default_ipv4.interface | default('eth0') }}"
|
||||
kube_vip_address: "{{ kube_api_endpoint }}"
|
||||
kube_vip_prepull_images:
|
||||
- docker.io/rancher/mirrored-pause:3.6
|
||||
- ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
|
||||
@@ -0,0 +1,102 @@
|
||||
---
|
||||
- name: Check for runner-provided kube-vip image archive
|
||||
stat:
|
||||
path: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar"
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
register: kube_vip_bootstrap_archive
|
||||
|
||||
- name: Copy runner-provided kube-vip image archive
|
||||
copy:
|
||||
src: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar"
|
||||
dest: /tmp/kube-vip-bootstrap.tar
|
||||
mode: "0644"
|
||||
when: kube_vip_bootstrap_archive.stat.exists
|
||||
|
||||
- name: Import runner-provided kube-vip image archive
|
||||
command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
|
||||
changed_when: false
|
||||
when: kube_vip_bootstrap_archive.stat.exists
|
||||
|
||||
- name: Pre-pull kube-vip bootstrap images into containerd
|
||||
shell: |
|
||||
if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
|
||||
echo "already present"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if timeout 120s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
|
||||
echo "pulled image"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
sleep 10
|
||||
done
|
||||
|
||||
exit 1
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: kube_vip_image_pull
|
||||
loop: "{{ kube_vip_prepull_images }}"
|
||||
changed_when: "'pulled image' in kube_vip_image_pull.stdout"
|
||||
|
||||
- name: Render kube-vip control plane manifest
|
||||
template:
|
||||
src: kube-vip-control-plane.yaml.j2
|
||||
dest: /tmp/kube-vip-control-plane.yaml
|
||||
mode: "0644"
|
||||
|
||||
- name: Apply kube-vip control plane manifest
|
||||
command: kubectl apply -f /tmp/kube-vip-control-plane.yaml
|
||||
register: kube_vip_apply
|
||||
until: kube_vip_apply.rc == 0
|
||||
retries: 3
|
||||
delay: 10
|
||||
changed_when: true
|
||||
|
||||
- name: Wait for local kube-vip pod to be ready
|
||||
shell: >-
|
||||
kubectl -n kube-system get pods
|
||||
-l app.kubernetes.io/name=kube-vip
|
||||
--field-selector spec.nodeName={{ inventory_hostname }}
|
||||
-o jsonpath='{.items[0].status.conditions[?(@.type=="Ready")].status}'
|
||||
register: kube_vip_pod_ready
|
||||
changed_when: false
|
||||
until: kube_vip_pod_ready.stdout == "True"
|
||||
retries: 30
|
||||
delay: 10
|
||||
|
||||
- name: Show kube-vip pod status on failure
|
||||
command: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide
|
||||
register: kube_vip_pods
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: kube_vip_pod_ready is failed
|
||||
|
||||
- name: Describe kube-vip pod on failure
|
||||
shell: >-
|
||||
kubectl -n kube-system describe pod
|
||||
$(kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip --field-selector spec.nodeName={{ inventory_hostname }} -o jsonpath='{.items[0].metadata.name}')
|
||||
register: kube_vip_pod_describe
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: kube_vip_pod_ready is failed
|
||||
|
||||
- name: Fail with kube-vip diagnostics
|
||||
fail:
|
||||
msg: |
|
||||
kube-vip failed to become ready on {{ inventory_hostname }}.
|
||||
Pods:
|
||||
{{ kube_vip_pods.stdout | default('n/a') }}
|
||||
|
||||
Describe:
|
||||
{{ kube_vip_pod_describe.stdout | default('n/a') }}
|
||||
when: kube_vip_pod_ready is failed
|
||||
|
||||
- name: Wait for API VIP on 6443
|
||||
wait_for:
|
||||
host: "{{ kube_vip_address }}"
|
||||
port: 6443
|
||||
state: started
|
||||
timeout: 180
|
||||
@@ -0,0 +1,110 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: kube-vip
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: system:kube-vip-role
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["services/status"]
|
||||
verbs: ["update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["services", "endpoints"]
|
||||
verbs: ["list", "get", "watch", "update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes"]
|
||||
verbs: ["list", "get", "watch", "update", "patch"]
|
||||
- apiGroups: ["coordination.k8s.io"]
|
||||
resources: ["leases"]
|
||||
verbs: ["list", "get", "watch", "update", "create"]
|
||||
- apiGroups: ["discovery.k8s.io"]
|
||||
resources: ["endpointslices"]
|
||||
verbs: ["list", "get", "watch", "update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods"]
|
||||
verbs: ["list"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: system:kube-vip-binding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: system:kube-vip-role
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: kube-vip
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: kube-vip
|
||||
namespace: kube-system
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: kube-vip
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-vip
|
||||
spec:
|
||||
serviceAccountName: kube-vip
|
||||
hostNetwork: true
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
operator: Exists
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
- key: node-role.kubernetes.io/master
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- name: kube-vip
|
||||
image: ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- manager
|
||||
env:
|
||||
- name: vip_arp
|
||||
value: "true"
|
||||
- name: port
|
||||
value: "6443"
|
||||
- name: vip_interface
|
||||
value: {{ kube_vip_interface | quote }}
|
||||
- name: vip_subnet
|
||||
value: "32"
|
||||
- name: cp_enable
|
||||
value: "true"
|
||||
- name: cp_namespace
|
||||
value: kube-system
|
||||
- name: vip_ddns
|
||||
value: "false"
|
||||
- name: vip_leaderelection
|
||||
value: "true"
|
||||
- name: vip_leaseduration
|
||||
value: "5"
|
||||
- name: vip_renewdeadline
|
||||
value: "3"
|
||||
- name: vip_retryperiod
|
||||
value: "1"
|
||||
- name: address
|
||||
value: {{ kube_vip_address | quote }}
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- NET_ADMIN
|
||||
- NET_RAW
|
||||
- SYS_TIME
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
observability_namespace: "observability"
|
||||
grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
|
||||
grafana_datasource_configmap_name: "grafana-datasources-core"
|
||||
loki_enabled: true
|
||||
grafana_prometheus_url: "http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090"
|
||||
grafana_loki_url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100"
|
||||
grafana_use_prometheus_nodeport_fallback: true
|
||||
grafana_use_loki_nodeport_fallback: true
|
||||
@@ -0,0 +1,178 @@
|
||||
---
|
||||
- name: Ensure observability namespace exists
|
||||
command: kubectl create namespace {{ observability_namespace }}
|
||||
register: create_observability_ns
|
||||
failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
|
||||
changed_when: create_observability_ns.rc == 0
|
||||
|
||||
- name: Wait for Grafana deployment rollout
|
||||
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
|
||||
changed_when: false
|
||||
|
||||
- name: Set default Prometheus datasource URL
|
||||
set_fact:
|
||||
grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
|
||||
grafana_loki_effective_url: "{{ grafana_loki_url }}"
|
||||
|
||||
- name: Get Grafana pod name
|
||||
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}'
|
||||
register: grafana_pod_name
|
||||
changed_when: false
|
||||
|
||||
- name: Probe Prometheus from Grafana pod via default datasource URL
|
||||
shell: >-
|
||||
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
|
||||
sh -c 'wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null'
|
||||
register: grafana_prometheus_probe
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Probe Loki from Grafana pod via default datasource URL
|
||||
shell: >-
|
||||
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
|
||||
sh -c 'wget -qO- --timeout=5 {{ grafana_loki_url }}/ready >/dev/null'
|
||||
register: grafana_loki_probe
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: loki_enabled
|
||||
|
||||
- name: Get Prometheus pod host IP for fallback
|
||||
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}'
|
||||
register: prometheus_host_ip
|
||||
changed_when: false
|
||||
when:
|
||||
- grafana_use_prometheus_nodeport_fallback | bool
|
||||
- grafana_prometheus_probe.rc != 0
|
||||
|
||||
- name: Get Prometheus service NodePort for fallback
|
||||
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
|
||||
register: prometheus_nodeport
|
||||
changed_when: false
|
||||
when:
|
||||
- grafana_use_prometheus_nodeport_fallback | bool
|
||||
- grafana_prometheus_probe.rc != 0
|
||||
|
||||
- name: Enable Prometheus NodePort fallback datasource URL
|
||||
set_fact:
|
||||
grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
|
||||
when:
|
||||
- grafana_use_prometheus_nodeport_fallback | bool
|
||||
- grafana_prometheus_probe.rc != 0
|
||||
- prometheus_host_ip.stdout | length > 0
|
||||
- prometheus_nodeport.stdout | length > 0
|
||||
|
||||
- name: Ensure Loki service uses NodePort for fallback
|
||||
command: kubectl -n {{ observability_namespace }} patch svc loki -p '{"spec":{"type":"NodePort"}}'
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when:
|
||||
- loki_enabled
|
||||
- grafana_use_loki_nodeport_fallback | bool
|
||||
- grafana_loki_probe.rc != 0
|
||||
|
||||
- name: Get Loki pod host IP for fallback
|
||||
command: kubectl -n {{ observability_namespace }} get pod loki-0 -o jsonpath='{.status.hostIP}'
|
||||
register: loki_host_ip
|
||||
changed_when: false
|
||||
when:
|
||||
- loki_enabled
|
||||
- grafana_use_loki_nodeport_fallback | bool
|
||||
- grafana_loki_probe.rc != 0
|
||||
|
||||
- name: Get Loki service NodePort for fallback
|
||||
command: kubectl -n {{ observability_namespace }} get svc loki -o jsonpath='{.spec.ports[?(@.name=="http-metrics")].nodePort}'
|
||||
register: loki_nodeport
|
||||
changed_when: false
|
||||
when:
|
||||
- loki_enabled
|
||||
- grafana_use_loki_nodeport_fallback | bool
|
||||
- grafana_loki_probe.rc != 0
|
||||
|
||||
- name: Enable Loki NodePort fallback datasource URL
|
||||
set_fact:
|
||||
grafana_loki_effective_url: "http://{{ loki_host_ip.stdout }}:{{ loki_nodeport.stdout }}"
|
||||
when:
|
||||
- loki_enabled
|
||||
- grafana_use_loki_nodeport_fallback | bool
|
||||
- grafana_loki_probe.rc != 0
|
||||
- loki_host_ip.stdout | length > 0
|
||||
- loki_nodeport.stdout | length > 0
|
||||
|
||||
- name: Query Loki labels endpoint from Grafana pod
|
||||
shell: >-
|
||||
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
|
||||
sh -c 'wget -qO- --timeout=10 {{ grafana_loki_effective_url }}/loki/api/v1/labels'
|
||||
register: grafana_loki_labels
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
until: >-
|
||||
grafana_loki_labels.rc != 0 or
|
||||
'"data":[]' not in (grafana_loki_labels.stdout | replace(' ', ''))
|
||||
retries: 30
|
||||
delay: 10
|
||||
when: loki_enabled
|
||||
|
||||
- name: Fail when Loki is reachable but has zero indexed labels
|
||||
fail:
|
||||
msg: >-
|
||||
Loki is reachable from Grafana at {{ grafana_loki_effective_url }} but /loki/api/v1/labels returned no labels.
|
||||
This usually means no logs are ingested yet. Check Promtail and tenant configuration.
|
||||
when:
|
||||
- loki_enabled
|
||||
- grafana_loki_labels.rc == 0
|
||||
- "'\"status\":\"success\"' in (grafana_loki_labels.stdout | replace(' ', ''))"
|
||||
- "'\"data\":[]' in (grafana_loki_labels.stdout | replace(' ', ''))"
|
||||
|
||||
- name: Write default Prometheus datasource ConfigMap patch
|
||||
template:
|
||||
src: grafana-default-prometheus-datasource.yaml.j2
|
||||
dest: /tmp/grafana-default-prometheus-datasource.yaml
|
||||
mode: "0644"
|
||||
|
||||
- name: Apply default Prometheus datasource ConfigMap patch
|
||||
command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
|
||||
changed_when: true
|
||||
|
||||
- name: Remove legacy Loki datasource ConfigMap
|
||||
command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Write Grafana datasources ConfigMap
|
||||
template:
|
||||
src: grafana-datasources.yaml.j2
|
||||
dest: /tmp/grafana-datasources.yaml
|
||||
mode: "0644"
|
||||
when: loki_enabled
|
||||
|
||||
- name: Apply Grafana datasources ConfigMap
|
||||
command: kubectl apply -f /tmp/grafana-datasources.yaml
|
||||
changed_when: true
|
||||
when: loki_enabled
|
||||
|
||||
- name: Restart Grafana to load datasource updates deterministically
|
||||
command: kubectl -n {{ observability_namespace }} rollout restart deployment/kube-prometheus-stack-grafana
|
||||
changed_when: true
|
||||
|
||||
- name: Wait for Grafana rollout after datasource update
|
||||
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
|
||||
changed_when: false
|
||||
|
||||
- name: Write Grafana dashboard ConfigMap
|
||||
template:
|
||||
src: grafana-dashboard-k8s-overview.yaml.j2
|
||||
dest: /tmp/grafana-dashboard-k8s-overview.yaml
|
||||
mode: "0644"
|
||||
|
||||
- name: Apply Grafana dashboard ConfigMap
|
||||
command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
|
||||
changed_when: true
|
||||
|
||||
- name: Show Grafana content provisioning summary
|
||||
debug:
|
||||
msg: |
|
||||
Grafana content applied.
|
||||
Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
|
||||
Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
|
||||
Loki datasource URL: {{ grafana_loki_effective_url }}
|
||||
Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
|
||||
@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ grafana_dashboard_configmap_name }}
  namespace: {{ observability_namespace }}
  labels:
    grafana_dashboard: "1"
data:
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
          "targets": [
            {
              "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {"list": []},
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }
@@ -0,0 +1,18 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ grafana_datasource_configmap_name }}
  namespace: {{ observability_namespace }}
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
{% if loki_enabled %}
      - name: Loki
        type: loki
        access: proxy
        url: "{{ grafana_loki_effective_url }}"
        isDefault: false
{% endif %}
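For reference, a sketch of what this template renders to when `loki_enabled` is true. The ConfigMap name and the Loki URL below are illustrative assumptions (the URL form matches the legacy Loki datasource ConfigMap elsewhere in this diff), not values taken from the repository:

```yaml
# Illustrative rendering only; the name and URL are assumed example values.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: observability
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: "http://loki.observability.svc.cluster.local:3100"
        isDefault: false
```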
@@ -0,0 +1,26 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: kube-prometheus-stack-grafana-datasource
  namespace: {{ observability_namespace }}
data:
  datasource.yaml: |-
    apiVersion: 1
    datasources:
      - name: "Prometheus"
        type: prometheus
        uid: prometheus
        url: "{{ grafana_prometheus_effective_url }}/"
        access: proxy
        isDefault: true
        jsonData:
          httpMethod: POST
          timeInterval: 30s
      - name: "Alertmanager"
        type: alertmanager
        uid: alertmanager
        url: http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/
        access: proxy
        jsonData:
          handleGrafanaManagedAlerts: false
          implementation: prometheus
@@ -0,0 +1,27 @@
---
observability_namespace: "observability"

prometheus_chart_version: "68.4.4"
loki_chart_version: "6.10.0"
promtail_chart_version: "6.16.6"

grafana_admin_password: ""

prometheus_storage_size: "10Gi"
grafana_storage_size: "5Gi"
loki_storage_size: "10Gi"

prometheus_storage_class: "local-path"
grafana_storage_class: "local-path"
loki_storage_class: "local-path"

loki_enabled: true

tailscale_oauth_client_id: ""
tailscale_oauth_client_secret: ""
tailscale_tailnet: ""

observability_tailscale_expose: true
grafana_tailscale_hostname: "grafana"
prometheus_tailscale_hostname: "prometheus"
tailscale_proxyclass_name: "infra-stable"
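These role defaults can be overridden per environment in inventory. A minimal group_vars sketch under that assumption; the values shown are placeholders, not the repository's real settings:

```yaml
# group_vars/cluster.yml -- illustrative overrides only
grafana_admin_password: "change-me"     # empty default means a random password is generated at run time
loki_enabled: true
observability_tailscale_expose: false   # fall back to ClusterIP services instead of Tailscale LoadBalancers
prometheus_storage_class: "flash-nfs"   # example: reuse the NFS StorageClass defined later in this diff
```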
@@ -0,0 +1,252 @@
---
- name: Check if Helm is installed
  command: helm version --short
  register: helm_check
  changed_when: false
  failed_when: false

- name: Install Helm
  shell: curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  when: helm_check.rc != 0
  changed_when: true

- name: Ensure observability namespace exists
  command: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
  changed_when: create_observability_ns.rc == 0

- name: Set Grafana admin password
  set_fact:
    grafana_password_effective: "{{ grafana_admin_password if grafana_admin_password | length > 0 else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"

- name: Write kube-prometheus-stack values
  template:
    src: kube-prometheus-stack-values.yaml.j2
    dest: /tmp/kube-prometheus-stack-values.yaml
    mode: "0644"

- name: Add Prometheus Helm repo
  command: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  register: add_prom_repo
  failed_when: add_prom_repo.rc != 0 and "already exists" not in add_prom_repo.stderr
  changed_when: add_prom_repo.rc == 0

- name: Add Grafana Helm repo
  command: helm repo add grafana https://grafana.github.io/helm-charts
  register: add_grafana_repo
  failed_when: add_grafana_repo.rc != 0 and "already exists" not in add_grafana_repo.stderr
  changed_when: add_grafana_repo.rc == 0

- name: Update Helm repos
  command: helm repo update
  changed_when: false

- name: Clear stale pending Helm revision secrets for kube-prometheus-stack
  shell: >-
    kubectl -n {{ observability_namespace }} delete
    $(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-upgrade -o name)
    --ignore-not-found=true;
    kubectl -n {{ observability_namespace }} delete
    $(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-install -o name)
    --ignore-not-found=true;
    kubectl -n {{ observability_namespace }} delete
    $(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-rollback -o name)
    --ignore-not-found=true
  changed_when: false
  failed_when: false

- name: Install kube-prometheus-stack
  command: >-
    helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack
    --namespace {{ observability_namespace }}
    --version {{ prometheus_chart_version }}
    --values /tmp/kube-prometheus-stack-values.yaml
    --wait
    --timeout 10m
  register: kube_prom_install
  retries: 12
  delay: 15
  until: kube_prom_install.rc == 0
  changed_when: true

- name: Wait for Grafana deployment rollout
  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false

- name: Reset Grafana admin password in Grafana database
  shell: >-
    kubectl -n {{ observability_namespace }} exec
    "$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
    -c grafana -- grafana cli admin reset-admin-password '{{ grafana_password_effective }}'
  changed_when: true

- name: Write Loki values
  template:
    src: loki-values.yaml.j2
    dest: /tmp/loki-values.yaml
    mode: "0644"
  when: loki_enabled

- name: Validate Loki chart produces resources
  command: >-
    helm template loki grafana/loki
    --namespace {{ observability_namespace }}
    --version {{ loki_chart_version }}
    --values /tmp/loki-values.yaml
  register: loki_template
  changed_when: false
  failed_when: "loki_template.rc != 0 or 'kind: StatefulSet' not in loki_template.stdout"
  when: loki_enabled

- name: Remove legacy Loki resources
  command: >-
    kubectl -n {{ observability_namespace }} delete
    deployment/loki-gateway
    statefulset/loki
    statefulset/loki-chunks-cache
    statefulset/loki-results-cache
    statefulset/loki-backend
    statefulset/loki-read
    statefulset/loki-write
    poddisruptionbudget/loki-memcached-chunks-cache
    poddisruptionbudget/loki-memcached-results-cache
    --ignore-not-found=true
  changed_when: false
  failed_when: false
  when: loki_enabled

- name: Clear stuck Helm lock for Loki
  command: kubectl -n {{ observability_namespace }} delete secret sh.helm.release.v1.loki.v1 --ignore-not-found=true
  changed_when: false
  failed_when: false
  when: loki_enabled

- name: Uninstall failed Loki release (if stuck)
  command: helm uninstall loki -n {{ observability_namespace }}
  changed_when: false
  failed_when: false
  when: loki_enabled

- name: Install Loki
  command: >-
    helm upgrade --install loki grafana/loki
    --namespace {{ observability_namespace }}
    --version {{ loki_chart_version }}
    --values /tmp/loki-values.yaml
  register: loki_install
  changed_when: true
  when: loki_enabled

- name: Wait for Loki StatefulSet
  command: kubectl -n {{ observability_namespace }} rollout status statefulset/loki --timeout=10m
  register: loki_rollout
  changed_when: false
  when: loki_enabled

- name: Show Loki pod status
  command: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o wide
  register: loki_pods
  changed_when: false
  when: loki_enabled

- name: Debug Loki pods
  debug:
    msg: "{{ loki_pods.stdout }}"
  when: loki_enabled

- name: Write Promtail values
  template:
    src: promtail-values.yaml.j2
    dest: /tmp/promtail-values.yaml
    mode: "0644"
  when: loki_enabled

- name: Install Promtail
  command: >-
    helm upgrade --install promtail grafana/promtail
    --namespace {{ observability_namespace }}
    --version {{ promtail_chart_version }}
    --values /tmp/promtail-values.yaml
    --wait
    --timeout 10m
  changed_when: true
  when: loki_enabled

- name: Check Tailscale service readiness for Grafana
  command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
  register: grafana_tailscale_ready
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Check Tailscale service readiness for Prometheus
  command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
  register: prometheus_tailscale_ready
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Check Tailscale endpoint (IP/hostname) for Grafana
  shell: >-
    kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
    -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
  register: grafana_lb_ip
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Check Tailscale endpoint (IP/hostname) for Prometheus
  shell: >-
    kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
    -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
  register: prometheus_lb_ip
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Show Tailscale access details
  debug:
    msg: |
      Observability stack deployed with Tailscale access!

      Grafana: http://{{ grafana_tailscale_hostname }}{% if grafana_lb_ip.stdout | default('') | length > 0 %} (or http://{{ grafana_lb_ip.stdout }}){% endif %}
      Prometheus: http://{{ prometheus_tailscale_hostname }}{% if prometheus_lb_ip.stdout | default('') | length > 0 %} (or http://{{ prometheus_lb_ip.stdout }}){% endif %}

      Login: admin / {{ grafana_password_effective }}

      Tailscale readiness:
      - Grafana proxy ready: {{ grafana_tailscale_ready.stdout | default('pending') }}
      - Prometheus proxy ready: {{ prometheus_tailscale_ready.stdout | default('pending') }}

      Access via:
      - MagicDNS: http://{{ grafana_tailscale_hostname }} and http://{{ prometheus_tailscale_hostname }}
      - Tailnet FQDN: http://{{ grafana_tailscale_hostname }}.{{ tailscale_tailnet | default('tailnet.ts.net') }}
      - Direct endpoint: {% if grafana_lb_ip.stdout | default('') | length > 0 %}http://{{ grafana_lb_ip.stdout }}{% else %}(pending){% endif %} / {% if prometheus_lb_ip.stdout | default('') | length > 0 %}http://{{ prometheus_lb_ip.stdout }}{% else %}(pending){% endif %}
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Show observability access details (fallback)
  debug:
    msg: |
      Observability stack deployed.
      Namespace: {{ observability_namespace }}
      Grafana (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
      Prometheus (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
      Grafana admin password: {{ grafana_password_effective }}
      {% if loki_enabled %}
      Loki: Enabled - logs available in Grafana
      {% else %}
      Loki: Disabled
      {% endif %}
  when:
    - not (observability_tailscale_expose | bool and (tailscale_operator_ready | default(false) | bool))
@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-loki
  namespace: {{ observability_namespace }}
  labels:
    grafana_datasource: "1"
data:
  loki-datasource.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
        isDefault: false
@@ -0,0 +1,46 @@
grafana:
  enabled: true
  adminPassword: {{ grafana_password_effective }}
  persistence:
    enabled: true
    storageClassName: {{ grafana_storage_class }}
    size: {{ grafana_storage_size }}
  service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
    type: LoadBalancer
    loadBalancerClass: tailscale
    annotations:
      tailscale.com/hostname: {{ grafana_tailscale_hostname }}
      tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
    type: ClusterIP
{% endif %}
prometheus:
  service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
    type: LoadBalancer
    loadBalancerClass: tailscale
    annotations:
      tailscale.com/hostname: {{ prometheus_tailscale_hostname }}
      tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
    type: ClusterIP
{% endif %}
  prometheusSpec:
    retention: 7d
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: {{ prometheus_storage_class }}
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: {{ prometheus_storage_size }}
alertmanager:
  enabled: false
kubeEtcd:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
@@ -0,0 +1,75 @@
deploymentMode: SingleBinary

loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      - from: "2024-04-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  storage:
    type: filesystem
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
    retention_period: 168h
  pattern_ingester:
    enabled: true
  ruler:
    enable_api: true

singleBinary:
  replicas: 1
  persistence:
    size: {{ loki_storage_size }}
    storageClass: {{ loki_storage_class }}
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 1Gi

backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0

gateway:
  enabled: false

test:
  enabled: false

monitoring:
  selfMonitoring:
    enabled: false
  lokiCanary:
    enabled: false
@@ -0,0 +1,3 @@
config:
  clients:
    - url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100/loki/api/v1/push
@@ -0,0 +1,6 @@
---
rancher_images_to_prepull:
  - docker.io/rancher/rancher:v2.13.3
  - docker.io/rancher/rancher-webhook:v0.9.3
  - docker.io/rancher/system-upgrade-controller:v0.17.0
  - docker.io/rancher/shell:v0.6.2
@@ -0,0 +1,59 @@
---
- name: Check for runner-provided Rancher image archives
  stat:
    path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
  delegate_to: localhost
  become: false
  register: rancher_image_archive_stats
  loop: "{{ rancher_images_to_prepull }}"

- name: Ensure remote Rancher image archive directory exists
  file:
    path: /tmp/bootstrap-image-archives
    state: directory
    mode: "0755"

- name: Copy runner-provided Rancher image archives
  copy:
    src: "{{ item.stat.path }}"
    dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar"
    mode: "0644"
  loop: "{{ rancher_image_archive_stats.results }}"
  loop_control:
    label: "{{ item.item }}"
  when: item.stat.exists

- name: Import or pull Rancher images into containerd
  shell: |
    if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
      echo "already present"
      exit 0
    fi

    archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
    if [ -s "${archive}" ]; then
      for attempt in 1 2 3; do
        if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
          echo "imported image"
          exit 0
        fi

        sleep 10
      done
    fi

    for attempt in 1 2 3 4 5; do
      if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
        echo "pulled image"
        exit 0
      fi

      sleep 10
    done

    exit 1
  args:
    executable: /bin/bash
  register: rancher_image_pull
  loop: "{{ rancher_images_to_prepull }}"
  changed_when: "'imported image' in rancher_image_pull.stdout or 'pulled image' in rancher_image_pull.stdout"
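The stat/copy tasks above expect archives to already exist on the runner under `outputs/bootstrap-image-archives/`. A hedged sketch of a runner-side task that could produce them, assuming Docker is available on the runner and the images have already been pulled there; the filename mirrors the `regex_replace('[/:]', '_')` convention used above:

```yaml
# Illustrative runner-side task only; assumes Docker is installed and the images are present locally.
- name: Export Rancher images to bootstrap archives (runner side)
  command: >-
    docker save -o
    "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
    "{{ item }}"
  delegate_to: localhost
  become: false
  loop: "{{ rancher_images_to_prepull }}"
  changed_when: true
```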
@@ -0,0 +1,61 @@
---
- name: Delete stale Tailscale devices with reserved hostnames
  block:
    - name: Get Tailscale devices from API
      uri:
        url: "https://api.tailscale.com/api/v2/tailnet/{{ tailscale_tailnet }}/devices"
        method: GET
        headers:
          Authorization: "Bearer {{ tailscale_api_key }}"
        return_content: true
      register: ts_devices
      until: ts_devices.status == 200
      retries: 5
      delay: 10

    - name: Find stale devices matching reserved hostnames
      set_fact:
        stale_devices: >-
          {{ (ts_devices.json.devices | default([])
          | selectattr('hostname', 'defined')
          | selectattr('hostname', 'in', tailscale_reserved_hostnames)
          | selectattr('connectedToControl', 'defined')
          | rejectattr('connectedToControl', 'equalto', true)
          | list
          +
          ts_devices.json.devices | default([])
          | selectattr('hostname', 'defined')
          | selectattr('hostname', 'in', tailscale_reserved_hostnames)
          | selectattr('online', 'defined')
          | rejectattr('online', 'equalto', true)
          | list) | unique(attribute='id') | list }}

    - name: Delete stale devices
      uri:
        url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
        method: DELETE
        headers:
          Authorization: "Bearer {{ tailscale_api_key }}"
        status_code: 200
      register: ts_delete_device
      until: ts_delete_device.status == 200
      retries: 3
      delay: 5
      loop: "{{ stale_devices }}"
      loop_control:
        label: "{{ item.name }} ({{ item.id }})"
      when: stale_devices | length > 0

    - name: Report cleaned devices
      debug:
        msg: "Deleted stale Tailscale device: {{ item.name }}"
      loop: "{{ stale_devices }}"
      when: stale_devices | length > 0

    - name: No stale devices found
      debug:
        msg: "No stale Tailscale devices found."
      when: stale_devices | length == 0
  when:
    - tailscale_api_key is defined
    - tailscale_api_key | length > 0
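The selectattr/rejectattr chain above only works if each element of `ts_devices.json.devices` carries the fields it filters on. A trimmed sketch of the shape this role assumes (not a verbatim API payload):

```yaml
# Trimmed sketch of one devices[] element as the filters above expect it.
devices:
  - id: "1234567890"            # used to build the DELETE URL
    name: "grafana.tailnet.ts.net"
    hostname: "grafana"          # matched against tailscale_reserved_hostnames
    connectedToControl: false    # devices not connected to control are treated as stale
    online: false                # alternative field checked by the second filter branch
```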
@@ -1,14 +1,26 @@
---
- name: Clean up stale Tailscale cluster node devices
  hosts: localhost
  connection: local
  vars:
    tailscale_reserved_hostnames: "{{ groups['cluster'] | default([]) | list }}"

  roles:
    - tailscale-cleanup

- name: Bootstrap Kubernetes cluster
  hosts: cluster
  become: true
  gather_facts: true
  gather_facts: false

  pre_tasks:
    - name: Wait for SSH
      wait_for_connection:
        delay: 10
        timeout: 300
        timeout: 600

    - name: Gather facts after SSH is reachable
      setup:

  roles:
    - common
@@ -24,6 +36,7 @@
    k3s_primary_public_ip: "{{ ansible_host }}"
    k3s_primary_ip: "{{ k3s_private_ip }}"
    k3s_node_ip: "{{ k3s_private_ip }}"
    # kube_api_endpoint is set in inventory group_vars

  roles:
    - k3s-server
@@ -49,6 +62,32 @@
        dest: ../outputs/kubeconfig
        flat: true

- name: Bootstrap addon prerequisite secrets
  hosts: control_plane[0]
  become: true

  roles:
    - addon-secrets-bootstrap

- name: Deploy kube-vip for API HA
  hosts: control_plane[0]
  become: true

  roles:
    - kube-vip-deploy

- name: Wait for Kubernetes API VIP readiness
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Wait for Kubernetes readyz through the VIP
      command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
      register: api_readyz
      until: api_readyz.rc == 0
      retries: 30
      delay: 10
      changed_when: false

- name: Setup secondary control planes
  hosts: control_plane[1:]
  become: true
@@ -59,44 +98,163 @@
    k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}"
    k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}"
    k3s_node_ip: "{{ k3s_private_ip }}"
    # Use Load Balancer for HA - all control planes join via LB endpoint
    k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}"

  roles:
    - k3s-server

- name: Export kube-vip image from primary control plane
  hosts: control_plane[0]
  become: true

  tasks:
    - name: Export kube-vip image for secondary control planes
      command: >-
        /usr/local/bin/ctr -n k8s.io images export
        /tmp/kube-vip-bootstrap.tar
        ghcr.io/kube-vip/kube-vip:v1.1.2
      changed_when: false

    - name: Fetch kube-vip image archive
      fetch:
        src: /tmp/kube-vip-bootstrap.tar
        dest: ../outputs/kube-vip-bootstrap.tar
        flat: true

- name: Seed kube-vip image on secondary control planes
  hosts: control_plane[1:]
  become: true

  tasks:
    - name: Copy kube-vip image archive
      copy:
        src: ../outputs/kube-vip-bootstrap.tar
        dest: /tmp/kube-vip-bootstrap.tar
        mode: "0644"

    - name: Import kube-vip image into containerd
      command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
      register: kube_vip_secondary_import
      until: kube_vip_secondary_import.rc == 0
      retries: 3
      delay: 10
      changed_when: false

- name: Wait for all control plane nodes to be Ready
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Wait for control plane node readiness
      command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
      register: control_plane_ready
      until: control_plane_ready.rc == 0
      retries: 20
      delay: 15
      changed_when: false
      loop: "{{ groups['control_plane'] }}"

    - name: Wait for Kubernetes readyz before worker joins
      command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
      register: api_readyz_before_workers
      until: api_readyz_before_workers.rc == 0
      retries: 30
      delay: 10
      changed_when: false

- name: Setup workers
  hosts: workers
  become: true

  vars:
    k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
    k3s_server_url: "https://{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}:6443"
    # Use Load Balancer for HA - workers join via LB endpoint
    k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}:6443"
    k3s_node_ip: "{{ k3s_private_ip }}"

  roles:
    - k3s-agent

- name: Deploy Hetzner CCM
- name: Pre-pull bootstrap control-plane images
  hosts: control_plane[0]
  become: true

  roles:
    - ccm
    - bootstrap-image-prepull

- name: Deploy Hetzner CSI
- name: Pre-pull Rancher bootstrap images
  hosts: workers
  become: true

  roles:
    - role: rancher-image-prepull
      when: rancher_image_prepull_enabled | default(false) | bool

- name: Deploy observability stack
  hosts: control_plane[0]
  become: true

  roles:
    - csi
    - role: observability
      when: not (observability_gitops_enabled | default(true) | bool)

- name: Provision Grafana content
  hosts: control_plane[0]
  become: true

  roles:
    - role: observability-content
      when: not (observability_gitops_enabled | default(true) | bool)

- name: Bootstrap Doppler access for External Secrets
  hosts: control_plane[0]
  become: true

  roles:
    - doppler-bootstrap

- name: Detect existing Tailscale service proxies
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Check for current Tailscale service hostnames
      command: kubectl get svc -A -o jsonpath='{range .items[*]}{.metadata.annotations.tailscale\.com/hostname}{"\n"}{end}'
      register: existing_tailscale_hostnames
      changed_when: false
      failed_when: false

- name: Clean up stale Tailscale devices
  hosts: localhost
  connection: local
  vars:
    tailscale_reserved_hostnames:
      - rancher
      - grafana
      - prometheus
      - flux
  tasks:
    - name: Delete stale devices only before service proxies exist
      include_role:
        name: tailscale-cleanup
      when: >-
        hostvars[groups['control_plane'][0]].existing_tailscale_hostnames.stdout_lines | default([])
        | intersect(tailscale_reserved_hostnames)
        | length == 0

- name: Finalize
  hosts: localhost
  connection: local
  tasks:
    - name: Check whether kubeconfig was fetched
      stat:
        path: ../outputs/kubeconfig
      register: kubeconfig_file

    - name: Update kubeconfig server address
      command: |
        sed -i 's/127.0.0.1/{{ groups["control_plane"][0] }}.{{ tailscale_tailnet }}/g' ../outputs/kubeconfig
        sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig
      changed_when: true
      when: kubeconfig_file.stat.exists

    - name: Display success message
      debug:
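This playbook relies on `cluster`, `control_plane`, and `workers` inventory groups, a per-host `k3s_private_ip`, and a group-level `kube_api_endpoint`. A minimal inventory sketch under those assumptions; the first hostname matches the node pinned elsewhere in this diff, everything else (hosts, addresses, VIP) is a placeholder:

```yaml
# Illustrative inventory only; hostnames, IPs, and the VIP are placeholders.
all:
  children:
    cluster:
      children:
        control_plane:
          hosts:
            k8s-cluster-cp-1: {ansible_host: 203.0.113.10, k3s_private_ip: 10.0.0.10}
            k8s-cluster-cp-2: {ansible_host: 203.0.113.11, k3s_private_ip: 10.0.0.11}
        workers:
          hosts:
            k8s-cluster-worker-1: {ansible_host: 203.0.113.20, k3s_private_ip: 10.0.0.20}
      vars:
        kube_api_endpoint: 10.0.0.100   # load-balancer / VIP address used for joins
```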
@@ -0,0 +1,3 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
@@ -0,0 +1,12 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: platform
  namespace: flux-system
spec:
  interval: 1m
  ref:
    branch: main
  url: ssh://git@64.176.189.59:2222/HomeInfra/HetznerTerra.git
  secretRef:
    name: flux-system
File diff suppressed because it is too large.
@@ -0,0 +1,59 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: source-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kustomize-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: helm-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: notification-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: apps
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./apps
  dependsOn:
    - name: infrastructure
  wait: true
  timeout: 5m
  suspend: true
@@ -0,0 +1,14 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: infrastructure
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure
  wait: false
  timeout: 5m
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - gotk-components.yaml
  - gitrepository-platform.yaml
  - kustomization-infrastructure.yaml
  - kustomization-apps.yaml
patchesStrategicMerge:
  - gotk-controller-cp1-patches.yaml
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - flux-system
@@ -0,0 +1,34 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: cert-manager
  namespace: flux-system
spec:
  interval: 10m
  timeout: 15m
  targetNamespace: cert-manager
  chart:
    spec:
      chart: ./infrastructure/charts/cert-manager
      sourceRef:
        kind: GitRepository
        name: platform
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    crds:
      enabled: true
    replicaCount: 1
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
      limits:
        cpu: 250m
        memory: 256Mi
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrelease-cert-manager.yaml
@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager
  labels:
    kustomize.toolkit.fluxcd.io/prune: disabled
@@ -0,0 +1,13 @@
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
  name: doppler-hetznerterra
spec:
  provider:
    doppler:
      auth:
        secretRef:
          dopplerToken:
            name: doppler-hetznerterra-service-token
            key: dopplerToken
            namespace: external-secrets
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - clustersecretstore-doppler-hetznerterra.yaml
@@ -0,0 +1,44 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: external-secrets
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: external-secrets
  chartRef:
    kind: OCIRepository
    name: external-secrets
    namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    installCRDs: true
    image:
      repository: oci.external-secrets.io/external-secrets/external-secrets
      tag: v2.1.0
      pullPolicy: IfNotPresent
    nodeSelector:
      kubernetes.io/hostname: k8s-cluster-cp-1
    webhook:
      failurePolicy: Ignore
      image:
        repository: oci.external-secrets.io/external-secrets/external-secrets
        tag: v2.1.0
        pullPolicy: IfNotPresent
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
    certController:
      image:
        repository: oci.external-secrets.io/external-secrets/external-secrets
        tag: v2.1.0
        pullPolicy: IfNotPresent
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
    serviceMonitor:
      enabled: false
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - ocirepository-external-secrets.yaml
  - helmrelease-external-secrets.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: external-secrets
@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
  name: external-secrets
  namespace: flux-system
spec:
  interval: 10m
  url: oci://ghcr.io/external-secrets/charts/external-secrets
  ref:
    tag: 2.1.0
  layerSelector:
    mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
    operation: copy
@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-cert-manager
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/cert-manager
  wait: true
  timeout: 20m
  suspend: false
@@ -0,0 +1,21 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-external-secrets-store
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/external-secrets-store
  dependsOn:
    - name: addon-external-secrets
  wait: false
  healthChecks:
    - apiVersion: external-secrets.io/v1
      kind: ClusterSecretStore
      name: doppler-hetznerterra
  timeout: 5m
  suspend: false
@@ -0,0 +1,28 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-external-secrets
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/external-secrets
  wait: false
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: external-secrets
      namespace: flux-system
    - apiVersion: apps/v1
      kind: Deployment
      name: external-secrets-external-secrets
      namespace: external-secrets
    - apiVersion: apps/v1
      kind: Deployment
      name: external-secrets-external-secrets-webhook
      namespace: external-secrets
  timeout: 10m
  suspend: false
@@ -0,0 +1,20 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-nfs-storage
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/nfs-storage
  wait: true
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: nfs-subdir-external-provisioner
      namespace: kube-system
  timeout: 10m
  suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability-content
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability-content
  dependsOn:
    - name: addon-observability
  wait: true
  timeout: 5m
  suspend: false
@@ -0,0 +1,26 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability-secrets
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability-secrets
  dependsOn:
    - name: addon-external-secrets-store
  wait: false
  healthChecks:
    - apiVersion: external-secrets.io/v1
      kind: ExternalSecret
      name: grafana-admin
      namespace: observability
    - apiVersion: v1
      kind: Secret
      name: grafana-admin-credentials
      namespace: observability
  timeout: 5m
  suspend: false
@@ -0,0 +1,33 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability
  dependsOn:
    - name: addon-observability-secrets
    - name: addon-nfs-storage
    - name: addon-tailscale-operator
    - name: addon-tailscale-proxyclass
  wait: false
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: kube-prometheus-stack
      namespace: flux-system
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: loki
      namespace: flux-system
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: promtail
      namespace: flux-system
  timeout: 30m
  suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-config
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-config
  dependsOn:
    - name: addon-rancher
  wait: true
  timeout: 10m
  suspend: false
@@ -0,0 +1,34 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-secrets
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-secrets
  dependsOn:
    - name: addon-external-secrets-store
  wait: false
  healthChecks:
    - apiVersion: external-secrets.io/v1
      kind: ExternalSecret
      name: rancher-bootstrap-password
      namespace: flux-system
    - apiVersion: v1
      kind: Secret
      name: rancher-bootstrap-password
      namespace: flux-system
    - apiVersion: external-secrets.io/v1
      kind: ExternalSecret
      name: rancher-bootstrap-password
      namespace: cattle-system
    - apiVersion: v1
      kind: Secret
      name: rancher-bootstrap-password
      namespace: cattle-system
  timeout: 5m
  suspend: false
@@ -0,0 +1,41 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher
  timeout: 30m
  suspend: false
  dependsOn:
    - name: addon-tailscale-operator
    - name: addon-tailscale-proxyclass
    - name: addon-rancher-secrets
    - name: addon-cert-manager
  wait: false
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2
      kind: HelmRelease
      name: rancher
      namespace: flux-system
    - apiVersion: apps/v1
      kind: Deployment
      name: cattle-system-rancher
      namespace: cattle-system
    - apiVersion: apps/v1
      kind: Deployment
      name: rancher-webhook
      namespace: cattle-system
    - apiVersion: cert-manager.io/v1
      kind: Issuer
      name: cattle-system-rancher
      namespace: cattle-system
    - apiVersion: cert-manager.io/v1
      kind: Certificate
      name: tls-rancher-ingress
      namespace: cattle-system
@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-tailscale-operator
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/tailscale-operator
  wait: false
  timeout: 10m
  suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-tailscale-proxyclass
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/tailscale-proxyclass
  dependsOn:
    - name: addon-tailscale-operator
  wait: true
  timeout: 5m
  suspend: false
@@ -0,0 +1,16 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - kustomization-nfs-storage.yaml
  - kustomization-external-secrets.yaml
  - kustomization-external-secrets-store.yaml
  - kustomization-cert-manager.yaml
  - kustomization-tailscale-operator.yaml
  - kustomization-tailscale-proxyclass.yaml
  - traefik
  - kustomization-observability-secrets.yaml
  - kustomization-observability.yaml
  - kustomization-observability-content.yaml
  - kustomization-rancher-secrets.yaml
  - kustomization-rancher.yaml
  - kustomization-rancher-config.yaml
@@ -0,0 +1,20 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nfs-subdir-external-provisioner-runner
rules:
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["persistentvolumes"]
    verbs: ["get", "list", "watch", "create", "delete"]
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "update", "patch"]
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: run-nfs-subdir-external-provisioner
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nfs-subdir-external-provisioner-runner
subjects:
  - kind: ServiceAccount
    name: nfs-subdir-external-provisioner
    namespace: kube-system
@@ -0,0 +1,41 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nfs-subdir-external-provisioner
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nfs-subdir-external-provisioner
  template:
    metadata:
      labels:
        app: nfs-subdir-external-provisioner
    spec:
      serviceAccountName: nfs-subdir-external-provisioner
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
        - name: nfs-subdir-external-provisioner
          image: registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2
          imagePullPolicy: IfNotPresent
          env:
            - name: PROVISIONER_NAME
              value: flash-nfs
            - name: NFS_SERVER
              value: 10.27.27.239
            - name: NFS_PATH
              value: /TheFlash/k8s-nfs
          volumeMounts:
            - name: nfs-subdir-external-provisioner-root
              mountPath: /persistentvolumes
      volumes:
        - name: nfs-subdir-external-provisioner-root
          nfs:
            server: 10.27.27.239
            path: /TheFlash/k8s-nfs
@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - serviceaccount-nfs-subdir-external-provisioner.yaml
  - clusterrole-nfs-subdir-external-provisioner.yaml
  - clusterrolebinding-nfs-subdir-external-provisioner.yaml
  - role-nfs-subdir-external-provisioner.yaml
  - rolebinding-nfs-subdir-external-provisioner.yaml
  - storageclass-flash-nfs.yaml
  - deployment-nfs-subdir-external-provisioner.yaml
@@ -0,0 +1,9 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: leader-locking-nfs-subdir-external-provisioner
  namespace: kube-system
rules:
  - apiGroups: [""]
    resources: ["endpoints"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]
@@ -0,0 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: leader-locking-nfs-subdir-external-provisioner
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: leader-locking-nfs-subdir-external-provisioner
subjects:
  - kind: ServiceAccount
    name: nfs-subdir-external-provisioner
    namespace: kube-system
@@ -0,0 +1,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfs-subdir-external-provisioner
  namespace: kube-system
@@ -0,0 +1,12 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: flash-nfs
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
provisioner: flash-nfs
parameters:
  archiveOnDelete: "true"
reclaimPolicy: Delete
allowVolumeExpansion: true
volumeBindingMode: Immediate
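Because `flash-nfs` is annotated as the default StorageClass, claims can omit `storageClassName` entirely or name it explicitly. A minimal example claim (name, namespace, and size are illustrative):

```yaml
# Minimal example PVC against the flash-nfs StorageClass; values are illustrative.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: example-data
  namespace: default
spec:
  accessModes:
    - ReadWriteMany            # NFS-backed volumes can be shared across nodes
  storageClassName: flash-nfs
  resources:
    requests:
      storage: 1Gi
```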
@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-k8s-overview
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
          "targets": [
            {
              "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {"list": []},
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }
@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources-core
  namespace: observability
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: "http://loki.observability.svc.cluster.local:3100"
        isDefault: false
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - grafana-datasources-core-configmap.yaml
  - grafana-dashboard-k8s-overview-configmap.yaml
@@ -0,0 +1,22 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: grafana-admin
  namespace: observability
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: doppler-hetznerterra
    kind: ClusterSecretStore
  target:
    name: grafana-admin-credentials
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        admin-user: admin
        admin-password: "{{ .grafanaAdminPassword }}"
  data:
    - secretKey: grafanaAdminPassword
      remoteRef:
        key: GRAFANA_ADMIN_PASSWORD
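If the Doppler lookup succeeds, the target Secret consumed by the kube-prometheus-stack Grafana `admin.existingSecret` setting should look roughly like this (the password value is a placeholder for whatever `GRAFANA_ADMIN_PASSWORD` holds in Doppler):

```yaml
# Approximate shape of the Secret created by the ExternalSecret above; the value is a placeholder.
apiVersion: v1
kind: Secret
metadata:
  name: grafana-admin-credentials
  namespace: observability
type: Opaque
stringData:
  admin-user: admin
  admin-password: "<value of GRAFANA_ADMIN_PASSWORD in Doppler>"
```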
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - grafana-admin-externalsecret.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: observability
@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
  name: grafana-tailscale
  namespace: observability
  annotations:
    tailscale.com/hostname: grafana
    tailscale.com/tags: "tag:prod,tag:grafana"
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: grafana
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 3000
@@ -0,0 +1,76 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: flux-system
spec:
  interval: 10m
  timeout: 15m
  targetNamespace: observability
  chart:
    spec:
      chart: ./infrastructure/charts/kube-prometheus-stack
      sourceRef:
        kind: GitRepository
        name: platform
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
      strategy: uninstall
  values:
    grafana:
      enabled: true
      admin:
        existingSecret: grafana-admin-credentials
      grafana.ini:
        server:
          root_url: http://grafana.silverside-gopher.ts.net/
          serve_from_sub_path: false
      persistence:
        enabled: true
        storageClassName: local-path
        size: 5Gi
      service:
        type: ClusterIP
      sidecar:
        datasources:
          enabled: true
          label: grafana_datasource
          searchNamespace: observability
        dashboards:
          enabled: true
          label: grafana_dashboard
          searchNamespace: observability
    prometheus:
      service:
        type: ClusterIP
      prometheusSpec:
        externalUrl: http://prometheus.silverside-gopher.ts.net:9090/
        routePrefix: /
        retention: 7d
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 10Gi
    alertmanager:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeControllerManager:
      enabled: false
    kubeScheduler:
      enabled: false
    prometheus-node-exporter:
      hostNetwork: false
      service:
        hostPort: false
@@ -0,0 +1,95 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: loki
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: observability
  chartRef:
    kind: OCIRepository
    name: loki
    namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    deploymentMode: SingleBinary
    loki:
      auth_enabled: false
      commonConfig:
        replication_factor: 1
      schemaConfig:
        configs:
          - from: "2024-04-01"
            store: tsdb
            object_store: filesystem
            schema: v13
            index:
              prefix: loki_index_
              period: 24h
      storage:
        type: filesystem
      limits_config:
        allow_structured_metadata: true
        volume_enabled: true
        retention_period: 168h
      pattern_ingester:
        enabled: true
      ruler:
        enable_api: true
    singleBinary:
      replicas: 1
      persistence:
        size: 10Gi
        storageClass: flash-nfs
      resources:
        requests:
          cpu: 100m
          memory: 256Mi
        limits:
          cpu: 500m
          memory: 1Gi
    backend:
      replicas: 0
    read:
      replicas: 0
    write:
      replicas: 0
    ingester:
      replicas: 0
    querier:
      replicas: 0
    queryFrontend:
      replicas: 0
    queryScheduler:
      replicas: 0
    distributor:
      replicas: 0
    compactor:
      replicas: 0
    indexGateway:
      replicas: 0
    bloomCompactor:
      replicas: 0
    bloomGateway:
      replicas: 0
    gateway:
      enabled: false
    test:
      enabled: false
    chunksCache:
      enabled: false
    resultsCache:
      enabled: false
    lokiCanary:
      enabled: false
    monitoring:
      selfMonitoring:
        enabled: false
      lokiCanary:
        enabled: false
@@ -0,0 +1,26 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: promtail
  namespace: flux-system
spec:
  interval: 10m
  timeout: 20m
  targetNamespace: observability
  chartRef:
    kind: OCIRepository
    name: promtail
    namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    image:
      pullPolicy: IfNotPresent
    config:
      clients:
        - url: http://observability-loki.observability.svc.cluster.local:3100/loki/api/v1/push
@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ocirepository-loki.yaml
  - ocirepository-promtail.yaml
  - helmrelease-kube-prometheus-stack.yaml
  - helmrelease-loki.yaml
  - helmrelease-promtail.yaml
  - grafana-tailscale-service.yaml
  - prometheus-tailscale-service.yaml
@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
  name: loki
  namespace: flux-system
spec:
  interval: 10m
  url: oci://ghcr.io/grafana/helm-charts/loki
  ref:
    tag: 6.46.0
  layerSelector:
    mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
    operation: copy
@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
  name: promtail
  namespace: flux-system
spec:
  interval: 10m
  url: oci://ghcr.io/grafana/helm-charts/promtail
  ref:
    tag: 6.16.6
  layerSelector:
    mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
    operation: copy
@@ -0,0 +1,20 @@
apiVersion: v1
kind: Service
metadata:
  name: prometheus-tailscale
  namespace: observability
  annotations:
    tailscale.com/hostname: prometheus
    tailscale.com/tags: "tag:prod,tag:prometheus"
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: prometheus
    operator.prometheus.io/name: observability-kube-prometh-prometheus
  ports:
    - name: http
      port: 9090
      protocol: TCP
      targetPort: 9090
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - server-url-setting.yaml
@@ -0,0 +1,5 @@
apiVersion: management.cattle.io/v3
kind: Setting
metadata:
  name: server-url
value: https://rancher.silverside-gopher.ts.net
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - rancher-bootstrap-password-flux-externalsecret.yaml
  - rancher-bootstrap-password-externalsecret.yaml
Some files were not shown because too many files have changed in this diff.