Compare commits
84 Commits
v0.1.0-sta
...
8d1f9f4944
| Author | SHA1 | Date | |
|---|---|---|---|
| 8d1f9f4944 | |||
| d4fd43e2f5 | |||
| 48a80c362c | |||
| fcf7f139ff | |||
| 7139ae322d | |||
| 528a8dc210 | |||
| 349f75729a | |||
| 522626a52b | |||
| 5bd4c41c2d | |||
| 3e41f71b1b | |||
| 9d2f30de32 | |||
| 08a3031276 | |||
| e3ce91db62 | |||
| bed8e4afc8 | |||
| 2d4de6cff8 | |||
| 4a83d981c8 | |||
| d188a51ef6 | |||
| 646ef16258 | |||
| 6f2e056b98 | |||
| e10a70475f | |||
| f95e0051a5 | |||
| 7c15ac5846 | |||
| 4c104f74e8 | |||
| be04602bfb | |||
| 06c1356f1e | |||
| 86fb5d5b90 | |||
| 8b403cd1d6 | |||
| 480a079dc8 | |||
| ff8e32daf5 | |||
| eb1ad0bea7 | |||
| 9ff9d1e633 | |||
| 6177b581e4 | |||
| b1e21c4a4b | |||
| 2f166ed9e7 | |||
| 1c39274df7 | |||
| 28eaa36ec4 | |||
| 02fa71c0aa | |||
| 2bbf05cdca | |||
| 213c1fb4e4 | |||
| 414ac73c25 | |||
| 542d7a6be5 | |||
| 210b617cc9 | |||
| 3686249e31 | |||
| f56d1447c1 | |||
| 63247b79a6 | |||
| f6e159406a | |||
| 0ae1c9395c | |||
| 272c5ddc6e | |||
| eb6bf3862a | |||
| 5a3f7550fe | |||
| a0ed6523ec | |||
| 4f61a840c7 | |||
| d876430703 | |||
| 56b6216257 | |||
| 91fe2e658c | |||
| 13cec1aa28 | |||
| bc133e65d3 | |||
| df4fdb5496 | |||
| cec7c42efb | |||
| ee692620b5 | |||
| a6d327fa1f | |||
| fe6cb39eaf | |||
| feaefd28a1 | |||
| 80ab59e22d | |||
| 6c0282e9d5 | |||
| 45aa616741 | |||
| b595c1738a | |||
| 1c4dfd7fae | |||
| 6b9fc1f6b8 | |||
| 2b5cad9d15 | |||
| 71a1495fbc | |||
| fe3814e0e3 | |||
| 5ab3c7a0ac | |||
| 9bc708ea4b | |||
| c0a4275f15 | |||
| 3dcf71a84f | |||
| 124fe94d0e | |||
| 2d3f63424a | |||
| 2a583d1bba | |||
| 27711e0661 | |||
| 10ee303995 | |||
| 558f34e2b1 | |||
| 58fabf23f8 | |||
| b30977a158 |
99
.gitea/workflows/dashboards.yml
Normal file
99
.gitea/workflows/dashboards.yml
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
# Fast-path workflow: re-applies only the Grafana datasources/dashboards via
# ansible/dashboards.yml instead of running full cluster provisioning.
name: Deploy Grafana Content

on:
  push:
    branches:
      - main
    # Only run when the Grafana content playbook, its role, or this workflow changes.
    paths:
      - "ansible/dashboards.yml"
      - "ansible/roles/observability-content/**"
      - ".gitea/workflows/dashboards.yml"
  workflow_dispatch:

env:
  TF_VERSION: "1.7.0"
  # TF_VAR_* values are picked up by Terraform as input variables.
  TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}

jobs:
  dashboards:
    name: Grafana Content
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Setup SSH Keys
        # Secrets are passed through the environment rather than spliced into
        # the script text, so shell metacharacters in a value cannot alter the
        # generated script.
        env:
          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
          SSH_PUBLIC_KEY: ${{ secrets.SSH_PUBLIC_KEY }}
        run: |
          mkdir -p ~/.ssh
          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          printf '%s\n' "$SSH_PUBLIC_KEY" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Detect runner egress IP
        run: |
          set -eu
          RUNNER_IP=$(curl -fsSL https://api.ipify.org)
          # Refuse to continue on an empty or non-IPv4 response; otherwise the
          # firewall step below would be handed a malformed CIDR such as "/32".
          if ! printf '%s' "$RUNNER_IP" | grep -Eq '^[0-9]{1,3}(\.[0-9]{1,3}){3}$'; then
            echo "Unexpected response from api.ipify.org: '${RUNNER_IP}'" >&2
            exit 1
          fi
          # Exported as an HCL list literal so it can be passed straight to -var.
          echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
          echo "Runner egress IP: ${RUNNER_IP}"

      - name: Open SSH/API for current runner CIDR
        working-directory: terraform
        # -target and -refresh=false restrict this apply to the firewall resource.
        run: |
          terraform apply \
            -refresh=false \
            -target=hcloud_firewall.cluster \
            -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -var="allowed_ssh_ips=${RUNNER_CIDR}" \
            -var="allowed_api_ips=${RUNNER_CIDR}" \
            -auto-approve

      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Generate Ansible Inventory
        working-directory: ansible
        # NOTE(review): presumably writes the inventory.ini used by the verify
        # step below -- confirm against generate_inventory.py.
        run: python3 generate_inventory.py

      - name: Apply dashboards and datasources
        working-directory: ansible
        run: |
          ansible-playbook dashboards.yml \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Verify Grafana content resources
        working-directory: ansible
        # Lists the ConfigMaps labelled as Grafana datasources and dashboards.
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_datasource=1"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_dashboard=1"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
|
||||||
@@ -17,6 +17,8 @@ env:
|
|||||||
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
||||||
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
|
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
|
||||||
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
|
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
|
||||||
|
TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
|
||||||
|
TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
terraform:
|
terraform:
|
||||||
@@ -86,12 +88,8 @@ jobs:
|
|||||||
}
|
}
|
||||||
|
|
||||||
ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
|
ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
|
||||||
ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
|
|
||||||
ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
|
|
||||||
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
|
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
|
||||||
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
|
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
|
||||||
ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
|
|
||||||
ensure_import 'hcloud_server.workers[3]' 'k8s-cluster-worker-4'
|
|
||||||
|
|
||||||
- name: Terraform Plan
|
- name: Terraform Plan
|
||||||
id: plan
|
id: plan
|
||||||
@@ -226,16 +224,62 @@ jobs:
|
|||||||
-e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
-e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
||||||
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
|
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
|
||||||
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
|
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
|
||||||
|
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
|
||||||
|
-e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
|
||||||
|
-e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
|
||||||
|
-e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
|
||||||
-e "cluster_name=k8s-cluster"
|
-e "cluster_name=k8s-cluster"
|
||||||
env:
|
env:
|
||||||
ANSIBLE_HOST_KEY_CHECKING: "False"
|
ANSIBLE_HOST_KEY_CHECKING: "False"
|
||||||
|
|
||||||
|
- name: Install kubectl
|
||||||
|
run: |
|
||||||
|
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
|
||||||
|
chmod +x /usr/local/bin/kubectl
|
||||||
|
|
||||||
|
- name: Rewrite kubeconfig for runner-reachable API
|
||||||
|
working-directory: terraform
|
||||||
|
run: |
|
||||||
|
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
|
||||||
|
sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig
|
||||||
|
|
||||||
|
- name: Bootstrap Flux source and reconciliation graph
|
||||||
|
env:
|
||||||
|
KUBECONFIG: outputs/kubeconfig
|
||||||
|
FLUX_GIT_HOST: 64.176.189.59
|
||||||
|
FLUX_GIT_PORT: "2222"
|
||||||
|
run: |
|
||||||
|
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
||||||
|
ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
|
||||||
|
kubectl -n flux-system create secret generic flux-system \
|
||||||
|
--from-file=identity="$HOME/.ssh/id_ed25519" \
|
||||||
|
--from-file=known_hosts=/tmp/flux_known_hosts \
|
||||||
|
--dry-run=client -o yaml | kubectl apply -f -
|
||||||
|
kubectl apply -k clusters/prod/flux-system
|
||||||
|
kubectl -n flux-system rollout status deployment/source-controller --timeout=180s
|
||||||
|
kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=180s
|
||||||
|
kubectl -n flux-system rollout status deployment/helm-controller --timeout=180s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=180s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
||||||
|
|
||||||
- name: Post-deploy cluster health checks
|
- name: Post-deploy cluster health checks
|
||||||
working-directory: ansible
|
working-directory: ansible
|
||||||
run: |
|
run: |
|
||||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
|
||||||
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
|
||||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
|
||||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
|
||||||
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pods -o wide"
|
||||||
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pvc"
|
||||||
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
|
||||||
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get svc kube-prometheus-stack-grafana kube-prometheus-stack-prometheus"
|
||||||
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability describe svc kube-prometheus-stack-grafana"
|
||||||
env:
|
env:
|
||||||
ANSIBLE_HOST_KEY_CHECKING: "False"
|
ANSIBLE_HOST_KEY_CHECKING: "False"
|
||||||
|
|
||||||
|
|||||||
@@ -51,11 +51,57 @@ jobs:
|
|||||||
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
||||||
chmod 644 ~/.ssh/id_ed25519.pub
|
chmod 644 ~/.ssh/id_ed25519.pub
|
||||||
|
|
||||||
|
- name: Install jq
|
||||||
|
run: |
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y jq
|
||||||
|
|
||||||
- name: Terraform Destroy
|
- name: Terraform Destroy
|
||||||
|
id: destroy
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
terraform destroy \
|
set +e
|
||||||
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
for attempt in 1 2 3; do
|
||||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
echo "Terraform destroy attempt ${attempt}/3"
|
||||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
terraform destroy \
|
||||||
-auto-approve
|
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
||||||
|
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||||
|
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||||
|
-auto-approve
|
||||||
|
rc=$?
|
||||||
|
if [ "$rc" -eq 0 ]; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [ "$attempt" -lt 3 ]; then
|
||||||
|
echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
|
||||||
|
sleep 30
|
||||||
|
terraform refresh \
|
||||||
|
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
||||||
|
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||||
|
-var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
exit "$rc"
|
||||||
|
|
||||||
|
- name: Hetzner destroy diagnostics
|
||||||
|
if: failure() && steps.destroy.outcome == 'failure'
|
||||||
|
env:
|
||||||
|
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
|
||||||
|
run: |
|
||||||
|
set +e
|
||||||
|
echo "== Terraform state list =="
|
||||||
|
terraform -chdir=terraform state list || true
|
||||||
|
|
||||||
|
network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}')
|
||||||
|
if [ -z "$network_id" ]; then
|
||||||
|
network_id="11988935"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "== Hetzner network =="
|
||||||
|
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
|
||||||
|
|
||||||
|
echo "== Hetzner servers attached to network =="
|
||||||
|
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
|
||||||
|
|
||||||
|
echo "== Hetzner load balancers attached to network =="
|
||||||
|
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
|
||||||
|
|||||||
160
README.md
160
README.md
@@ -10,7 +10,7 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
|
|||||||
| **Workers** | 4x CX33 |
|
| **Workers** | 4x CX33 |
|
||||||
| **Total Cost** | €28.93/mo |
|
| **Total Cost** | €28.93/mo |
|
||||||
| **K8s** | k3s (latest, HA) |
|
| **K8s** | k3s (latest, HA) |
|
||||||
| **Addons** | Hetzner CCM + CSI |
|
| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
|
||||||
| **Access** | SSH/API restricted to Tailnet |
|
| **Access** | SSH/API restricted to Tailnet |
|
||||||
| **Bootstrap** | Terraform + Ansible |
|
| **Bootstrap** | Terraform + Ansible |
|
||||||
|
|
||||||
@@ -152,6 +152,7 @@ This repository includes Gitea workflows for:
|
|||||||
- **terraform-plan**: Runs on PRs, shows planned changes
|
- **terraform-plan**: Runs on PRs, shows planned changes
|
||||||
- **terraform-apply**: Runs on main branch after merge
|
- **terraform-apply**: Runs on main branch after merge
|
||||||
- **ansible-deploy**: Runs after terraform apply
|
- **ansible-deploy**: Runs after terraform apply
|
||||||
|
- **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
|
||||||
|
|
||||||
### Required Gitea Secrets
|
### Required Gitea Secrets
|
||||||
|
|
||||||
@@ -166,10 +167,159 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
|
|||||||
| `S3_BUCKET` | S3 bucket name (e.g., `k8s-terraform-state`) |
|
| `S3_BUCKET` | S3 bucket name (e.g., `k8s-terraform-state`) |
|
||||||
| `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
|
| `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
|
||||||
| `TAILSCALE_TAILNET` | Tailnet domain (e.g., `yourtailnet.ts.net`) |
|
| `TAILSCALE_TAILNET` | Tailnet domain (e.g., `yourtailnet.ts.net`) |
|
||||||
|
| `TAILSCALE_OAUTH_CLIENT_ID` | Tailscale OAuth client ID for Kubernetes Operator |
|
||||||
|
| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator |
|
||||||
|
| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets |
|
||||||
|
| `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) |
|
||||||
| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
|
| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
|
||||||
| `SSH_PUBLIC_KEY` | SSH public key content |
|
| `SSH_PUBLIC_KEY` | SSH public key content |
|
||||||
| `SSH_PRIVATE_KEY` | SSH private key content |
|
| `SSH_PRIVATE_KEY` | SSH private key content |
|
||||||
|
|
||||||
|
## GitOps (Flux)
|
||||||
|
|
||||||
|
This repo uses Flux for continuous reconciliation after Terraform + Ansible bootstrap.
|
||||||
|
|
||||||
|
### Stable private-only baseline
|
||||||
|
|
||||||
|
The current default target is a deliberately simplified baseline:
|
||||||
|
|
||||||
|
- `1` control plane node
|
||||||
|
- `2` worker nodes
|
||||||
|
- private Hetzner network only
|
||||||
|
- Tailscale for operator access
|
||||||
|
- Flux-managed core addons only
|
||||||
|
|
||||||
|
Detailed phase gates and success criteria live in `STABLE_BASELINE.md`.
|
||||||
|
|
||||||
|
This is the default until rebuilds are consistently green. High availability, public ingress, and app-layer expansion come later.
|
||||||
|
|
||||||
|
### Runtime secrets
|
||||||
|
|
||||||
|
Runtime cluster secrets are moving to Doppler + External Secrets Operator.
|
||||||
|
|
||||||
|
- Doppler project: `hetznerterra`
|
||||||
|
- Initial auth: service token via `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
|
||||||
|
- First synced secrets:
|
||||||
|
- `GRAFANA_ADMIN_PASSWORD`
|
||||||
|
- `WEAVE_GITOPS_ADMIN_USERNAME`
|
||||||
|
- `WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH`
|
||||||
|
|
||||||
|
Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed by Doppler.
|
||||||
|
|
||||||
|
### Repository layout
|
||||||
|
|
||||||
|
- `clusters/prod/`: cluster entrypoint and Flux reconciliation objects
|
||||||
|
- `clusters/prod/flux-system/`: `GitRepository` source and top-level `Kustomization` graph
|
||||||
|
- `infrastructure/`: infrastructure addon reconciliation graph
|
||||||
|
- `infrastructure/addons/*`: per-addon manifests for Flux-managed cluster addons
|
||||||
|
- `apps/`: application workload layer (currently scaffolded)
|
||||||
|
|
||||||
|
### Reconciliation graph
|
||||||
|
|
||||||
|
- `infrastructure` (top-level)
|
||||||
|
- `addon-ccm`
|
||||||
|
- `addon-csi` depends on `addon-ccm`
|
||||||
|
- `addon-tailscale-operator`
|
||||||
|
- `addon-observability`
|
||||||
|
- `addon-observability-content` depends on `addon-observability`
|
||||||
|
- `apps` depends on `infrastructure`
|
||||||
|
|
||||||
|
### Bootstrap notes
|
||||||
|
|
||||||
|
1. Install Flux controllers in `flux-system`.
|
||||||
|
2. Create the Flux deploy key/secret named `flux-system` in `flux-system` namespace.
|
||||||
|
3. Apply `clusters/prod/flux-system/` once to establish source + reconciliation graph.
|
||||||
|
4. Bootstrap-only Ansible creates prerequisite secrets; Flux manages addon lifecycle after bootstrap.
|
||||||
|
|
||||||
|
### Current addon status
|
||||||
|
|
||||||
|
- Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
|
||||||
|
- Active Flux addons include `addon-ccm`, `addon-csi`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`, `addon-observability`, and `addon-observability-content`.
|
||||||
|
- Ansible is limited to cluster bootstrap, private-access setup, and prerequisite secret creation for Flux-managed addons.
|
||||||
|
- `addon-flux-ui` is optional for the stable-baseline phase and is not a blocker for rebuild success.
|
||||||
|
|
||||||
|
### Stable baseline acceptance
|
||||||
|
|
||||||
|
A rebuild is considered successful only when all of the following pass without manual intervention:
|
||||||
|
|
||||||
|
- Terraform create succeeds for the default `1` control plane and `2` workers.
|
||||||
|
- Ansible bootstrap succeeds end-to-end.
|
||||||
|
- All nodes become `Ready`.
|
||||||
|
- `hcloud-cloud-controller-manager` and `hcloud-csi` are `Ready`.
|
||||||
|
- Required External Secrets sync successfully.
|
||||||
|
- Tailscale private access works.
|
||||||
|
- Grafana and Prometheus are reachable privately.
|
||||||
|
- Terraform destroy succeeds cleanly or succeeds after workflow retries.
|
||||||
|
|
||||||
|
## Observability Stack
|
||||||
|
|
||||||
|
Flux deploys a lightweight observability stack in the `observability` namespace:
|
||||||
|
|
||||||
|
- `kube-prometheus-stack` (Prometheus + Grafana)
|
||||||
|
- `loki`
|
||||||
|
- `promtail`
|
||||||
|
|
||||||
|
Grafana content is managed as code via ConfigMaps in `infrastructure/addons/observability-content/`.
|
||||||
|
|
||||||
|
Grafana and Prometheus are exposed through a single Tailscale front door backed by Traefik when the Tailscale Kubernetes Operator is healthy.
|
||||||
|
|
||||||
|
### Access Grafana and Prometheus
|
||||||
|
|
||||||
|
Preferred private access:
|
||||||
|
|
||||||
|
- Grafana: `http://k8s-cluster-cp-1.<your-tailnet>:30080/`
|
||||||
|
- Prometheus: `http://k8s-cluster-cp-1.<your-tailnet>:30990/`
|
||||||
|
- Flux UI: `http://k8s-cluster-cp-1.<your-tailnet>:30901/`
|
||||||
|
|
||||||
|
This access path is bootstrapped automatically by Ansible on `control_plane[0]` using persistent `kubectl port-forward` systemd services plus `tailscale serve`, so it survives cluster rebuilds.
|
||||||
|
|
||||||
|
Fallback (manual `kubectl port-forward`):
|
||||||
|
|
||||||
|
Run from a tailnet-connected machine:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export KUBECONFIG=$(pwd)/outputs/kubeconfig
|
||||||
|
|
||||||
|
kubectl -n observability port-forward svc/kube-prometheus-stack-grafana 3000:80
|
||||||
|
kubectl -n observability port-forward svc/kube-prometheus-stack-prometheus 9090:9090
|
||||||
|
```
|
||||||
|
|
||||||
|
Then open:
|
||||||
|
|
||||||
|
- Grafana: http://127.0.0.1:3000
|
||||||
|
- Prometheus: http://127.0.0.1:9090
|
||||||
|
|
||||||
|
Grafana user: `admin`
|
||||||
|
Grafana password: value of `GRAFANA_ADMIN_PASSWORD` secret (or the generated value shown by Ansible output)
|
||||||
|
|
||||||
|
### Verify Tailscale exposure
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export KUBECONFIG=$(pwd)/outputs/kubeconfig
|
||||||
|
|
||||||
|
kubectl -n tailscale-system get pods
|
||||||
|
kubectl -n observability get svc kube-prometheus-stack-grafana kube-prometheus-stack-prometheus
|
||||||
|
kubectl -n observability describe svc kube-prometheus-stack-grafana | grep TailscaleProxyReady
|
||||||
|
kubectl -n observability describe svc kube-prometheus-stack-prometheus | grep TailscaleProxyReady
|
||||||
|
```
|
||||||
|
|
||||||
|
If `TailscaleProxyReady=False`, check:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl -n tailscale-system logs deployment/operator --tail=100
|
||||||
|
```
|
||||||
|
|
||||||
|
Common cause: OAuth client missing tag/scopes permissions.
|
||||||
|
|
||||||
|
### Fast dashboard iteration workflow
|
||||||
|
|
||||||
|
Use the `Deploy Grafana Content` workflow when changing dashboard/data source templates.
|
||||||
|
It avoids full cluster provisioning and only applies Grafana content resources:
|
||||||
|
|
||||||
|
- `ansible/roles/observability-content/templates/grafana-datasources.yaml.j2`
|
||||||
|
- `ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2`
|
||||||
|
- `ansible/dashboards.yml`
|
||||||
|
|
||||||
## File Structure
|
## File Structure
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -191,13 +341,15 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
|
|||||||
│ │ ├── common/
|
│ │ ├── common/
|
||||||
│ │ ├── k3s-server/
|
│ │ ├── k3s-server/
|
||||||
│ │ ├── k3s-agent/
|
│ │ ├── k3s-agent/
|
||||||
│ │ ├── ccm/
|
│ │ ├── addon-secrets-bootstrap/
|
||||||
│ │ └── csi/
|
│ │ ├── observability-content/
|
||||||
|
│ │ └── observability/
|
||||||
│ └── ansible.cfg
|
│ └── ansible.cfg
|
||||||
├── .gitea/
|
├── .gitea/
|
||||||
│ └── workflows/
|
│ └── workflows/
|
||||||
│ ├── terraform.yml
|
│ ├── terraform.yml
|
||||||
│ └── ansible.yml
|
│ ├── ansible.yml
|
||||||
|
│ └── dashboards.yml
|
||||||
├── outputs/
|
├── outputs/
|
||||||
├── terraform.tfvars.example
|
├── terraform.tfvars.example
|
||||||
└── README.md
|
└── README.md
|
||||||
|
|||||||
93
SECRETS_SETUP.md
Normal file
93
SECRETS_SETUP.md
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
# Gitea Secrets Setup
|
||||||
|
|
||||||
|
This document describes the secrets required for the HetznerTerra deployment workflow.
|
||||||
|
|
||||||
|
## Required Secrets
|
||||||
|
|
||||||
|
Add these secrets in your Gitea repository settings:
|
||||||
|
**Settings → Secrets → Actions**
|
||||||
|
|
||||||
|
### Infrastructure Secrets
|
||||||
|
|
||||||
|
#### `HCLOUD_TOKEN`
|
||||||
|
- Hetzner Cloud API token
|
||||||
|
- Get from: https://console.hetzner.com/projects/{project-id}/security/api-tokens
|
||||||
|
- Permissions: Read & Write
|
||||||
|
|
||||||
|
#### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
|
||||||
|
- Backblaze B2 credentials for Terraform state storage
|
||||||
|
- Get from: https://secure.backblaze.com/b2_buckets.htm
|
||||||
|
- Create application key with access to your terraform state bucket
|
||||||
|
|
||||||
|
#### `S3_ENDPOINT`
|
||||||
|
- Backblaze B2 S3 endpoint
|
||||||
|
- Example: `https://s3.eu-central-003.backblazeb2.com`
|
||||||
|
|
||||||
|
#### `S3_BUCKET`
|
||||||
|
- Backblaze B2 bucket name for Terraform state
|
||||||
|
- Example: `k8s-terraform-state`
|
||||||
|
|
||||||
|
### SSH Secrets
|
||||||
|
|
||||||
|
#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
|
||||||
|
- SSH key pair for cluster access
|
||||||
|
- Generate with: `ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s`
|
||||||
|
- Private key content (include BEGIN/END lines)
|
||||||
|
- Public key content (full line starting with ssh-ed25519)
|
||||||
|
|
||||||
|
### Tailscale Secrets
|
||||||
|
|
||||||
|
#### `TAILSCALE_AUTH_KEY`
|
||||||
|
- Tailscale auth key for node registration
|
||||||
|
- Get from: https://login.tailscale.com/admin/settings/keys
|
||||||
|
- Type: Reusable, Ephemeral
|
||||||
|
- Scope: `devices:core:write`
|
||||||
|
|
||||||
|
#### `TAILSCALE_TAILNET`
|
||||||
|
- Your Tailscale network name
|
||||||
|
- Example: `tail7ec33.ts.net` or your custom domain
|
||||||
|
|
||||||
|
#### `TAILSCALE_OAUTH_CLIENT_ID` & `TAILSCALE_OAUTH_CLIENT_SECRET`
|
||||||
|
- OAuth credentials for Tailscale Kubernetes Operator
|
||||||
|
- Get from: https://login.tailscale.com/admin/settings/oauth
|
||||||
|
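- Create OAuth client with write scopes for Devices Core (`devices:core`) and Auth Keys (`auth_keys`), and assign it the tag the operator uses (e.g. `tag:k8s-operator`)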
|
||||||
|
|
||||||
|
### Application Secrets
|
||||||
|
|
||||||
|
#### `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
|
||||||
|
- Doppler service token for the `hetznerterra` project runtime secrets
|
||||||
|
- Used by External Secrets Operator bootstrap
|
||||||
|
- Recommended scope: `hetznerterra` project, `prod` config only
|
||||||
|
|
||||||
|
#### `GRAFANA_ADMIN_PASSWORD`
|
||||||
|
- Transitional fallback only while migrating observability secrets to Doppler
|
||||||
|
- In steady state, store this in Doppler as `GRAFANA_ADMIN_PASSWORD`
|
||||||
|
|
||||||
|
## Setting Up Secrets
|
||||||
|
|
||||||
|
1. Go to your Gitea repository
|
||||||
|
2. Navigate to **Settings → Secrets → Actions**
|
||||||
|
3. Click **Add Secret**
|
||||||
|
4. Enter the secret name (exact match from above)
|
||||||
|
5. Paste the secret value
|
||||||
|
6. Click **Add Secret**
|
||||||
|
7. Repeat for all secrets
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
After adding all secrets, trigger a workflow run:
|
||||||
|
```bash
|
||||||
|
git commit --allow-empty -m "ci: trigger workflow with new secrets"
|
||||||
|
git push
|
||||||
|
```
|
||||||
|
|
||||||
|
Check the workflow logs to verify all secrets are being used correctly.
|
||||||
|
|
||||||
|
## Security Notes
|
||||||
|
|
||||||
|
- Never commit secrets to the repository
|
||||||
|
- Use strong, unique passwords for Grafana and other services
|
||||||
|
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap
|
||||||
|
- Rotate Tailscale auth keys periodically
|
||||||
|
- Review OAuth client permissions regularly
|
||||||
|
- The workflow automatically opens SSH/API access only for the runner's IP during deployment
|
||||||
47
STABLE_BASELINE.md
Normal file
47
STABLE_BASELINE.md
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# Stable Private-Only Baseline
|
||||||
|
|
||||||
|
This document defines the current engineering target for this repository.
|
||||||
|
|
||||||
|
## Topology
|
||||||
|
|
||||||
|
- 1 control plane
|
||||||
|
- 2 workers
|
||||||
|
- private Hetzner network
|
||||||
|
- Tailscale operator access
|
||||||
|
|
||||||
|
## In Scope
|
||||||
|
|
||||||
|
- Terraform infrastructure bootstrap
|
||||||
|
- Ansible k3s bootstrap
|
||||||
|
- Flux core reconciliation
|
||||||
|
- Hetzner CCM
|
||||||
|
- Hetzner CSI
|
||||||
|
- External Secrets Operator with Doppler
|
||||||
|
- Tailscale private access
|
||||||
|
- Observability stack
|
||||||
|
|
||||||
|
## Out of Scope
|
||||||
|
|
||||||
|
- HA control plane
|
||||||
|
- public ingress or DNS
|
||||||
|
- public TLS
|
||||||
|
- app workloads
|
||||||
|
- DR / backup strategy
|
||||||
|
- upgrade strategy
|
||||||
|
|
||||||
|
## Phase Gates
|
||||||
|
|
||||||
|
1. Terraform apply completes for the default topology.
|
||||||
|
2. k3s server bootstrap completes and kubeconfig works.
|
||||||
|
3. Workers join and all nodes are Ready.
|
||||||
|
4. Flux source and infrastructure reconciliation are healthy.
|
||||||
|
5. CCM is Ready.
|
||||||
|
6. CSI is Ready and a PVC can bind.
|
||||||
|
7. External Secrets sync required secrets.
|
||||||
|
8. Tailscale private access works.
|
||||||
|
9. Observability is healthy and reachable privately.
|
||||||
|
10. Terraform destroy succeeds cleanly or via workflow retry.
|
||||||
|
|
||||||
|
## Success Criteria
|
||||||
|
|
||||||
|
The baseline is considered stable only after two consecutive fresh rebuilds pass all phase gates with no manual fixes.
|
||||||
7
ansible/dashboards.yml
Normal file
7
ansible/dashboards.yml
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
---
# Applies the observability-content role on the first control-plane host only.
- name: Provision Grafana dashboards and datasources
  hosts: "control_plane[0]"
  become: true

  roles:
    - role: observability-content
|
||||||
41
ansible/roles/addon-secrets-bootstrap/tasks/main.yml
Normal file
41
ansible/roles/addon-secrets-bootstrap/tasks/main.yml
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
---
|
||||||
|
# Create or update the kube-system/hcloud secret (keys: token, network).
# The manifest is rendered client-side (--dry-run=client -o yaml) and piped
# into `kubectl apply`, so a re-run updates the secret in place.
# Values pass through the `quote` filter so a token or cluster name that
# contains a single quote or another shell metacharacter cannot break the
# command line.
# Skipped when hcloud_token is undefined or empty.
- name: Apply Hetzner cloud secret
  shell: >-
    kubectl -n kube-system create secret generic hcloud
    --from-literal=token={{ hcloud_token | quote }}
    --from-literal=network={{ (cluster_name ~ '-network') | quote }}
    --dry-run=client -o yaml | kubectl apply -f -
  changed_when: true
  no_log: true  # keep the token out of task output
  when: hcloud_token | default('') | length > 0
|
||||||
|
|
||||||
|
# Render the Namespace manifest for the Tailscale operator client-side only
# (--dry-run=client) and capture it in tailscale_namespace_manifest.
# Nothing is sent to the cluster by this task, hence changed_when: false.
# Runs only when both Tailscale OAuth credentials are non-empty.
- name: Ensure Tailscale operator namespace exists
  when:
    - tailscale_oauth_client_id | default('') | length > 0
    - tailscale_oauth_client_secret | default('') | length > 0
  command:
    cmd: >-
      kubectl create namespace
      {{ tailscale_operator_namespace | default('tailscale-system') }}
      --dry-run=client
      -o yaml
  changed_when: false
  register: tailscale_namespace_manifest
|
||||||
|
|
||||||
|
# Feed the manifest held in tailscale_namespace_manifest.stdout to
# `kubectl apply` on stdin. Always reported as changed.
# Runs only when both Tailscale OAuth credentials are non-empty.
- name: Apply Tailscale operator namespace
  when:
    - tailscale_oauth_client_id | default('') | length > 0
    - tailscale_oauth_client_secret | default('') | length > 0
  command:
    cmd: kubectl apply -f -
    stdin: "{{ tailscale_namespace_manifest.stdout }}"
  changed_when: true
|
||||||
|
|
||||||
|
# Create or update the operator-oauth secret (keys: client_id, client_secret)
# in the Tailscale operator namespace (default: tailscale-system).
# The manifest is rendered client-side and piped into `kubectl apply`.
# Credentials pass through the `quote` filter so values containing quotes or
# other shell metacharacters cannot break the command line.
# Runs only when both Tailscale OAuth credentials are non-empty.
- name: Apply Tailscale operator OAuth secret
  shell: >-
    kubectl -n {{ tailscale_operator_namespace | default('tailscale-system') }} create secret generic operator-oauth
    --from-literal=client_id={{ tailscale_oauth_client_id | quote }}
    --from-literal=client_secret={{ tailscale_oauth_client_secret | quote }}
    --dry-run=client -o yaml | kubectl apply -f -
  changed_when: true
  no_log: true  # keep the OAuth credentials out of task output
  when:
    - tailscale_oauth_client_id | default('') | length > 0
    - tailscale_oauth_client_secret | default('') | length > 0
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
---
|
|
||||||
hcloud_token: ""
|
|
||||||
cluster_name: "k8s-cluster"
|
|
||||||
hcloud_lb_location: "nbg1"
|
|
||||||
@@ -1,88 +0,0 @@
|
|||||||
---
|
|
||||||
- name: Check if Hetzner CCM is already deployed
|
|
||||||
command: kubectl -n kube-system get deployment hcloud-cloud-controller-manager
|
|
||||||
register: ccm_namespace
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
# Create or update the kube-system/hcloud secret (keys: token, network) by
# rendering the manifest client-side and piping it into `kubectl apply`.
# The guard checks for a non-empty token: `is defined` alone also passes for
# an empty string, which would write a secret with an empty token.
# Values pass through the `quote` filter so shell metacharacters in the token
# or cluster name cannot break the command line.
- name: Create Hetzner cloud secret
  shell: |
    kubectl -n kube-system create secret generic hcloud \
      --from-literal=token={{ hcloud_token | quote }} \
      --from-literal=network={{ (cluster_name ~ '-network') | quote }} \
      --dry-run=client -o yaml | kubectl apply -f -
  no_log: true  # keep the token out of task output
  when: hcloud_token | default('') | length > 0
  changed_when: true
|
|
||||||
|
|
||||||
- name: Deploy Hetzner CCM
|
|
||||||
command: kubectl apply -f https://raw.githubusercontent.com/hetznercloud/hcloud-cloud-controller-manager/main/deploy/ccm-networks.yaml
|
|
||||||
changed_when: true
|
|
||||||
|
|
||||||
- name: Detect CCM workload kind
|
|
||||||
shell: |
|
|
||||||
if kubectl -n kube-system get deployment hcloud-cloud-controller-manager >/dev/null 2>&1; then
|
|
||||||
echo deployment
|
|
||||||
elif kubectl -n kube-system get daemonset hcloud-cloud-controller-manager >/dev/null 2>&1; then
|
|
||||||
echo daemonset
|
|
||||||
else
|
|
||||||
echo missing
|
|
||||||
fi
|
|
||||||
register: ccm_workload_kind
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Wait for CCM deployment rollout
|
|
||||||
command: kubectl rollout status deployment/hcloud-cloud-controller-manager -n kube-system
|
|
||||||
register: ccm_rollout_deploy
|
|
||||||
until: ccm_rollout_deploy.rc == 0
|
|
||||||
changed_when: false
|
|
||||||
retries: 30
|
|
||||||
delay: 10
|
|
||||||
when: ccm_workload_kind.stdout == "deployment"
|
|
||||||
|
|
||||||
- name: Wait for CCM daemonset rollout
|
|
||||||
command: kubectl rollout status daemonset/hcloud-cloud-controller-manager -n kube-system
|
|
||||||
register: ccm_rollout_ds
|
|
||||||
until: ccm_rollout_ds.rc == 0
|
|
||||||
changed_when: false
|
|
||||||
retries: 30
|
|
||||||
delay: 10
|
|
||||||
when: ccm_workload_kind.stdout == "daemonset"
|
|
||||||
|
|
||||||
- name: Set default Hetzner load balancer location for Traefik service
|
|
||||||
command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite
|
|
||||||
register: traefik_annotation
|
|
||||||
changed_when: true
|
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Show Traefik service when annotation patch fails
|
|
||||||
command: kubectl -n kube-system get service traefik -o yaml
|
|
||||||
register: traefik_service_dump
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: traefik_annotation.rc != 0
|
|
||||||
|
|
||||||
# Abort with diagnostics when the annotate command returned a non-zero rc.
# default(..., true) makes the fallback to stdout apply when stderr is an
# empty string too; without the second argument the fallback only applies to
# an undefined value.
- name: Fail when Traefik load balancer annotation cannot be set
  fail:
    msg: |
      Failed to set Hetzner load balancer location annotation on kube-system/traefik service.
      Command output:
      {{ traefik_annotation.stderr | default(traefik_annotation.stdout, true) }}

      Service dump:
      {{ traefik_service_dump.stdout | default('n/a') }}
  when: traefik_annotation.rc != 0
|
|
||||||
|
|
||||||
# List kube-system objects whose line mentions hcloud-cloud-controller-manager
# and store the output in ccm_ns_objects.
# This must use the shell module: the command module does not process `|` or
# `||`, so the pipeline would be passed to kubectl as literal arguments.
# `|| true` keeps the task green when grep finds no match.
- name: Show CCM namespace objects when workload missing
  shell: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true
  register: ccm_ns_objects
  changed_when: false
  when: ccm_workload_kind.stdout == "missing"
|
|
||||||
|
|
||||||
- name: Fail when CCM workload is missing
|
|
||||||
fail:
|
|
||||||
msg: |
|
|
||||||
hcloud-cloud-controller-manager workload not found after applying manifest.
|
|
||||||
Namespace objects:
|
|
||||||
{{ ccm_ns_objects.stdout | default('n/a') }}
|
|
||||||
when: ccm_workload_kind.stdout == "missing"
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
---
|
|
||||||
hcloud_token: ""
|
|
||||||
cluster_name: "k8s-cluster"
|
|
||||||
csi_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/main/deploy/kubernetes/hcloud-csi.yml"
|
|
||||||
csi_rollout_timeout_seconds: 30
|
|
||||||
csi_rollout_retries: 8
|
|
||||||
csi_rollout_delay_seconds: 5
|
|
||||||
csi_failure_log_tail_lines: 120
|
|
||||||
csi_smoke_test_enabled: true
|
|
||||||
csi_smoke_test_storage_class: "csi-smoke-hcloud-immediate"
|
|
||||||
csi_smoke_test_base_storage_class: "hcloud-volumes"
|
|
||||||
csi_smoke_test_size: "1Gi"
|
|
||||||
csi_smoke_test_pvc_timeout_seconds: 300
|
|
||||||
csi_smoke_test_job_timeout_seconds: 300
|
|
||||||
csi_smoke_test_required: false
|
|
||||||
@@ -1,425 +0,0 @@
|
|||||||
---
|
|
||||||
# Create or update the kube-system/hcloud secret (keys: token, network) by
# rendering the manifest client-side and piping it into `kubectl apply`.
# The guard checks for a non-empty token: `is defined` alone also passes for
# an empty string, which would write a secret with an empty token.
# Values pass through the `quote` filter so shell metacharacters in the token
# or cluster name cannot break the command line.
- name: Create Hetzner CSI secret
  shell: |
    kubectl -n kube-system create secret generic hcloud \
      --from-literal=token={{ hcloud_token | quote }} \
      --from-literal=network={{ (cluster_name ~ '-network') | quote }} \
      --dry-run=client -o yaml | kubectl apply -f -
  no_log: true  # keep the token out of task output
  when: hcloud_token | default('') | length > 0
  changed_when: true
|
|
||||||
|
|
||||||
- name: Deploy Hetzner CSI
|
|
||||||
command: kubectl apply -f {{ csi_manifest_url }}
|
|
||||||
changed_when: true
|
|
||||||
|
|
||||||
- name: Ensure CSI controller endpoint is set for sidecars
|
|
||||||
command: kubectl -n kube-system set env deployment/hcloud-csi-controller CSI_ENDPOINT=unix:///run/csi/socket
|
|
||||||
changed_when: true
|
|
||||||
|
|
||||||
- name: Ensure CSI node endpoint is set for sidecars
|
|
||||||
command: kubectl -n kube-system set env daemonset/hcloud-csi-node CSI_ENDPOINT=unix:///run/csi/socket
|
|
||||||
changed_when: true
|
|
||||||
|
|
||||||
- name: Restart CSI controller to pick up current secret
|
|
||||||
command: kubectl -n kube-system rollout restart deployment/hcloud-csi-controller
|
|
||||||
changed_when: true
|
|
||||||
|
|
||||||
- name: Wait for CSI controller deployment generation
|
|
||||||
command: kubectl -n kube-system rollout status deployment/hcloud-csi-controller --timeout=30s
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Wait for CSI controller rollout
|
|
||||||
command: kubectl rollout status deployment/hcloud-csi-controller -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
|
|
||||||
register: csi_controller_rollout
|
|
||||||
until: csi_controller_rollout.rc == 0
|
|
||||||
retries: "{{ csi_rollout_retries | int }}"
|
|
||||||
delay: "{{ csi_rollout_delay_seconds | int }}"
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Show CSI controller status on failure
|
|
||||||
command: kubectl -n kube-system get deployment hcloud-csi-controller -o wide
|
|
||||||
register: csi_controller_deploy_status
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Show CSI controller pods on failure
|
|
||||||
command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide
|
|
||||||
register: csi_controller_pods_status
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Describe CSI controller deployment on failure
|
|
||||||
command: kubectl -n kube-system describe deployment hcloud-csi-controller
|
|
||||||
register: csi_controller_deploy_describe
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Describe CSI controller pod on failure
|
|
||||||
shell: |
|
|
||||||
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
|
|
||||||
if [ -n "$pod" ]; then
|
|
||||||
kubectl -n kube-system describe pod "$pod"
|
|
||||||
fi
|
|
||||||
register: csi_controller_pod_describe
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Show CSI driver logs on failure
|
|
||||||
command: kubectl -n kube-system logs deployment/hcloud-csi-controller -c hcloud-csi-driver --tail={{ csi_failure_log_tail_lines }}
|
|
||||||
register: csi_driver_logs
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Show CSI driver previous logs on failure
|
|
||||||
shell: |
|
|
||||||
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
|
|
||||||
if [ -n "$pod" ]; then
|
|
||||||
kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
|
|
||||||
fi
|
|
||||||
register: csi_driver_previous_logs
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Show sidecar previous logs on failure
|
|
||||||
shell: |
|
|
||||||
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
|
|
||||||
if [ -n "$pod" ]; then
|
|
||||||
for container in csi-attacher csi-resizer csi-provisioner; do
|
|
||||||
echo "===== $container ====="
|
|
||||||
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
register: csi_sidecar_previous_logs
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Show recent kube-system events on failure
|
|
||||||
command: kubectl -n kube-system get events --sort-by=.lastTimestamp
|
|
||||||
register: csi_recent_events
|
|
||||||
changed_when: false
|
|
||||||
failed_when: false
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Fail with CSI controller diagnostics
|
|
||||||
fail:
|
|
||||||
msg: |
|
|
||||||
CSI controller rollout failed.
|
|
||||||
Deployment status:
|
|
||||||
{{ csi_controller_deploy_status.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
Pods status:
|
|
||||||
{{ csi_controller_pods_status.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
Deployment describe:
|
|
||||||
{{ csi_controller_deploy_describe.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
Pod describe:
|
|
||||||
{{ csi_controller_pod_describe.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
hcloud-csi-driver logs:
|
|
||||||
{{ csi_driver_logs.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
hcloud-csi-driver previous logs:
|
|
||||||
{{ csi_driver_previous_logs.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
Sidecar previous logs:
|
|
||||||
{{ csi_sidecar_previous_logs.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
Recent kube-system events:
|
|
||||||
{{ csi_recent_events.stdout | default('n/a') }}
|
|
||||||
when: csi_controller_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Wait for CSI node daemonset rollout
|
|
||||||
command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
|
|
||||||
register: csi_node_rollout
|
|
||||||
until: csi_node_rollout.rc == 0
|
|
||||||
retries: "{{ csi_rollout_retries | int }}"
|
|
||||||
delay: "{{ csi_rollout_delay_seconds | int }}"
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Fail when CSI node daemonset rollout does not complete
|
|
||||||
fail:
|
|
||||||
msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}"
|
|
||||||
when: csi_node_rollout.rc != 0
|
|
||||||
|
|
||||||
- name: Generate CSI smoke test run identifier
|
|
||||||
set_fact:
|
|
||||||
csi_smoke_test_run_id: "{{ lookup('pipe', 'date +%s') }}"
|
|
||||||
when: csi_smoke_test_enabled | bool
|
|
||||||
|
|
||||||
- name: Generate unique CSI smoke test resource names
|
|
||||||
set_fact:
|
|
||||||
csi_smoke_test_pvc_name: "csi-smoke-pvc-{{ csi_smoke_test_run_id }}"
|
|
||||||
csi_smoke_test_job_name: "csi-smoke-job-{{ csi_smoke_test_run_id }}"
|
|
||||||
when: csi_smoke_test_enabled | bool
|
|
||||||
|
|
||||||
- name: Cleanup stale CSI smoke test resources before apply
|
|
||||||
shell: |
|
|
||||||
kubectl -n kube-system delete job,pvc -l app.kubernetes.io/name=csi-smoke --ignore-not-found --wait=true
|
|
||||||
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when: csi_smoke_test_enabled | bool
|
|
||||||
|
|
||||||
- name: Apply CSI smoke test resources
|
|
||||||
shell: |
|
|
||||||
kubectl apply -f - <<'EOF'
|
|
||||||
apiVersion: storage.k8s.io/v1
|
|
||||||
kind: StorageClass
|
|
||||||
metadata:
|
|
||||||
name: {{ csi_smoke_test_storage_class }}
|
|
||||||
provisioner: csi.hetzner.cloud
|
|
||||||
reclaimPolicy: Delete
|
|
||||||
volumeBindingMode: Immediate
|
|
||||||
allowVolumeExpansion: true
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: {{ csi_smoke_test_pvc_name }}
|
|
||||||
namespace: kube-system
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: csi-smoke
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: {{ csi_smoke_test_size }}
|
|
||||||
storageClassName: {{ csi_smoke_test_storage_class }}
|
|
||||||
---
|
|
||||||
apiVersion: batch/v1
|
|
||||||
kind: Job
|
|
||||||
metadata:
|
|
||||||
name: {{ csi_smoke_test_job_name }}
|
|
||||||
namespace: kube-system
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: csi-smoke
|
|
||||||
spec:
|
|
||||||
backoffLimit: 0
|
|
||||||
template:
|
|
||||||
spec:
|
|
||||||
restartPolicy: Never
|
|
||||||
containers:
|
|
||||||
- name: write-and-read
|
|
||||||
image: busybox:1.36
|
|
||||||
command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"]
|
|
||||||
volumeMounts:
|
|
||||||
- name: data
|
|
||||||
mountPath: /data
|
|
||||||
volumes:
|
|
||||||
- name: data
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: {{ csi_smoke_test_pvc_name }}
|
|
||||||
EOF
|
|
||||||
changed_when: true
|
|
||||||
when: csi_smoke_test_enabled | bool
|
|
||||||
|
|
||||||
- name: Wait for CSI smoke PVC to bind
|
|
||||||
command: kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/{{ csi_smoke_test_pvc_name }} --timeout={{ csi_smoke_test_pvc_timeout_seconds }}s
|
|
||||||
register: csi_smoke_pvc_wait
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when: csi_smoke_test_enabled | bool
|
|
||||||
|
|
||||||
- name: Wait for CSI smoke Job completion
|
|
||||||
command: kubectl -n kube-system wait --for=condition=complete job/{{ csi_smoke_test_job_name }} --timeout={{ csi_smoke_test_job_timeout_seconds }}s
|
|
||||||
register: csi_smoke_job_wait
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc == 0
|
|
||||||
|
|
||||||
- name: Show CSI smoke job logs
|
|
||||||
command: kubectl -n kube-system logs job/{{ csi_smoke_test_job_name }}
|
|
||||||
register: csi_smoke_job_logs
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when: csi_smoke_test_enabled | bool
|
|
||||||
|
|
||||||
- name: Show CSI smoke PVC on failure
|
|
||||||
command: kubectl -n kube-system get pvc {{ csi_smoke_test_pvc_name }} -o wide
|
|
||||||
register: csi_smoke_pvc_status
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
|
|
||||||
- name: Show CSI smoke Job on failure
|
|
||||||
command: kubectl -n kube-system get job {{ csi_smoke_test_job_name }} -o wide
|
|
||||||
register: csi_smoke_job_status
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
|
|
||||||
- name: Show CSI smoke pods on failure
|
|
||||||
command: kubectl -n kube-system get pod -l job-name={{ csi_smoke_test_job_name }} -o wide
|
|
||||||
register: csi_smoke_pod_status
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
|
|
||||||
- name: Describe CSI smoke PVC on failure
|
|
||||||
command: kubectl -n kube-system describe pvc {{ csi_smoke_test_pvc_name }}
|
|
||||||
register: csi_smoke_pvc_describe
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
|
|
||||||
- name: Show storage classes on failure
|
|
||||||
command: kubectl get storageclass
|
|
||||||
register: csi_storageclasses
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
|
|
||||||
- name: Get CSI controller pod name on smoke failure
|
|
||||||
shell: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}'
|
|
||||||
register: csi_controller_pod_name
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
|
|
||||||
- name: Describe CSI controller pod on smoke failure
|
|
||||||
command: kubectl -n kube-system describe pod {{ csi_controller_pod_name.stdout }}
|
|
||||||
register: csi_controller_pod_smoke_describe
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
- csi_controller_pod_name.stdout | length > 0
|
|
||||||
|
|
||||||
- name: Show CSI controller container logs on smoke failure
|
|
||||||
shell: |
|
|
||||||
pod="{{ csi_controller_pod_name.stdout }}"
|
|
||||||
for container in hcloud-csi-driver csi-provisioner csi-attacher csi-resizer liveness-probe; do
|
|
||||||
echo "===== ${container}: current ====="
|
|
||||||
kubectl -n kube-system logs "$pod" -c "$container" --tail={{ csi_failure_log_tail_lines }} || true
|
|
||||||
echo "===== ${container}: previous ====="
|
|
||||||
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
|
|
||||||
done
|
|
||||||
register: csi_controller_container_logs
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
- csi_controller_pod_name.stdout | length > 0
|
|
||||||
|
|
||||||
- name: Show CSI driver and node driver objects on smoke failure
|
|
||||||
shell: |
|
|
||||||
echo "===== CSIDriver ====="
|
|
||||||
kubectl get csidriver csi.hetzner.cloud -o yaml || true
|
|
||||||
echo "===== CSINode ====="
|
|
||||||
kubectl get csinode -o wide || true
|
|
||||||
register: csi_driver_objects
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
|
|
||||||
- name: Show CSI smoke pod describe on failure
|
|
||||||
shell: |
|
|
||||||
pod="$(kubectl -n kube-system get pods -l job-name={{ csi_smoke_test_job_name }} -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
|
|
||||||
if [ -n "$pod" ]; then
|
|
||||||
kubectl -n kube-system describe pod "$pod"
|
|
||||||
fi
|
|
||||||
register: csi_smoke_pod_describe
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
|
|
||||||
- name: Fail when CSI smoke test fails
|
|
||||||
fail:
|
|
||||||
msg: |
|
|
||||||
CSI smoke test failed.
|
|
||||||
PVC wait:
|
|
||||||
stdout: {{ csi_smoke_pvc_wait.stdout | default('') }}
|
|
||||||
stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
|
|
||||||
|
|
||||||
Job wait:
|
|
||||||
stdout: {{ csi_smoke_job_wait.stdout | default('') }}
|
|
||||||
stderr: {{ csi_smoke_job_wait.stderr | default('') }}
|
|
||||||
|
|
||||||
PVC:
|
|
||||||
{{ csi_smoke_pvc_status.stdout | default(csi_smoke_pvc_status.stderr | default('n/a')) }}
|
|
||||||
|
|
||||||
Job:
|
|
||||||
{{ csi_smoke_job_status.stdout | default(csi_smoke_job_status.stderr | default('n/a')) }}
|
|
||||||
|
|
||||||
Pod list:
|
|
||||||
{{ csi_smoke_pod_status.stdout | default(csi_smoke_pod_status.stderr | default('n/a')) }}
|
|
||||||
|
|
||||||
PVC describe:
|
|
||||||
{{ csi_smoke_pvc_describe.stdout | default(csi_smoke_pvc_describe.stderr | default('n/a')) }}
|
|
||||||
|
|
||||||
Storage classes:
|
|
||||||
{{ csi_storageclasses.stdout | default(csi_storageclasses.stderr | default('n/a')) }}
|
|
||||||
|
|
||||||
CSI controller pod:
|
|
||||||
{{ csi_controller_pod_name.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
CSI controller pod describe:
|
|
||||||
{{ csi_controller_pod_smoke_describe.stdout | default(csi_controller_pod_smoke_describe.stderr | default('n/a')) }}
|
|
||||||
|
|
||||||
CSI controller container logs:
|
|
||||||
{{ csi_controller_container_logs.stdout | default(csi_controller_container_logs.stderr | default('n/a')) }}
|
|
||||||
|
|
||||||
CSI driver objects:
|
|
||||||
{{ csi_driver_objects.stdout | default(csi_driver_objects.stderr | default('n/a')) }}
|
|
||||||
|
|
||||||
Pod describe:
|
|
||||||
{{ csi_smoke_pod_describe.stdout | default('n/a') }}
|
|
||||||
|
|
||||||
Job logs:
|
|
||||||
{{ csi_smoke_job_logs.stdout | default('n/a') }}
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
- csi_smoke_test_required | bool
|
|
||||||
|
|
||||||
- name: Warn when CSI smoke test fails but is non-blocking
|
|
||||||
debug:
|
|
||||||
msg: |
|
|
||||||
CSI smoke test failed but csi_smoke_test_required is false, so deployment will continue.
|
|
||||||
PVC wait stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
|
|
||||||
Job wait stderr: {{ csi_smoke_job_wait.stderr | default('') }}
|
|
||||||
when:
|
|
||||||
- csi_smoke_test_enabled | bool
|
|
||||||
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
|
|
||||||
- not (csi_smoke_test_required | bool)
|
|
||||||
|
|
||||||
# Best-effort removal of the smoke-test Job, PVC and StorageClass; failures
# are ignored (failed_when: false).
# Resources are addressed as TYPE/NAME: in the form
# `kubectl delete job A pvc B`, kubectl reads "pvc" and "B" as additional
# Job names, so the PVC would be left behind.
- name: Cleanup CSI smoke test resources
  shell: |
    kubectl -n kube-system delete job/{{ csi_smoke_test_job_name }} pvc/{{ csi_smoke_test_pvc_name }} --ignore-not-found
    kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
  failed_when: false
  changed_when: false
  when: csi_smoke_test_enabled | bool
|
|
||||||
50
ansible/roles/doppler-bootstrap/tasks/main.yml
Normal file
50
ansible/roles/doppler-bootstrap/tasks/main.yml
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
---
|
||||||
|
# Stop early unless a non-empty Doppler service token was supplied.
# default('') lets an undefined variable reach the assertion and produce
# fail_msg, instead of aborting with an undefined-variable templating error.
- name: Ensure Doppler service token is provided
  assert:
    that:
      - doppler_hetznerterra_service_token | default('') | length > 0
    fail_msg: doppler_hetznerterra_service_token must be provided for External Secrets bootstrap.
|
||||||
|
|
||||||
|
# Render the external-secrets Namespace manifest client-side and pipe it into
# `kubectl apply`. Always reported as changed.
- name: Ensure external-secrets namespace exists
  shell:
    cmd: >-
      kubectl create namespace external-secrets --dry-run=client -o yaml
      | kubectl apply -f -
  changed_when: true
|
||||||
|
|
||||||
|
# Create or update the doppler-hetznerterra-service-token secret (key:
# dopplerToken) in the external-secrets namespace by rendering the manifest
# client-side and piping it into `kubectl apply`.
# The token passes through the `quote` filter so shell metacharacters in it
# cannot break the command line.
- name: Apply Doppler service token secret
  shell: >-
    kubectl -n external-secrets create secret generic doppler-hetznerterra-service-token
    --from-literal=dopplerToken={{ doppler_hetznerterra_service_token | quote }}
    --dry-run=client -o yaml | kubectl apply -f -
  changed_when: true
  no_log: true  # the command line embeds the service token; keep it out of task output
|
||||||
|
|
||||||
|
# Probe for the clustersecretstores.external-secrets.io CRD.
# The task never fails or reports a change; the result is stored in
# doppler_clustersecretstore_crd.
- name: Check for ClusterSecretStore CRD
  command:
    cmd: kubectl get crd clustersecretstores.external-secrets.io
  failed_when: false
  changed_when: false
  register: doppler_clustersecretstore_crd
|
||||||
|
|
||||||
|
# Apply the doppler-hetznerterra ClusterSecretStore, which points at the
# dopplerToken key of the doppler-hetznerterra-service-token secret in the
# external-secrets namespace.
# Runs only when the CRD probe returned rc 0.
- name: Apply Doppler ClusterSecretStore
  when: doppler_clustersecretstore_crd.rc == 0
  shell: |
    kubectl apply -f - <<'EOF'
    apiVersion: external-secrets.io/v1
    kind: ClusterSecretStore
    metadata:
      name: doppler-hetznerterra
    spec:
      provider:
        doppler:
          auth:
            secretRef:
              dopplerToken:
                name: doppler-hetznerterra-service-token
                key: dopplerToken
                namespace: external-secrets
    EOF
  changed_when: true
|
||||||
|
|
||||||
|
# Informational message for the case where the CRD probe returned a non-zero
# rc and the ClusterSecretStore could therefore not be applied.
- name: Note pending Doppler ClusterSecretStore bootstrap
  when: doppler_clustersecretstore_crd.rc != 0
  debug:
    msg: >-
      Skipping Doppler ClusterSecretStore bootstrap because
      the External Secrets CRD is not available yet.
      Re-run after External Secrets is installed.
|
||||||
@@ -3,3 +3,4 @@ k3s_version: latest
|
|||||||
k3s_server_url: ""
|
k3s_server_url: ""
|
||||||
k3s_token: ""
|
k3s_token: ""
|
||||||
k3s_node_ip: ""
|
k3s_node_ip: ""
|
||||||
|
k3s_kubelet_cloud_provider_external: false
|
||||||
|
|||||||
@@ -12,14 +12,41 @@
|
|||||||
when: not k3s_agent_binary.stat.exists
|
when: not k3s_agent_binary.stat.exists
|
||||||
|
|
||||||
- name: Install k3s agent
|
- name: Install k3s agent
|
||||||
environment:
|
|
||||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
|
||||||
K3S_URL: "{{ k3s_server_url }}"
|
|
||||||
K3S_TOKEN: "{{ k3s_token }}"
|
|
||||||
command: /tmp/install-k3s.sh agent --node-ip {{ k3s_node_ip }}
|
|
||||||
args:
|
|
||||||
creates: /usr/local/bin/k3s-agent
|
|
||||||
when: not k3s_agent_binary.stat.exists
|
when: not k3s_agent_binary.stat.exists
|
||||||
|
block:
|
||||||
|
- name: Run k3s agent install
|
||||||
|
environment:
|
||||||
|
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||||
|
K3S_URL: "{{ k3s_server_url }}"
|
||||||
|
K3S_TOKEN: "{{ k3s_token }}"
|
||||||
|
command: >-
|
||||||
|
/tmp/install-k3s.sh agent
|
||||||
|
--node-ip {{ k3s_node_ip }}
|
||||||
|
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||||
|
args:
|
||||||
|
creates: /usr/local/bin/k3s-agent
|
||||||
|
rescue:
|
||||||
|
- name: Show k3s-agent service status after failed install
|
||||||
|
command: systemctl status k3s-agent --no-pager
|
||||||
|
register: k3s_agent_status_after_install
|
||||||
|
changed_when: false
|
||||||
|
failed_when: false
|
||||||
|
|
||||||
|
- name: Show recent k3s-agent logs after failed install
|
||||||
|
command: journalctl -u k3s-agent -n 120 --no-pager
|
||||||
|
register: k3s_agent_journal_after_install
|
||||||
|
changed_when: false
|
||||||
|
failed_when: false
|
||||||
|
|
||||||
|
- name: Fail with k3s-agent diagnostics
|
||||||
|
fail:
|
||||||
|
msg: |
|
||||||
|
k3s agent install failed on {{ inventory_hostname }}.
|
||||||
|
Service status:
|
||||||
|
{{ k3s_agent_status_after_install.stdout | default('n/a') }}
|
||||||
|
|
||||||
|
Recent logs:
|
||||||
|
{{ k3s_agent_journal_after_install.stdout | default('n/a') }}
|
||||||
|
|
||||||
- name: Wait for k3s agent to be ready
|
- name: Wait for k3s agent to be ready
|
||||||
command: systemctl is-active k3s-agent
|
command: systemctl is-active k3s-agent
|
||||||
|
|||||||
@@ -3,3 +3,6 @@ k3s_version: latest
|
|||||||
k3s_token: ""
|
k3s_token: ""
|
||||||
k3s_node_ip: ""
|
k3s_node_ip: ""
|
||||||
k3s_primary_public_ip: ""
|
k3s_primary_public_ip: ""
|
||||||
|
k3s_disable_embedded_ccm: true
|
||||||
|
k3s_disable_servicelb: true
|
||||||
|
k3s_kubelet_cloud_provider_external: false
|
||||||
|
|||||||
@@ -28,27 +28,22 @@
|
|||||||
stat:
|
stat:
|
||||||
path: /usr/local/bin/k3s-uninstall.sh
|
path: /usr/local/bin/k3s-uninstall.sh
|
||||||
register: k3s_uninstall_script
|
register: k3s_uninstall_script
|
||||||
when:
|
when: k3s_install_needed
|
||||||
- not (k3s_primary | default(false))
|
|
||||||
- k3s_install_needed
|
|
||||||
|
|
||||||
- name: Reset broken secondary k3s install before rejoin
|
- name: Reset broken k3s install before reinstall
|
||||||
command: /usr/local/bin/k3s-uninstall.sh
|
command: /usr/local/bin/k3s-uninstall.sh
|
||||||
when:
|
when:
|
||||||
- not (k3s_primary | default(false))
|
|
||||||
- k3s_install_needed
|
- k3s_install_needed
|
||||||
- k3s_uninstall_script.stat.exists
|
- k3s_uninstall_script.stat.exists
|
||||||
|
|
||||||
- name: Remove stale k3s data on secondary
|
- name: Remove stale k3s data
|
||||||
file:
|
file:
|
||||||
path: "{{ item }}"
|
path: "{{ item }}"
|
||||||
state: absent
|
state: absent
|
||||||
loop:
|
loop:
|
||||||
- /etc/rancher/k3s
|
- /etc/rancher/k3s
|
||||||
- /var/lib/rancher/k3s
|
- /var/lib/rancher/k3s
|
||||||
when:
|
when: k3s_install_needed
|
||||||
- not (k3s_primary | default(false))
|
|
||||||
- k3s_install_needed
|
|
||||||
|
|
||||||
- name: Download k3s install script
|
- name: Download k3s install script
|
||||||
get_url:
|
get_url:
|
||||||
@@ -61,7 +56,16 @@
|
|||||||
environment:
|
environment:
|
||||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||||
K3S_TOKEN: "{{ k3s_token }}"
|
K3S_TOKEN: "{{ k3s_token }}"
|
||||||
command: /tmp/install-k3s.sh server --cluster-init --advertise-address={{ k3s_primary_ip }} --node-ip={{ k3s_node_ip }} --tls-san={{ k3s_primary_ip }} --tls-san={{ k3s_primary_public_ip }}
|
command: >-
|
||||||
|
/tmp/install-k3s.sh server
|
||||||
|
--cluster-init
|
||||||
|
--advertise-address={{ k3s_primary_ip }}
|
||||||
|
--node-ip={{ k3s_node_ip }}
|
||||||
|
--tls-san={{ k3s_primary_ip }}
|
||||||
|
--tls-san={{ k3s_primary_public_ip }}
|
||||||
|
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
|
||||||
|
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||||
|
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||||
when:
|
when:
|
||||||
- k3s_install_needed
|
- k3s_install_needed
|
||||||
- k3s_primary | default(false)
|
- k3s_primary | default(false)
|
||||||
@@ -75,7 +79,14 @@
|
|||||||
environment:
|
environment:
|
||||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||||
K3S_TOKEN: "{{ k3s_token }}"
|
K3S_TOKEN: "{{ k3s_token }}"
|
||||||
command: /tmp/install-k3s.sh server --server https://{{ k3s_primary_ip }}:6443 --advertise-address={{ k3s_node_ip }} --node-ip={{ k3s_node_ip }}
|
command: >-
|
||||||
|
/tmp/install-k3s.sh server
|
||||||
|
--server https://{{ k3s_primary_ip }}:6443
|
||||||
|
--advertise-address={{ k3s_node_ip }}
|
||||||
|
--node-ip={{ k3s_node_ip }}
|
||||||
|
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
|
||||||
|
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||||
|
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||||
register: secondary_install
|
register: secondary_install
|
||||||
|
|
||||||
rescue:
|
rescue:
|
||||||
|
|||||||
9
ansible/roles/observability-content/defaults/main.yml
Normal file
9
ansible/roles/observability-content/defaults/main.yml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
---
# Defaults for the observability-content role.

# Namespace used by every kubectl call and rendered ConfigMap in this role.
observability_namespace: "observability"

# Names of the ConfigMaps rendered from the role templates.
grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
grafana_datasource_configmap_name: "grafana-datasources-core"

# Gates the Loki probe, fallback, label check and datasource tasks.
loki_enabled: true

# In-cluster service URLs that are probed first and used as datasource
# targets when reachable from the Grafana pod.
grafana_prometheus_url: "http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090"
grafana_loki_url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100"

# When true and the in-cluster probe fails, the datasource URL is switched
# to http://<pod host IP>:<service NodePort>.
grafana_use_prometheus_nodeport_fallback: true
grafana_use_loki_nodeport_fallback: true
|
||||||
173
ansible/roles/observability-content/tasks/main.yml
Normal file
173
ansible/roles/observability-content/tasks/main.yml
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
---
|
||||||
|
# Create the namespace when missing. kubectl's "AlreadyExists" error is
# tolerated so the task can be re-run safely.
- name: Ensure observability namespace exists
  command:
    cmd: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  changed_when: create_observability_ns.rc == 0
  failed_when:
    - create_observability_ns.rc != 0
    - "'AlreadyExists' not in create_observability_ns.stderr"

# Block (up to five minutes) until the Grafana deployment has rolled out,
# because later tasks exec into its pod.
- name: Wait for Grafana deployment rollout
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} rollout status
      deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false
|
||||||
|
|
||||||
|
# Start from the configured in-cluster URLs. The NodePort fallback tasks
# may overwrite these facts when a probe fails.
- name: Set default Prometheus datasource URL
  set_fact:
    grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
    grafana_loki_effective_url: "{{ grafana_loki_url }}"

# The probes run inside the Grafana container, so look up the first pod
# carrying the Grafana label once and reuse its name.
- name: Get Grafana pod name
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - get
      - pod
      - "-l"
      - app.kubernetes.io/name=grafana
      - "-o"
      - 'jsonpath={.items[0].metadata.name}'
  register: grafana_pod_name
  changed_when: false

# argv form: no host shell is involved, only the "sh -c" inside the
# container interprets the redirect. A non-zero rc is recorded (not fatal)
# and drives the fallback logic.
- name: Probe Prometheus from Grafana pod via default datasource URL
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - exec
      - "{{ grafana_pod_name.stdout }}"
      - "-c"
      - grafana
      - "--"
      - sh
      - "-c"
      - "wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null"
  register: grafana_prometheus_probe
  changed_when: false
  failed_when: false

# Same probe against Loki's /ready endpoint; skipped when Loki is disabled.
- name: Probe Loki from Grafana pod via default datasource URL
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - exec
      - "{{ grafana_pod_name.stdout }}"
      - "-c"
      - grafana
      - "--"
      - sh
      - "-c"
      - "wget -qO- --timeout=5 {{ grafana_loki_url }}/ready >/dev/null"
  register: grafana_loki_probe
  changed_when: false
  failed_when: false
  when: loki_enabled
|
||||||
|
|
||||||
|
# Only runs when the fallback is enabled and the in-cluster probe failed.
# The shared conditions live on the block and apply to every task in it.
- name: Resolve Prometheus NodePort fallback
  when:
    - grafana_use_prometheus_nodeport_fallback | bool
    - grafana_prometheus_probe.rc != 0
  block:
    - name: Get Prometheus pod host IP for fallback
      command: >-
        kubectl -n {{ observability_namespace }} get pod
        -l app.kubernetes.io/name=prometheus
        -o jsonpath='{.items[0].status.hostIP}'
      register: prometheus_host_ip
      changed_when: false

    - name: Get Prometheus service NodePort for fallback
      command: >-
        kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
        -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
      register: prometheus_nodeport
      changed_when: false

    # Switch the effective URL only when both lookups returned a value;
    # otherwise the default URL set earlier stays in place.
    - name: Enable Prometheus NodePort fallback datasource URL
      set_fact:
        grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
      when:
        - prometheus_host_ip.stdout | length > 0
        - prometheus_nodeport.stdout | length > 0
|
||||||
|
|
||||||
|
# Only runs when Loki is enabled, the fallback is enabled and the
# in-cluster probe failed. The shared conditions live on the block.
- name: Resolve Loki NodePort fallback
  when:
    - loki_enabled
    - grafana_use_loki_nodeport_fallback | bool
    - grafana_loki_probe.rc != 0
  block:
    # Best effort (errors are ignored). Reports "changed" only when kubectl
    # actually modified the service, i.e. its output lacks "(no change)".
    - name: Ensure Loki service uses NodePort for fallback
      command: >-
        kubectl -n {{ observability_namespace }} patch svc loki
        -p '{"spec":{"type":"NodePort"}}'
      register: loki_nodeport_patch
      changed_when:
        - loki_nodeport_patch.rc == 0
        - "'(no change)' not in loki_nodeport_patch.stdout"
      failed_when: false

    - name: Get Loki pod host IP for fallback
      command: >-
        kubectl -n {{ observability_namespace }} get pod loki-0
        -o jsonpath='{.status.hostIP}'
      register: loki_host_ip
      changed_when: false

    - name: Get Loki service NodePort for fallback
      command: >-
        kubectl -n {{ observability_namespace }} get svc loki
        -o jsonpath='{.spec.ports[?(@.name=="http-metrics")].nodePort}'
      register: loki_nodeport
      changed_when: false

    # Switch the effective URL only when both lookups returned a value.
    - name: Enable Loki NodePort fallback datasource URL
      set_fact:
        grafana_loki_effective_url: "http://{{ loki_host_ip.stdout }}:{{ loki_nodeport.stdout }}"
      when:
        - loki_host_ip.stdout | length > 0
        - loki_nodeport.stdout | length > 0
|
||||||
|
|
||||||
|
# Ask Loki for its label names through the Grafana container, using the
# effective (possibly fallback) URL. argv form avoids a host shell; a
# failure is recorded in rc rather than aborting the play.
- name: Query Loki labels endpoint from Grafana pod
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - exec
      - "{{ grafana_pod_name.stdout }}"
      - "-c"
      - grafana
      - "--"
      - sh
      - "-c"
      - "wget -qO- --timeout=10 {{ grafana_loki_effective_url }}/loki/api/v1/labels"
  register: grafana_loki_labels
  changed_when: false
  failed_when: false
  when: loki_enabled

# Abort only when the query succeeded AND the response reports success with
# an empty data list. Spaces are stripped before matching so formatting
# differences in the JSON do not matter.
- name: Fail when Loki is reachable but has zero indexed labels
  fail:
    msg: >-
      Loki is reachable from Grafana at {{ grafana_loki_effective_url }} but /loki/api/v1/labels returned no labels.
      This usually means no logs are ingested yet. Check Promtail and tenant configuration.
  when:
    - loki_enabled
    - grafana_loki_labels.rc == 0
    - "'\"status\":\"success\"' in (grafana_loki_labels.stdout | replace(' ', ''))"
    - "'\"data\":[]' in (grafana_loki_labels.stdout | replace(' ', ''))"
|
||||||
|
|
||||||
|
# Render and apply the ConfigMap holding the default Prometheus datasource.
- name: Write default Prometheus datasource ConfigMap patch
  template:
    src: grafana-default-prometheus-datasource.yaml.j2
    dest: /tmp/grafana-default-prometheus-datasource.yaml
    mode: "0644"

# kubectl prints "<resource> unchanged" when nothing differed, so report a
# change only for created/configured results.
- name: Apply default Prometheus datasource ConfigMap patch
  command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
  register: grafana_prometheus_datasource_apply
  changed_when: "'unchanged' not in grafana_prometheus_datasource_apply.stdout"

# Best-effort cleanup of the older Loki datasource ConfigMap. Counts as a
# change only when kubectl reports a deletion.
- name: Remove legacy Loki datasource ConfigMap
  command: >-
    kubectl -n {{ observability_namespace }} delete configmap
    grafana-datasource-loki --ignore-not-found=true
  register: legacy_loki_datasource_delete
  changed_when: "'deleted' in legacy_loki_datasource_delete.stdout"
  failed_when: false

# The shared datasources ConfigMap currently only carries Loki, so it is
# rendered and applied only when Loki is enabled.
- name: Write Grafana datasources ConfigMap
  template:
    src: grafana-datasources.yaml.j2
    dest: /tmp/grafana-datasources.yaml
    mode: "0644"
  when: loki_enabled

- name: Apply Grafana datasources ConfigMap
  command: kubectl apply -f /tmp/grafana-datasources.yaml
  register: grafana_datasources_apply
  changed_when: "'unchanged' not in grafana_datasources_apply.stdout"
  when: loki_enabled

# Grafana is restarted unconditionally so datasource changes are picked up
# on this run, then the play waits for the new rollout to finish.
- name: Restart Grafana to load datasource updates deterministically
  command: >-
    kubectl -n {{ observability_namespace }} rollout restart
    deployment/kube-prometheus-stack-grafana
  changed_when: true

- name: Wait for Grafana rollout after datasource update
  command: >-
    kubectl -n {{ observability_namespace }} rollout status
    deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false
|
||||||
|
|
||||||
|
# Render and apply the dashboard ConfigMap.
- name: Write Grafana dashboard ConfigMap
  template:
    src: grafana-dashboard-k8s-overview.yaml.j2
    dest: /tmp/grafana-dashboard-k8s-overview.yaml
    mode: "0644"

# kubectl prints "<resource> unchanged" when nothing differed, so report a
# change only for created/configured results.
- name: Apply Grafana dashboard ConfigMap
  command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
  register: grafana_dashboard_apply
  changed_when: "'unchanged' not in grafana_dashboard_apply.stdout"

# Human-readable recap of what was provisioned, including the effective
# (possibly fallback) datasource URLs.
- name: Show Grafana content provisioning summary
  debug:
    msg: |
      Grafana content applied.
      Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
      Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
      Loki datasource URL: {{ grafana_loki_effective_url }}
      Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
# ConfigMap carrying the "K8s Cluster Overview" dashboard. It is labelled
# grafana_dashboard so a dashboard sidecar can discover it.
#
# Panels reference the Prometheus datasource by its fixed uid "prometheus"
# (the uid assigned in the default Prometheus datasource ConfigMap).
# Import-style DS_* placeholders are not substituted for provisioned
# dashboards, so a literal uid is required here.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ grafana_dashboard_configmap_name }}
  namespace: {{ observability_namespace }}
  labels:
    grafana_dashboard: "1"
data:
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
          "targets": [
            {
              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {"list": []},
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# ConfigMap with additional Grafana datasources, labelled grafana_datasource
# for sidecar discovery. Only the Loki entry exists today, and it is emitted
# only when loki_enabled is true. The URL is the effective one chosen by
# the role (in-cluster URL or NodePort fallback).
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: {{ observability_namespace }}
  name: {{ grafana_datasource_configmap_name }}
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
{% if loki_enabled %}
      - name: Loki
        type: loki
        url: "{{ grafana_loki_effective_url }}"
        access: proxy
        isDefault: false
{% endif %}
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
# Contents for the kube-prometheus-stack-grafana-datasource ConfigMap.
# Declares Prometheus as the default datasource with the fixed uid
# "prometheus", pointing at the effective URL chosen by the role, plus an
# Alertmanager datasource in the same namespace.
# NOTE(review): no grafana_datasource label is set here; presumably the
# ConfigMap already exists with that label from the chart - confirm.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: {{ observability_namespace }}
  name: kube-prometheus-stack-grafana-datasource
data:
  datasource.yaml: |-
    apiVersion: 1
    datasources:
      - name: "Prometheus"
        uid: prometheus
        type: prometheus
        access: proxy
        url: "{{ grafana_prometheus_effective_url }}/"
        isDefault: true
        jsonData:
          httpMethod: POST
          timeInterval: 30s
      - name: "Alertmanager"
        uid: alertmanager
        type: alertmanager
        access: proxy
        url: http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/
        jsonData:
          handleGrafanaManagedAlerts: false
          implementation: prometheus
|
||||||
27
ansible/roles/observability/defaults/main.yml
Normal file
27
ansible/roles/observability/defaults/main.yml
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
---
# Defaults for the observability role.

# Namespace all Helm releases and kubectl calls of this role target.
observability_namespace: "observability"

# Chart versions passed to helm via --version.
prometheus_chart_version: "68.4.4"
loki_chart_version: "6.10.0"
promtail_chart_version: "6.16.6"

# Grafana admin password. When left empty the role generates a random
# 32-character value.
grafana_admin_password: ""

# Persistent volume sizes rendered into the chart values.
prometheus_storage_size: "10Gi"
grafana_storage_size: "5Gi"
loki_storage_size: "10Gi"

# Storage classes rendered into the chart values.
prometheus_storage_class: "local-path"
grafana_storage_class: "local-path"
loki_storage_class: "local-path"

# Gates every Loki and Promtail task in the role.
loki_enabled: true

# Tailscale settings. tailscale_tailnet is used when printing the tailnet
# FQDN in the access summary.
# NOTE(review): the OAuth values are not referenced by the role tasks
# visible here - presumably consumed by another role; confirm.
tailscale_oauth_client_id: ""
tailscale_oauth_client_secret: ""
tailscale_tailnet: ""

# When true (and the Tailscale operator is reported ready) Grafana and
# Prometheus are exposed as tailscale-class LoadBalancer services with the
# hostnames and proxy class below.
observability_tailscale_expose: true
grafana_tailscale_hostname: "grafana"
prometheus_tailscale_hostname: "prometheus"
tailscale_proxyclass_name: "infra-stable"
|
||||||
252
ansible/roles/observability/tasks/main.yml
Normal file
252
ansible/roles/observability/tasks/main.yml
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
---
|
||||||
|
# Detect an existing Helm binary. A non-zero rc (including "command not
# found") is recorded instead of failing so the install can be conditional.
- name: Check if Helm is installed
  command: helm version --short
  register: helm_check
  changed_when: false
  failed_when: false

# Install Helm with the upstream installer script. pipefail makes a failed
# download fail the task; without it the exit status of the pipeline is
# bash's, which succeeds on empty input.
- name: Install Helm
  shell: |
    set -o pipefail
    curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  args:
    executable: /bin/bash
  when: helm_check.rc != 0
  changed_when: true

# Create the namespace when missing. kubectl's "AlreadyExists" error is
# tolerated so the task can be re-run safely.
- name: Ensure observability namespace exists
  command: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
  changed_when: create_observability_ns.rc == 0
|
||||||
|
|
||||||
|
# Use the configured admin password, or generate a random 32-character
# alphanumeric one when none is provided. set_fact stores the result, so
# the same value is reused by later tasks of this play.
- name: Set Grafana admin password
  set_fact:
    grafana_password_effective: "{{ grafana_admin_password if grafana_admin_password | length > 0 else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"

# The rendered values file contains the Grafana admin password, so it is
# written readable by its owner only rather than world-readable in /tmp.
- name: Write kube-prometheus-stack values
  template:
    src: kube-prometheus-stack-values.yaml.j2
    dest: /tmp/kube-prometheus-stack-values.yaml
    mode: "0600"
|
||||||
|
|
||||||
|
# Register the chart repositories. An "already exists" error from helm is
# tolerated so the tasks can be re-run.
- name: Add Prometheus Helm repo
  command:
    cmd: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  register: add_prom_repo
  changed_when: add_prom_repo.rc == 0
  failed_when:
    - add_prom_repo.rc != 0
    - "'already exists' not in add_prom_repo.stderr"

- name: Add Grafana Helm repo
  command:
    cmd: helm repo add grafana https://grafana.github.io/helm-charts
  register: add_grafana_repo
  changed_when: add_grafana_repo.rc == 0
  failed_when:
    - add_grafana_repo.rc != 0
    - "'already exists' not in add_grafana_repo.stderr"

# Refresh the local chart index; never reported as a change.
- name: Update Helm repos
  command:
    cmd: helm repo update
  changed_when: false
|
||||||
|
|
||||||
|
# Remove Helm release secrets of kube-prometheus-stack that are stuck in a
# pending-* state so the upgrade below is not blocked. One set-based label
# selector covers all three states and needs no shell substitution, so an
# empty match is simply a no-op. Still best effort: errors are ignored.
- name: Clear stale pending Helm revision secrets for kube-prometheus-stack
  command: >-
    kubectl -n {{ observability_namespace }} delete secret
    -l 'owner=helm,name=kube-prometheus-stack,status in (pending-install,pending-upgrade,pending-rollback)'
    --ignore-not-found=true
  register: kube_prom_pending_cleanup
  changed_when: "'deleted' in kube_prom_pending_cleanup.stdout"
  failed_when: false

# Install or upgrade the chart and wait for readiness. The command is
# retried up to 12 times, 15 seconds apart, until helm exits 0.
- name: Install kube-prometheus-stack
  command: >-
    helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack
    --namespace {{ observability_namespace }}
    --version {{ prometheus_chart_version }}
    --values /tmp/kube-prometheus-stack-values.yaml
    --wait
    --timeout 10m
  register: kube_prom_install
  retries: 12
  delay: 15
  until: kube_prom_install.rc == 0
  changed_when: true
|
||||||
|
|
||||||
|
# Block (up to five minutes) until the Grafana deployment has rolled out,
# because the next task execs into its pod.
- name: Wait for Grafana deployment rollout
  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false

# Force the admin password stored in Grafana's database to the effective
# one. A shell is needed for the $(...) pod lookup, so the password is
# passed through the quote filter to survive any shell metacharacters,
# and no_log keeps it out of task output.
- name: Reset Grafana admin password in Grafana database
  shell: >-
    kubectl -n {{ observability_namespace }} exec
    "$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
    -c grafana -- grafana cli admin reset-admin-password {{ grafana_password_effective | quote }}
  no_log: true
  changed_when: true
|
||||||
|
|
||||||
|
# Render the Loki chart values (Loki tasks are skipped when disabled).
- name: Write Loki values
  when: loki_enabled
  template:
    src: loki-values.yaml.j2
    dest: /tmp/loki-values.yaml
    mode: "0644"

# Dry-render the chart before installing. The task fails unless helm exits
# 0 and the rendered manifests contain a StatefulSet.
- name: Validate Loki chart produces resources
  when: loki_enabled
  command:
    cmd: >-
      helm template loki grafana/loki
      --namespace {{ observability_namespace }}
      --version {{ loki_chart_version }}
      --values /tmp/loki-values.yaml
  register: loki_template
  changed_when: false
  failed_when: "not (loki_template.rc == 0 and 'kind: StatefulSet' in loki_template.stdout)"
|
||||||
|
|
||||||
|
# Best-effort cleanup before (re)installing Loki. Errors are ignored and
# the tasks never report a change.
# NOTE(review): these three cleanup tasks run on every play, not only when
# a release is actually stuck, and the first one also deletes
# statefulset/loki. Confirm this teardown on every run is intended.
- name: Remove legacy Loki resources
  when: loki_enabled
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} delete
      deployment/loki-gateway
      statefulset/loki
      statefulset/loki-chunks-cache
      statefulset/loki-results-cache
      statefulset/loki-backend
      statefulset/loki-read
      statefulset/loki-write
      poddisruptionbudget/loki-memcached-chunks-cache
      poddisruptionbudget/loki-memcached-results-cache
      --ignore-not-found=true
  changed_when: false
  failed_when: false

# Deletes the Helm release secret for revision 1 of the loki release.
- name: Clear stuck Helm lock for Loki
  when: loki_enabled
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} delete secret
      sh.helm.release.v1.loki.v1 --ignore-not-found=true
  changed_when: false
  failed_when: false

- name: Uninstall failed Loki release (if stuck)
  when: loki_enabled
  command:
    cmd: helm uninstall loki -n {{ observability_namespace }}
  changed_when: false
  failed_when: false

# Install or upgrade Loki. No --wait here; a later task waits for the
# StatefulSet rollout instead.
- name: Install Loki
  when: loki_enabled
  command:
    cmd: >-
      helm upgrade --install loki grafana/loki
      --namespace {{ observability_namespace }}
      --version {{ loki_chart_version }}
      --values /tmp/loki-values.yaml
  register: loki_install
  changed_when: true
|
||||||
|
|
||||||
|
# Wait (up to ten minutes) for the Loki StatefulSet to become ready.
- name: Wait for Loki StatefulSet
  when: loki_enabled
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} rollout status
      statefulset/loki --timeout=10m
  register: loki_rollout
  changed_when: false

# Capture and print the Loki pod listing for the play log.
- name: Show Loki pod status
  when: loki_enabled
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} get pods
      -l app.kubernetes.io/name=loki -o wide
  register: loki_pods
  changed_when: false

- name: Debug Loki pods
  when: loki_enabled
  debug:
    msg: "{{ loki_pods.stdout }}"
|
||||||
|
|
||||||
|
# Render the Promtail chart values (skipped when Loki is disabled).
- name: Write Promtail values
  when: loki_enabled
  template:
    src: promtail-values.yaml.j2
    dest: /tmp/promtail-values.yaml
    mode: "0644"

# Install or upgrade Promtail and wait up to ten minutes for readiness.
- name: Install Promtail
  when: loki_enabled
  command:
    cmd: >-
      helm upgrade --install promtail grafana/promtail
      --namespace {{ observability_namespace }}
      --version {{ promtail_chart_version }}
      --values /tmp/promtail-values.yaml
      --wait
      --timeout 10m
  changed_when: true
|
||||||
|
|
||||||
|
# Informational lookups for the access summary. They only run when
# Tailscale exposure is requested and the operator is reported ready.
# Every lookup is best effort: errors are ignored and nothing is changed.
- name: Collect Tailscale exposure details
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool
  block:
    # Status of the TailscaleProxyReady condition on each service.
    - name: Check Tailscale service readiness for Grafana
      command: >-
        kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
        -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
      register: grafana_tailscale_ready
      changed_when: false
      failed_when: false

    - name: Check Tailscale service readiness for Prometheus
      command: >-
        kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
        -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
      register: prometheus_tailscale_ready
      changed_when: false
      failed_when: false

    # Load balancer ingress IP, or hostname when no IP is set. The braces
    # of the go-template are escaped so Ansible passes them through.
    - name: Check Tailscale endpoint (IP/hostname) for Grafana
      shell: >-
        kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
        -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
      register: grafana_lb_ip
      changed_when: false
      failed_when: false

    - name: Check Tailscale endpoint (IP/hostname) for Prometheus
      shell: >-
        kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
        -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
      register: prometheus_lb_ip
      changed_when: false
      failed_when: false
|
||||||
|
|
||||||
|
# Exactly one of the two summaries below is printed: the Tailscale variant
# when exposure is requested and the operator is ready, else the fallback.
# Both print the effective Grafana admin password.
- name: Show Tailscale access details
  when: (observability_tailscale_expose | bool) and (tailscale_operator_ready | default(false) | bool)
  debug:
    msg: |
      Observability stack deployed with Tailscale access!

      Grafana: http://{{ grafana_tailscale_hostname }}{% if grafana_lb_ip.stdout | default('') | length > 0 %} (or http://{{ grafana_lb_ip.stdout }}){% endif %}
      Prometheus: http://{{ prometheus_tailscale_hostname }}{% if prometheus_lb_ip.stdout | default('') | length > 0 %} (or http://{{ prometheus_lb_ip.stdout }}){% endif %}

      Login: admin / {{ grafana_password_effective }}

      Tailscale readiness:
      - Grafana proxy ready: {{ grafana_tailscale_ready.stdout | default('pending') }}
      - Prometheus proxy ready: {{ prometheus_tailscale_ready.stdout | default('pending') }}

      Access via:
      - MagicDNS: http://{{ grafana_tailscale_hostname }} and http://{{ prometheus_tailscale_hostname }}
      - Tailnet FQDN: http://{{ grafana_tailscale_hostname }}.{{ tailscale_tailnet | default('tailnet.ts.net') }}
      - Direct endpoint: {% if grafana_lb_ip.stdout | default('') | length > 0 %}http://{{ grafana_lb_ip.stdout }}{% else %}(pending){% endif %} / {% if prometheus_lb_ip.stdout | default('') | length > 0 %}http://{{ prometheus_lb_ip.stdout }}{% else %}(pending){% endif %}

# Negation of the condition above, written out via De Morgan.
- name: Show observability access details (fallback)
  when: not (observability_tailscale_expose | bool) or not (tailscale_operator_ready | default(false) | bool)
  debug:
    msg: |
      Observability stack deployed.
      Namespace: {{ observability_namespace }}
      Grafana (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
      Prometheus (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
      Grafana admin password: {{ grafana_password_effective }}
      {% if loki_enabled %}
      Loki: Enabled - logs available in Grafana
      {% else %}
      Loki: Disabled
      {% endif %}
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
# ConfigMap named grafana-datasource-loki, labelled grafana_datasource for
# sidecar discovery. Declares a non-default Loki datasource pointing at the
# in-cluster Loki service on port 3100.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: {{ observability_namespace }}
  name: grafana-datasource-loki
  labels:
    grafana_datasource: "1"
data:
  loki-datasource.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
        access: proxy
        isDefault: false
|
||||||
@@ -0,0 +1,46 @@
|
|||||||
|
# Helm values for the kube-prometheus-stack release.
#
# - The admin password is emitted through to_json so it is always a quoted
#   string, whatever characters it contains.
# - Grafana and Prometheus are tailscale-class LoadBalancer services when
#   Tailscale exposure is requested and the operator is ready, otherwise
#   plain ClusterIP services. The flags are coerced with the bool filter,
#   matching the conditions used by the role tasks.
# - Alertmanager and the etcd / controller-manager / scheduler scrape
#   targets are disabled.
grafana:
  enabled: true
  adminPassword: {{ grafana_password_effective | to_json }}
  persistence:
    enabled: true
    storageClassName: {{ grafana_storage_class }}
    size: {{ grafana_storage_size }}
  service:
{% if (observability_tailscale_expose | bool) and (tailscale_operator_ready | default(false) | bool) %}
    type: LoadBalancer
    loadBalancerClass: tailscale
    annotations:
      tailscale.com/hostname: {{ grafana_tailscale_hostname }}
      tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
    type: ClusterIP
{% endif %}
prometheus:
  service:
{% if (observability_tailscale_expose | bool) and (tailscale_operator_ready | default(false) | bool) %}
    type: LoadBalancer
    loadBalancerClass: tailscale
    annotations:
      tailscale.com/hostname: {{ prometheus_tailscale_hostname }}
      tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
    type: ClusterIP
{% endif %}
  prometheusSpec:
    retention: 7d
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: {{ prometheus_storage_class }}
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: {{ prometheus_storage_size }}
alertmanager:
  enabled: false
kubeEtcd:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
|
||||||
75
ansible/roles/observability/templates/loki-values.yaml.j2
Normal file
75
ansible/roles/observability/templates/loki-values.yaml.j2
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
# Helm values for a single-binary Loki with filesystem storage.
deploymentMode: SingleBinary

loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      - from: "2024-04-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  storage:
    type: filesystem
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
    retention_period: 168h
  # retention_period above is only enforced when the compactor runs with
  # retention enabled; with retention on, Loki also requires a delete
  # request store to be configured.
  compactor:
    retention_enabled: true
    delete_request_store: filesystem
  pattern_ingester:
    enabled: true
  ruler:
    enable_api: true

singleBinary:
  replicas: 1
  persistence:
    size: {{ loki_storage_size }}
    storageClass: {{ loki_storage_class }}
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 1Gi

# All components of the scalable and distributed modes are scaled to zero.
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0

# Gateway, chart test, self-monitoring and canary are all switched off.
gateway:
  enabled: false

test:
  enabled: false

monitoring:
  selfMonitoring:
    enabled: false

lokiCanary:
  enabled: false
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
---
# Log client configuration: a single push target, the Loki service in the
# namespace given by the `observability_namespace` template variable.
# NOTE(review): presumably the Promtail chart values — confirm.
config:
  clients:
    - url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100/loki/api/v1/push"
|
||||||
58
ansible/roles/private-access/tasks/main.yml
Normal file
58
ansible/roles/private-access/tasks/main.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
---
# Private access role.
#
# For Grafana, Prometheus and the Flux UI this role:
#   1. renders a systemd unit from kubectl-port-forward.service.j2,
#   2. reloads systemd and enables/starts the unit,
#   3. publishes the forwarded local port through `tailscale serve`.
#
# Each template result is registered so that a rewritten unit file restarts
# its service; `state: started` alone would leave an already running
# port-forward on the old command line.
# Required vars: private_access_grafana_port, private_access_prometheus_port,
# private_access_flux_port.

- name: Create systemd unit for Grafana private access
  template:
    src: kubectl-port-forward.service.j2
    dest: /etc/systemd/system/k8s-portforward-grafana.service
    mode: "0644"
  vars:
    unit_description: Port-forward Grafana for Tailscale access
    unit_namespace: observability
    unit_target: svc/observability-kube-prometheus-stack-grafana
    unit_local_port: 13080
    unit_remote_port: 80
  register: private_access_grafana_unit

- name: Create systemd unit for Prometheus private access
  template:
    src: kubectl-port-forward.service.j2
    dest: /etc/systemd/system/k8s-portforward-prometheus.service
    mode: "0644"
  vars:
    unit_description: Port-forward Prometheus for Tailscale access
    unit_namespace: observability
    unit_target: svc/observability-kube-prometh-prometheus
    unit_local_port: 19090
    unit_remote_port: 9090
  register: private_access_prometheus_unit

- name: Create systemd unit for Flux UI private access
  template:
    src: kubectl-port-forward.service.j2
    dest: /etc/systemd/system/k8s-portforward-flux-ui.service
    mode: "0644"
  vars:
    unit_description: Port-forward Flux UI for Tailscale access
    unit_namespace: flux-system
    unit_target: svc/flux-system-weave-gitops
    unit_local_port: 19001
    unit_remote_port: 9001
  register: private_access_flux_ui_unit

# Must run before the services are (re)started so systemd sees the new units.
- name: Reload systemd
  systemd:
    daemon_reload: true

- name: Enable and start private access port-forward services
  systemd:
    name: "{{ item.name }}"
    enabled: true
    # Restart only the services whose unit file changed in this run.
    state: "{{ 'restarted' if item.unit is changed else 'started' }}"
  loop:
    - name: k8s-portforward-grafana.service
      unit: "{{ private_access_grafana_unit }}"
    - name: k8s-portforward-prometheus.service
      unit: "{{ private_access_prometheus_unit }}"
    - name: k8s-portforward-flux-ui.service
      unit: "{{ private_access_flux_ui_unit }}"
  loop_control:
    label: "{{ item.name }}"

# `tailscale serve reset` clears the existing serve configuration before the
# three TCP forwards are re-added, so the task always reports a change.
- name: Configure Tailscale Serve for private access endpoints
  shell: >-
    tailscale serve reset &&
    tailscale serve --bg --tcp={{ private_access_grafana_port }} tcp://127.0.0.1:13080 &&
    tailscale serve --bg --tcp={{ private_access_prometheus_port }} tcp://127.0.0.1:19090 &&
    tailscale serve --bg --tcp={{ private_access_flux_port }} tcp://127.0.0.1:19001
  changed_when: true
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
# systemd unit template (Jinja2) that keeps one `kubectl port-forward`
# running, bound to 127.0.0.1 only.
# Template variables: unit_description, unit_namespace, unit_target,
# unit_local_port, unit_remote_port.
[Unit]
Description={{ unit_description }}
Wants=network-online.target
After=network-online.target k3s.service

[Service]
Type=simple
ExecStart=/usr/local/bin/kubectl -n {{ unit_namespace }} port-forward --address 127.0.0.1 {{ unit_target }} {{ unit_local_port }}:{{ unit_remote_port }}
# Restart the forward whenever it exits, after a 5 second pause.
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
|
||||||
@@ -75,19 +75,46 @@
|
|||||||
roles:
|
roles:
|
||||||
- k3s-agent
|
- k3s-agent
|
||||||
|
|
||||||
- name: Deploy Hetzner CCM
|
- name: Bootstrap addon prerequisite secrets
|
||||||
hosts: control_plane[0]
|
hosts: control_plane[0]
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
roles:
|
roles:
|
||||||
- ccm
|
- addon-secrets-bootstrap
|
||||||
|
|
||||||
- name: Deploy Hetzner CSI
|
- name: Deploy observability stack
|
||||||
hosts: control_plane[0]
|
hosts: control_plane[0]
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
roles:
|
roles:
|
||||||
- csi
|
- role: observability
|
||||||
|
when: not (observability_gitops_enabled | default(true) | bool)
|
||||||
|
|
||||||
|
- name: Provision Grafana content
|
||||||
|
hosts: control_plane[0]
|
||||||
|
become: true
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- role: observability-content
|
||||||
|
when: not (observability_gitops_enabled | default(true) | bool)
|
||||||
|
|
||||||
|
- name: Configure private tailnet access
|
||||||
|
hosts: control_plane[0]
|
||||||
|
become: true
|
||||||
|
vars:
|
||||||
|
private_access_grafana_port: 30080
|
||||||
|
private_access_prometheus_port: 30990
|
||||||
|
private_access_flux_port: 30901
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- private-access
|
||||||
|
|
||||||
|
- name: Bootstrap Doppler access for External Secrets
|
||||||
|
hosts: control_plane[0]
|
||||||
|
become: true
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- doppler-bootstrap
|
||||||
|
|
||||||
- name: Finalize
|
- name: Finalize
|
||||||
hosts: localhost
|
hosts: localhost
|
||||||
|
|||||||
3
apps/kustomization.yaml
Normal file
3
apps/kustomization.yaml
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
---
# Kustomize entry point for ./apps. The resource list is intentionally
# empty: no application manifests are registered yet.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
|
||||||
12
clusters/prod/flux-system/gitrepository-platform.yaml
Normal file
12
clusters/prod/flux-system/gitrepository-platform.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
---
# Flux source "platform": tracks the `main` branch of the HetznerTerra
# repository over SSH, polling every minute. Credentials come from the
# `flux-system` secret.
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: platform
  namespace: flux-system
spec:
  url: "ssh://git@64.176.189.59:2222/HomeInfra/HetznerTerra.git"
  ref:
    branch: main
  secretRef:
    name: flux-system
  interval: 1m
|
||||||
6426
clusters/prod/flux-system/gotk-components.yaml
Normal file
6426
clusters/prod/flux-system/gotk-components.yaml
Normal file
File diff suppressed because it is too large
Load Diff
43
clusters/prod/flux-system/gotk-controller-cp1-patches.yaml
Normal file
43
clusters/prod/flux-system/gotk-controller-cp1-patches.yaml
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
---
# Strategic-merge patches for the four Flux controllers. Each patch only adds
# a nodeSelector so the controller pod is scheduled on k8s-cluster-cp-1.

# source-controller
apiVersion: apps/v1
kind: Deployment
metadata:
  name: source-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
---
# kustomize-controller
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kustomize-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
---
# helm-controller
apiVersion: apps/v1
kind: Deployment
metadata:
  name: helm-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
---
# notification-controller
apiVersion: apps/v1
kind: Deployment
metadata:
  name: notification-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
|
||||||
17
clusters/prod/flux-system/kustomization-apps.yaml
Normal file
17
clusters/prod/flux-system/kustomization-apps.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
---
# Flux Kustomization "apps": applies ./apps from the `platform` source once
# the `infrastructure` Kustomization is ready. Currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: apps
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./apps
  dependsOn:
    - name: infrastructure
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: true
|
||||||
14
clusters/prod/flux-system/kustomization-infrastructure.yaml
Normal file
14
clusters/prod/flux-system/kustomization-infrastructure.yaml
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
---
# Flux Kustomization "infrastructure": applies ./infrastructure from the
# `platform` source. `wait` is off, so it does not block on the health of
# the objects it creates.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: infrastructure
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure
  interval: 10m
  timeout: 5m
  prune: true
  wait: false
|
||||||
9
clusters/prod/flux-system/kustomization.yaml
Normal file
9
clusters/prod/flux-system/kustomization.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
---
# Kustomize bundle for the flux-system namespace: the generated Flux
# components, the platform Git source and the two top-level Flux
# Kustomizations, with the controller node-pinning patches applied on top.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - gotk-components.yaml
  - gitrepository-platform.yaml
  - kustomization-infrastructure.yaml
  - kustomization-apps.yaml
# NOTE(review): `patchesStrategicMerge` is deprecated in Kustomize v5 in
# favour of `patches`; migrate once multi-document patch files are verified.
patchesStrategicMerge:
  - gotk-controller-cp1-patches.yaml
|
||||||
4
clusters/prod/kustomization.yaml
Normal file
4
clusters/prod/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
---
# Root Kustomize file for the prod cluster; it only pulls in flux-system.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - flux-system
|
||||||
36
infrastructure/addons/ccm/helmrelease-hcloud-ccm.yaml
Normal file
36
infrastructure/addons/ccm/helmrelease-hcloud-ccm.yaml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
---
# Flux HelmRelease for the hcloud-cloud-controller-manager chart, installed
# into kube-system from the `hcloud` HelmRepository.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: hcloud-cloud-controller-manager
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: kube-system
  chart:
    spec:
      chart: hcloud-cloud-controller-manager
      version: "1.30.1"
      sourceRef:
        kind: HelmRepository
        name: hcloud
        namespace: flux-system
  # Install and upgrade are each retried up to three times.
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    selectorLabels:
      app: hcloud-cloud-controller-manager
    args:
      secure-port: "0"
    networking:
      enabled: true
    # Pin to the first control-plane node and tolerate its NoSchedule taint.
    nodeSelector:
      kubernetes.io/hostname: k8s-cluster-cp-1
    additionalTolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule
|
||||||
8
infrastructure/addons/ccm/helmrepository-hcloud.yaml
Normal file
8
infrastructure/addons/ccm/helmrepository-hcloud.yaml
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
---
# Helm chart repository for Hetzner Cloud charts, re-indexed hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: hcloud
  namespace: flux-system
spec:
  url: "https://charts.hetzner.cloud"
  interval: 1h
|
||||||
5
infrastructure/addons/ccm/kustomization.yaml
Normal file
5
infrastructure/addons/ccm/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
---
# CCM addon bundle: the chart repository first, then the release that uses it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrepository-hcloud.yaml
  - helmrelease-hcloud-ccm.yaml
|
||||||
36
infrastructure/addons/csi/helmrelease-hcloud-csi.yaml
Normal file
36
infrastructure/addons/csi/helmrelease-hcloud-csi.yaml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
---
# Flux HelmRelease for the hcloud-csi chart, installed into kube-system.
# Declares `hcloud-volumes` as the default StorageClass.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: hcloud-csi
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: kube-system
  chart:
    spec:
      chart: hcloud-csi
      version: "2.20.0"
      sourceRef:
        kind: HelmRepository
        name: hcloud
        namespace: flux-system
  # Install and upgrade are each retried up to three times.
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    controller:
      # Pin the controller to the first control-plane node.
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      hcloudVolumeDefaultLocation: nbg1
    storageClasses:
      - name: hcloud-volumes
        defaultStorageClass: true
        reclaimPolicy: Delete
|
||||||
8
infrastructure/addons/csi/helmrepository-hcloud.yaml
Normal file
8
infrastructure/addons/csi/helmrepository-hcloud.yaml
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
---
# Helm chart repository for Hetzner Cloud charts, re-indexed hourly.
# NOTE(review): infrastructure/addons/ccm declares a HelmRepository with the
# same name and namespace; confirm that two pruning Flux Kustomizations
# managing one object is intended.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: hcloud
  namespace: flux-system
spec:
  url: "https://charts.hetzner.cloud"
  interval: 1h
|
||||||
5
infrastructure/addons/csi/kustomization.yaml
Normal file
5
infrastructure/addons/csi/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
---
# CSI addon bundle: the chart repository first, then the release that uses it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrepository-hcloud.yaml
  - helmrelease-hcloud-csi.yaml
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
---
# Cluster-wide secret store backed by Doppler. The service token is read from
# key `dopplerToken` of secret `doppler-hetznerterra-service-token` in the
# external-secrets namespace.
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
  name: doppler-hetznerterra
spec:
  provider:
    doppler:
      auth:
        secretRef:
          dopplerToken:
            namespace: external-secrets
            name: doppler-hetznerterra-service-token
            key: dopplerToken
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
---
# Flux HelmRelease for the external-secrets chart, installed into the
# external-secrets namespace with its CRDs.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: external-secrets
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: external-secrets
  chart:
    spec:
      chart: external-secrets
      version: "2.1.0"
      sourceRef:
        kind: HelmRepository
        name: external-secrets
        namespace: flux-system
  # Install and upgrade are each retried up to three times.
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    installCRDs: true
    # Operator, webhook and cert-controller are all pinned to the same node.
    nodeSelector:
      kubernetes.io/hostname: k8s-cluster-cp-1
    webhook:
      failurePolicy: Ignore
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
    certController:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
    serviceMonitor:
      enabled: false
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
---
# Helm chart repository for External Secrets, re-indexed hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: external-secrets
  namespace: flux-system
spec:
  url: "https://charts.external-secrets.io"
  interval: 1h
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
---
# External Secrets addon bundle: namespace, chart repository, release.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-external-secrets.yaml
  - helmrelease-external-secrets.yaml
|
||||||
4
infrastructure/addons/external-secrets/namespace.yaml
Normal file
4
infrastructure/addons/external-secrets/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
---
# Namespace that hosts the External Secrets release.
apiVersion: v1
kind: Namespace
metadata:
  name: external-secrets
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
---
# ExternalSecret that builds the `cluster-user-auth` secret in flux-system
# from two Doppler keys (admin username and bcrypt password hash),
# refreshed hourly.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: cluster-user-auth
  namespace: flux-system
spec:
  refreshInterval: 1h
  secretStoreRef:
    kind: ClusterSecretStore
    name: doppler-hetznerterra
  # Values fetched from the store; referenced by the target template below.
  data:
    - secretKey: fluxAdminUsername
      remoteRef:
        key: WEAVE_GITOPS_ADMIN_USERNAME
    - secretKey: fluxAdminPasswordHash
      remoteRef:
        key: WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH
  target:
    name: cluster-user-auth
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        username: "{{ .fluxAdminUsername }}"
        password: "{{ .fluxAdminPasswordHash }}"
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
---
# Flux source for the weave-gitops repository, pinned to tag v0.39.0-rc.2
# and checked hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: weave-gitops
  namespace: flux-system
spec:
  url: "https://github.com/weaveworks/weave-gitops"
  ref:
    tag: v0.39.0-rc.2
  interval: 1h
|
||||||
35
infrastructure/addons/flux-ui/helmrelease-weave-gitops.yaml
Normal file
35
infrastructure/addons/flux-ui/helmrelease-weave-gitops.yaml
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
---
# Flux HelmRelease for the gitops-server chart taken from the weave-gitops
# Git source (./charts/gitops-server). The UI is served on ClusterIP port
# 9001.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: weave-gitops
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: flux-system
  chart:
    spec:
      chart: ./charts/gitops-server
      sourceRef:
        kind: GitRepository
        name: weave-gitops
        namespace: flux-system
  # Install and upgrade are each retried up to three times.
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    service:
      type: ClusterIP
      port: 9001
    adminUser:
      create: true
      createClusterRole: true
      # The chart does not create the credentials secret itself.
      createSecret: false
      username: admin
    rbac:
      create: true
      impersonationResourceNames:
        - admin
|
||||||
19
infrastructure/addons/flux-ui/ingress-flux-ui.yaml
Normal file
19
infrastructure/addons/flux-ui/ingress-flux-ui.yaml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
---
# Ingress for the Flux UI: routes every path arriving on Traefik's `flux`
# entrypoint to service flux-system-weave-gitops on port 9001.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: flux-ui
  namespace: flux-system
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: flux
spec:
  ingressClassName: traefik
  rules:
    - http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: flux-system-weave-gitops
                port:
                  number: 9001
|
||||||
8
infrastructure/addons/flux-ui/kustomization.yaml
Normal file
8
infrastructure/addons/flux-ui/kustomization.yaml
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
---
# Flux UI addon bundle: credentials ExternalSecret, chart source, release,
# Traefik entrypoint configuration and the ingress.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - cluster-user-auth-externalsecret.yaml
  - gitrepository-weave-gitops.yaml
  - helmrelease-weave-gitops.yaml
  - traefik-helmchartconfig-flux-entrypoint.yaml
  - ingress-flux-ui.yaml
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
---
# HelmChartConfig for the `traefik` chart in kube-system: passes one extra
# argument that defines a `flux` entrypoint listening on TCP port 9001.
apiVersion: helm.cattle.io/v1
kind: HelmChartConfig
metadata:
  name: traefik
  namespace: kube-system
spec:
  # Embedded values document, passed through as a string.
  valuesContent: |-
    additionalArguments:
      - "--entryPoints.flux.address=:9001/tcp"
|
||||||
15
infrastructure/addons/kustomization-ccm.yaml
Normal file
15
infrastructure/addons/kustomization-ccm.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
---
# Flux Kustomization for the CCM addon (./infrastructure/addons/ccm).
# Currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-ccm
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/ccm
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: true
|
||||||
17
infrastructure/addons/kustomization-csi.yaml
Normal file
17
infrastructure/addons/kustomization-csi.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
---
# Flux Kustomization for the CSI addon (./infrastructure/addons/csi).
# Ordered after addon-ccm. Currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-csi
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/csi
  dependsOn:
    - name: addon-ccm
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: true
|
||||||
15
infrastructure/addons/kustomization-external-secrets.yaml
Normal file
15
infrastructure/addons/kustomization-external-secrets.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
---
# Flux Kustomization for the External Secrets addon
# (./infrastructure/addons/external-secrets). Active (not suspended).
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-external-secrets
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/external-secrets
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: false
|
||||||
17
infrastructure/addons/kustomization-flux-ui.yaml
Normal file
17
infrastructure/addons/kustomization-flux-ui.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
---
# Flux Kustomization for the Flux UI addon
# (./infrastructure/addons/flux-ui). Ordered after addon-external-secrets.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-flux-ui
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/flux-ui
  dependsOn:
    - name: addon-external-secrets
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: false
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
---
# Flux Kustomization for Grafana content
# (./infrastructure/addons/observability-content). Ordered after
# addon-observability.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability-content
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability-content
  dependsOn:
    - name: addon-observability
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: false
|
||||||
17
infrastructure/addons/kustomization-observability.yaml
Normal file
17
infrastructure/addons/kustomization-observability.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
---
# Flux Kustomization for the observability stack
# (./infrastructure/addons/observability). Ordered after
# addon-external-secrets.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability
  dependsOn:
    - name: addon-external-secrets
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: false
|
||||||
15
infrastructure/addons/kustomization-tailscale-operator.yaml
Normal file
15
infrastructure/addons/kustomization-tailscale-operator.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
---
# Flux Kustomization for the Tailscale operator addon
# (./infrastructure/addons/tailscale-operator). Currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-tailscale-operator
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/tailscale-operator
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: true
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
---
# Flux Kustomization for the Tailscale ProxyClass addon
# (./infrastructure/addons/tailscale-proxyclass). Ordered after
# addon-tailscale-operator. Currently suspended.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-tailscale-proxyclass
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/tailscale-proxyclass
  dependsOn:
    - name: addon-tailscale-operator
  interval: 10m
  timeout: 5m
  prune: true
  wait: true
  suspend: true
|
||||||
11
infrastructure/addons/kustomization.yaml
Normal file
11
infrastructure/addons/kustomization.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
# Index of all addon Flux Kustomizations. Ordering between addons is
# expressed by each addon's own `dependsOn`, not by this list.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - kustomization-ccm.yaml
  - kustomization-csi.yaml
  - kustomization-external-secrets.yaml
  - kustomization-flux-ui.yaml
  - kustomization-tailscale-operator.yaml
  - kustomization-tailscale-proxyclass.yaml
  - kustomization-observability.yaml
  - kustomization-observability-content.yaml
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
---
# Grafana dashboard "K8s Cluster Overview", shipped as a ConfigMap carrying
# the `grafana_dashboard` label.
#
# Panels:
#   1. Ready Nodes       - stat; sums the Ready/"true" condition series so
#                          that only nodes whose condition is currently true
#                          (value 1) are counted. count() would count every
#                          series, i.e. all nodes regardless of readiness.
#   2. Cluster CPU Usage - time series of 1 - avg idle CPU rate over 5m.
#
# Both panels reference the datasource through ${DS_PROMETHEUS}; that
# variable is declared under "templating" as a datasource variable of type
# prometheus so the reference resolves when the file is loaded as-is.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-k8s-overview
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
          "targets": [
            {
              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {
        "list": [
          {
            "name": "DS_PROMETHEUS",
            "label": "Prometheus",
            "type": "datasource",
            "query": "prometheus",
            "current": {},
            "hide": 0,
            "refresh": 1
          }
        ]
      },
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
---
# Grafana datasource definitions, shipped as a ConfigMap carrying the
# `grafana_datasource` label. Declares one non-default Loki datasource,
# reached through Grafana's proxy access mode.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources-core
  namespace: observability
  labels:
    grafana_datasource: "1"
data:
  # Embedded Grafana provisioning document.
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: "http://loki.observability.svc.cluster.local:3100"
        isDefault: false
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
---
# Grafana content bundle: datasource ConfigMap and dashboard ConfigMap.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - grafana-datasources-core-configmap.yaml
  - grafana-dashboard-k8s-overview-configmap.yaml
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
---
# ExternalSecret that builds `grafana-admin-credentials` in the observability
# namespace: a fixed `admin` user plus a password pulled from Doppler key
# GRAFANA_ADMIN_PASSWORD, refreshed hourly.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: grafana-admin
  namespace: observability
spec:
  refreshInterval: 1h
  secretStoreRef:
    kind: ClusterSecretStore
    name: doppler-hetznerterra
  # Value fetched from the store; referenced by the target template below.
  data:
    - secretKey: grafanaAdminPassword
      remoteRef:
        key: GRAFANA_ADMIN_PASSWORD
  target:
    name: grafana-admin-credentials
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        admin-user: admin
        admin-password: "{{ .grafanaAdminPassword }}"
|
||||||
17
infrastructure/addons/observability/grafana-ingress.yaml
Normal file
17
infrastructure/addons/observability/grafana-ingress.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
---
# Ingress for Grafana: forwards the /grafana path prefix to the Grafana
# service on port 80 through the traefik ingress class.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  namespace: observability
spec:
  ingressClassName: traefik
  rules:
    - http:
        paths:
          - path: /grafana
            pathType: Prefix
            backend:
              service:
                name: observability-kube-prometheus-stack-grafana
                port:
                  number: 80
|
||||||
@@ -0,0 +1,77 @@
|
|||||||
|
---
# Flux HelmRelease for kube-prometheus-stack, installed into the
# observability namespace. Grafana and Prometheus are enabled with
# persistent local-path storage; Alertmanager and the etcd,
# controller-manager and scheduler monitors are switched off.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: observability
  chart:
    spec:
      chart: kube-prometheus-stack
      version: "68.4.4"
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  # Install and upgrade are each retried up to three times.
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    grafana:
      enabled: true
      # Admin credentials come from an existing secret, not chart values.
      admin:
        existingSecret: grafana-admin-credentials
        userKey: admin-user
        passwordKey: admin-password
      # Grafana is served under the /grafana/ sub-path.
      grafana.ini:
        server:
          root_url: http://observability/grafana/
          serve_from_sub_path: true
      persistence:
        enabled: true
        storageClassName: local-path
        size: 5Gi
      service:
        type: ClusterIP
      # Sidecars load labelled ConfigMaps from the observability namespace.
      sidecar:
        datasources:
          enabled: true
          label: grafana_datasource
          searchNamespace: observability
        dashboards:
          enabled: true
          label: grafana_dashboard
          searchNamespace: observability
    prometheus:
      service:
        type: ClusterIP
      prometheusSpec:
        # Prometheus is served under the /prometheus/ prefix.
        externalUrl: http://observability/prometheus/
        routePrefix: /prometheus/
        retention: 7d
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 10Gi
    alertmanager:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeControllerManager:
      enabled: false
    kubeScheduler:
      enabled: false
    prometheus-node-exporter:
      hostNetwork: false
      service:
        hostPort: false
|
||||||
99
infrastructure/addons/observability/helmrelease-loki.yaml
Normal file
99
infrastructure/addons/observability/helmrelease-loki.yaml
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||||
|
kind: HelmRelease
|
||||||
|
metadata:
|
||||||
|
name: loki
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 10m
|
||||||
|
targetNamespace: observability
|
||||||
|
chart:
|
||||||
|
spec:
|
||||||
|
chart: loki
|
||||||
|
version: 6.10.0
|
||||||
|
sourceRef:
|
||||||
|
kind: HelmRepository
|
||||||
|
name: grafana
|
||||||
|
namespace: flux-system
|
||||||
|
install:
|
||||||
|
createNamespace: true
|
||||||
|
remediation:
|
||||||
|
retries: 3
|
||||||
|
upgrade:
|
||||||
|
remediation:
|
||||||
|
retries: 3
|
||||||
|
values:
|
||||||
|
deploymentMode: SingleBinary
|
||||||
|
loki:
|
||||||
|
auth_enabled: false
|
||||||
|
commonConfig:
|
||||||
|
replication_factor: 1
|
||||||
|
schemaConfig:
|
||||||
|
configs:
|
||||||
|
- from: "2024-04-01"
|
||||||
|
store: tsdb
|
||||||
|
object_store: filesystem
|
||||||
|
schema: v13
|
||||||
|
index:
|
||||||
|
prefix: loki_index_
|
||||||
|
period: 24h
|
||||||
|
storage:
|
||||||
|
type: filesystem
|
||||||
|
limits_config:
|
||||||
|
allow_structured_metadata: true
|
||||||
|
volume_enabled: true
|
||||||
|
retention_period: 168h
|
||||||
|
pattern_ingester:
|
||||||
|
enabled: true
|
||||||
|
ruler:
|
||||||
|
enable_api: true
|
||||||
|
singleBinary:
|
||||||
|
replicas: 1
|
||||||
|
persistence:
|
||||||
|
size: 10Gi
|
||||||
|
storageClass: local-path
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 256Mi
|
||||||
|
limits:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 1Gi
|
||||||
|
backend:
|
||||||
|
replicas: 0
|
||||||
|
read:
|
||||||
|
replicas: 0
|
||||||
|
write:
|
||||||
|
replicas: 0
|
||||||
|
ingester:
|
||||||
|
replicas: 0
|
||||||
|
querier:
|
||||||
|
replicas: 0
|
||||||
|
queryFrontend:
|
||||||
|
replicas: 0
|
||||||
|
queryScheduler:
|
||||||
|
replicas: 0
|
||||||
|
distributor:
|
||||||
|
replicas: 0
|
||||||
|
compactor:
|
||||||
|
replicas: 0
|
||||||
|
indexGateway:
|
||||||
|
replicas: 0
|
||||||
|
bloomCompactor:
|
||||||
|
replicas: 0
|
||||||
|
bloomGateway:
|
||||||
|
replicas: 0
|
||||||
|
gateway:
|
||||||
|
enabled: false
|
||||||
|
test:
|
||||||
|
enabled: false
|
||||||
|
chunksCache:
|
||||||
|
enabled: true
|
||||||
|
allocatedMemory: 128
|
||||||
|
resultsCache:
|
||||||
|
enabled: true
|
||||||
|
allocatedMemory: 128
|
||||||
|
monitoring:
|
||||||
|
selfMonitoring:
|
||||||
|
enabled: false
|
||||||
|
lokiCanary:
|
||||||
|
enabled: false
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||||
|
kind: HelmRelease
|
||||||
|
metadata:
|
||||||
|
name: promtail
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 10m
|
||||||
|
targetNamespace: observability
|
||||||
|
chart:
|
||||||
|
spec:
|
||||||
|
chart: promtail
|
||||||
|
version: 6.16.6
|
||||||
|
sourceRef:
|
||||||
|
kind: HelmRepository
|
||||||
|
name: grafana
|
||||||
|
namespace: flux-system
|
||||||
|
install:
|
||||||
|
createNamespace: true
|
||||||
|
remediation:
|
||||||
|
retries: 3
|
||||||
|
upgrade:
|
||||||
|
remediation:
|
||||||
|
retries: 3
|
||||||
|
values:
|
||||||
|
config:
|
||||||
|
clients:
|
||||||
|
- url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
apiVersion: source.toolkit.fluxcd.io/v1
|
||||||
|
kind: HelmRepository
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 1h
|
||||||
|
url: https://grafana.github.io/helm-charts
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
apiVersion: source.toolkit.fluxcd.io/v1
|
||||||
|
kind: HelmRepository
|
||||||
|
metadata:
|
||||||
|
name: prometheus-community
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 1h
|
||||||
|
url: https://prometheus-community.github.io/helm-charts
|
||||||
13
infrastructure/addons/observability/kustomization.yaml
Normal file
13
infrastructure/addons/observability/kustomization.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
resources:
|
||||||
|
- namespace.yaml
|
||||||
|
- grafana-admin-externalsecret.yaml
|
||||||
|
- traefik-tailscale-service.yaml
|
||||||
|
- grafana-ingress.yaml
|
||||||
|
- prometheus-ingress.yaml
|
||||||
|
- helmrepository-prometheus-community.yaml
|
||||||
|
- helmrepository-grafana.yaml
|
||||||
|
- helmrelease-kube-prometheus-stack.yaml
|
||||||
|
- helmrelease-loki.yaml
|
||||||
|
- helmrelease-promtail.yaml
|
||||||
4
infrastructure/addons/observability/namespace.yaml
Normal file
4
infrastructure/addons/observability/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: observability
|
||||||
17
infrastructure/addons/observability/prometheus-ingress.yaml
Normal file
17
infrastructure/addons/observability/prometheus-ingress.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: Ingress
|
||||||
|
metadata:
|
||||||
|
name: prometheus
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
ingressClassName: traefik
|
||||||
|
rules:
|
||||||
|
- http:
|
||||||
|
paths:
|
||||||
|
- path: /prometheus
|
||||||
|
pathType: Prefix
|
||||||
|
backend:
|
||||||
|
service:
|
||||||
|
name: observability-kube-prometh-prometheus
|
||||||
|
port:
|
||||||
|
number: 9090
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: traefik-tailscale
|
||||||
|
namespace: kube-system
|
||||||
|
annotations:
|
||||||
|
tailscale.com/hostname: observability
|
||||||
|
tailscale.com/proxy-class: infra-stable
|
||||||
|
spec:
|
||||||
|
type: LoadBalancer
|
||||||
|
loadBalancerClass: tailscale
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/instance: traefik-kube-system
|
||||||
|
app.kubernetes.io/name: traefik
|
||||||
|
ports:
|
||||||
|
- name: web
|
||||||
|
port: 80
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: web
|
||||||
|
- name: websecure
|
||||||
|
port: 443
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: websecure
|
||||||
|
- name: flux
|
||||||
|
port: 9001
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 9001
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||||
|
kind: HelmRelease
|
||||||
|
metadata:
|
||||||
|
name: tailscale-operator
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 10m
|
||||||
|
targetNamespace: tailscale-system
|
||||||
|
chart:
|
||||||
|
spec:
|
||||||
|
chart: tailscale-operator
|
||||||
|
version: 1.95.91
|
||||||
|
sourceRef:
|
||||||
|
kind: HelmRepository
|
||||||
|
name: tailscale
|
||||||
|
namespace: flux-system
|
||||||
|
install:
|
||||||
|
createNamespace: true
|
||||||
|
remediation:
|
||||||
|
retries: 3
|
||||||
|
upgrade:
|
||||||
|
remediation:
|
||||||
|
retries: 3
|
||||||
|
values:
|
||||||
|
installCRDs: true
|
||||||
|
apiServerProxyConfig:
|
||||||
|
mode: "true"
|
||||||
|
operatorConfig:
|
||||||
|
defaultTags:
|
||||||
|
- tag:k8s
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/hostname: k8s-cluster-cp-1
|
||||||
|
tolerations:
|
||||||
|
- key: node-role.kubernetes.io/control-plane
|
||||||
|
operator: Exists
|
||||||
|
effect: NoSchedule
|
||||||
|
proxyConfig:
|
||||||
|
defaultTags: tag:k8s
|
||||||
|
defaultProxyClass: infra-stable
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
apiVersion: source.toolkit.fluxcd.io/v1
|
||||||
|
kind: HelmRepository
|
||||||
|
metadata:
|
||||||
|
name: tailscale
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 1h
|
||||||
|
url: https://pkgs.tailscale.com/unstable/helmcharts
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
resources:
|
||||||
|
- namespace.yaml
|
||||||
|
- helmrepository-tailscale.yaml
|
||||||
|
- helmrelease-tailscale-operator.yaml
|
||||||
4
infrastructure/addons/tailscale-operator/namespace.yaml
Normal file
4
infrastructure/addons/tailscale-operator/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: tailscale-system
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
resources:
|
||||||
|
- proxyclass-infra-stable.yaml
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
apiVersion: tailscale.com/v1alpha1
|
||||||
|
kind: ProxyClass
|
||||||
|
metadata:
|
||||||
|
name: infra-stable
|
||||||
|
spec:
|
||||||
|
statefulSet:
|
||||||
|
pod:
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/hostname: k8s-cluster-cp-1
|
||||||
|
tolerations:
|
||||||
|
- key: node-role.kubernetes.io/control-plane
|
||||||
|
operator: Exists
|
||||||
|
effect: NoSchedule
|
||||||
4
infrastructure/kustomization.yaml
Normal file
4
infrastructure/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
resources:
|
||||||
|
- addons
|
||||||
@@ -25,7 +25,7 @@ variable "cluster_name" {
|
|||||||
variable "control_plane_count" {
|
variable "control_plane_count" {
|
||||||
description = "Number of control plane nodes"
|
description = "Number of control plane nodes"
|
||||||
type = number
|
type = number
|
||||||
default = 3
|
default = 1
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "control_plane_type" {
|
variable "control_plane_type" {
|
||||||
@@ -37,7 +37,7 @@ variable "control_plane_type" {
|
|||||||
variable "worker_count" {
|
variable "worker_count" {
|
||||||
description = "Number of worker nodes"
|
description = "Number of worker nodes"
|
||||||
type = number
|
type = number
|
||||||
default = 4
|
default = 2
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "worker_type" {
|
variable "worker_type" {
|
||||||
|
|||||||
Reference in New Issue
Block a user