fix: harden cluster rebuild determinism
Deploy Grafana Content / Grafana Content (push) Failing after 1m14s
Deploy Cluster / Terraform (push) Failing after 4m59s
Deploy Cluster / Ansible (push) Has been skipped

This commit is contained in:
2026-04-30 07:36:27 +00:00
parent f52e657f9f
commit a33a993867
38 changed files with 865 additions and 289 deletions
@@ -21,14 +21,3 @@
register: bootstrap_image_pull
loop: "{{ bootstrap_prepull_images }}"
changed_when: "'pulled image' in bootstrap_image_pull.stdout"
failed_when: false
- name: Report bootstrap images that did not pre-pull after retries
debug:
msg: >-
Best-effort bootstrap image pre-pull did not complete for {{ item.item }} after
3 attempt(s): {{ item.stderr | default('no stderr') }}
loop: "{{ bootstrap_image_pull.results | default([]) }}"
loop_control:
label: "{{ item.item }}"
when: item.rc is defined and item.rc != 0
+9
View File
@@ -95,6 +95,10 @@
# Install the Tailscale client with the upstream installer script.
#
# The script is downloaded to a file first and executed only if curl succeeds.
# In a plain `curl ... | sh` pipeline the task's rc is the exit status of `sh`
# alone; when the download fails, `sh` reads empty input and exits 0, so the
# `until` condition below would be satisfied immediately and the retries would
# never run. Chaining with `&&` makes a failed (or partial) download produce a
# non-zero rc, which is what the retry loop keys on.
#
# Runs only when an auth key is configured and the earlier binary probe
# (registered as tailscale_binary) returned non-zero.
- name: Install tailscale
  shell: >-
    curl -fsSL https://tailscale.com/install.sh -o /tmp/tailscale-install.sh
    && sh /tmp/tailscale-install.sh
  register: tailscale_install
  until: tailscale_install.rc == 0
  retries: 5
  delay: 15
  when:
    - tailscale_auth_key | length > 0
    - tailscale_binary.rc != 0
@@ -117,6 +121,11 @@
- name: Connect node to tailnet
command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
register: tailscale_up
until: tailscale_up.rc == 0
retries: 5
delay: 15
no_log: true
when:
- tailscale_auth_key | length > 0
- tailscale_status.rc != 0 or (tailscale_backend_state | default('')) != 'Running'
+14 -1
View File
@@ -32,11 +32,22 @@
url: https://get.k3s.io
dest: /tmp/install-k3s.sh
mode: "0755"
register: k3s_agent_install_script
until: k3s_agent_install_script is succeeded
retries: 5
delay: 10
when: k3s_agent_install_needed
- name: Install k3s agent
when: k3s_agent_install_needed
block:
# Block until TCP port 6443 on the server host accepts connections, giving up
# after 180 seconds. The host is derived from k3s_server_url by stripping the
# http(s):// scheme and everything from the first ':' or '/' onward.
- name: Wait for Kubernetes API endpoint before agent join
wait_for:
# regex capture group 1 is the bare host portion of the URL
host: "{{ k3s_server_url | regex_replace('^https?://([^:/]+).*$', '\\1') }}"
port: 6443
state: started
timeout: 180
- name: Run k3s agent install
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
@@ -48,7 +59,9 @@
--flannel-iface={{ k3s_flannel_iface }}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: k3s_agent_install
failed_when: false
until: k3s_agent_install.rc == 0
retries: 3
delay: 20
- name: Wait for k3s agent to be ready
command: systemctl is-active k3s-agent
+10 -2
View File
@@ -62,6 +62,10 @@
url: https://get.k3s.io
dest: /tmp/install-k3s.sh
mode: "0755"
register: k3s_install_script
until: k3s_install_script is succeeded
retries: 5
delay: 10
when: k3s_install_needed
- name: Install k3s server (primary)
@@ -82,7 +86,9 @@
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: primary_install
failed_when: false
until: primary_install.rc == 0
retries: 3
delay: 20
when:
- k3s_install_needed
- k3s_primary | default(false)
@@ -106,7 +112,9 @@
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: secondary_install
failed_when: false
until: secondary_install.rc == 0
retries: 3
delay: 20
- name: Wait for k3s to be ready
command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}"
+4 -11
View File
@@ -40,17 +40,6 @@
register: kube_vip_image_pull
loop: "{{ kube_vip_prepull_images }}"
changed_when: "'pulled image' in kube_vip_image_pull.stdout"
failed_when: false
- name: Report kube-vip images that did not pre-pull after retries
debug:
msg: >-
Best-effort kube-vip image pre-pull did not complete for {{ item.item }} after
3 attempt(s): {{ item.stderr | default('no stderr') }}
loop: "{{ kube_vip_image_pull.results | default([]) }}"
loop_control:
label: "{{ item.item }}"
when: item.rc is defined and item.rc != 0
- name: Render kube-vip control plane manifest
template:
@@ -60,6 +49,10 @@
- name: Apply kube-vip control plane manifest
command: kubectl apply -f /tmp/kube-vip-control-plane.yaml
register: kube_vip_apply
until: kube_vip_apply.rc == 0
retries: 3
delay: 10
changed_when: true
- name: Wait for local kube-vip pod to be ready
@@ -105,6 +105,11 @@
register: grafana_loki_labels
changed_when: false
failed_when: false
until: >-
grafana_loki_labels.rc != 0 or
'"data":[]' not in (grafana_loki_labels.stdout | replace(' ', ''))
retries: 30
delay: 10
when: loki_enabled
- name: Fail when Loki is reachable but has zero indexed labels
@@ -21,14 +21,3 @@
register: rancher_image_pull
loop: "{{ rancher_images_to_prepull }}"
changed_when: "'pulled image' in rancher_image_pull.stdout"
failed_when: false
- name: Report Rancher images that did not pre-pull after retries
debug:
msg: >-
Best-effort Rancher image pre-pull did not complete for {{ item.item }} after
3 attempt(s): {{ item.stderr | default('no stderr') }}
loop: "{{ rancher_image_pull.results | default([]) }}"
loop_control:
label: "{{ item.item }}"
when: item.rc is defined and item.rc != 0
@@ -9,6 +9,9 @@
Authorization: "Bearer {{ tailscale_api_key }}"
return_content: true
register: ts_devices
until: ts_devices.status == 200
retries: 5
delay: 10
- name: Find stale devices matching reserved hostnames
set_fact:
@@ -34,6 +37,10 @@
headers:
Authorization: "Bearer {{ tailscale_api_key }}"
status_code: 200
register: ts_delete_device
until: ts_delete_device.status == 200
retries: 3
delay: 5
loop: "{{ stale_devices }}"
loop_control:
label: "{{ item.name }} ({{ item.id }})"
+37
View File
@@ -76,6 +76,18 @@
roles:
- kube-vip-deploy
# Play: pause the run until the Kubernetes API answers /readyz at
# kube_api_endpoint:6443. Targets only the first host in the control_plane
# group and runs with privilege escalation.
# NOTE(review): kube_api_endpoint is presumably the kube-vip address, as the
# task name suggests - confirm against inventory/group vars.
- name: Wait for Kubernetes API VIP readiness
hosts: control_plane[0]
become: true
tasks:
# Re-run the kubectl probe until it exits 0 (retries: 30, 10 s apart).
- name: Wait for Kubernetes readyz through the VIP
command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
register: api_readyz
until: api_readyz.rc == 0
retries: 30
delay: 10
# Read-only probe: never report this task as "changed".
changed_when: false
- name: Setup secondary control planes
hosts: control_plane[1:]
become: true
@@ -123,6 +135,31 @@
- name: Import kube-vip image into containerd
command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
register: kube_vip_secondary_import
until: kube_vip_secondary_import.rc == 0
retries: 3
delay: 10
changed_when: false
# Play: readiness checks run from the first control_plane host, with privilege
# escalation. First every control plane node must report Ready, then the API
# must answer /readyz at kube_api_endpoint:6443.
- name: Wait for all control plane nodes to be Ready
hosts: control_plane[0]
become: true
tasks:
# One `kubectl wait` per member of groups['control_plane']; each call waits up
# to 30 s for the Ready condition and is retried (retries: 20, 15 s apart)
# until it exits 0.
# NOTE(review): this assumes inventory hostnames match Kubernetes node names -
# confirm.
- name: Wait for control plane node readiness
command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
register: control_plane_ready
until: control_plane_ready.rc == 0
retries: 20
delay: 15
# Read-only check: never report this task as "changed".
changed_when: false
loop: "{{ groups['control_plane'] }}"
# Re-run the /readyz probe until kubectl exits 0 (retries: 30, 10 s apart).
- name: Wait for Kubernetes readyz before worker joins
command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
register: api_readyz_before_workers
until: api_readyz_before_workers.rc == 0
retries: 30
delay: 10
# Read-only probe: never report this task as "changed".
changed_when: false
- name: Setup workers