fix: harden cluster rebuild determinism
This commit is contained in:
@@ -21,14 +21,3 @@
|
||||
register: bootstrap_image_pull
|
||||
loop: "{{ bootstrap_prepull_images }}"
|
||||
changed_when: "'pulled image' in bootstrap_image_pull.stdout"
|
||||
failed_when: false
|
||||
|
||||
# Print one message per bootstrap image whose best-effort pre-pull ended
# with a non-zero return code. This task only reports via debug.
- name: Report bootstrap images that did not pre-pull after retries
  debug:
    # default(..., true) also substitutes the placeholder when stderr is
    # present but empty, so the message never ends in a blank reason.
    msg: >-
      Best-effort bootstrap image pre-pull did not complete for {{ item.item }} after
      3 attempt(s): {{ item.stderr | default('no stderr', true) }}
  loop: "{{ bootstrap_image_pull.results | default([]) }}"
  loop_control:
    label: "{{ item.item }}"
  # Results that carry no rc are ignored rather than reported.
  when: item.rc is defined and item.rc != 0
|
||||
|
||||
@@ -95,6 +95,10 @@
|
||||
|
||||
# Run the upstream Tailscale installer, retrying on failure, when an auth key
# is configured and tailscale_binary reported a non-zero rc.
- name: Install tailscale
  # pipefail propagates a curl failure to the pipeline's exit status. Without
  # it, a failed download leaves `sh` reading empty input and exiting 0, so
  # the `until` condition is met immediately and the retries never run.
  shell: set -o pipefail && curl -fsSL https://tailscale.com/install.sh | sh
  args:
    # `set -o pipefail` is not guaranteed by POSIX sh; NOTE(review): assumes
    # /bin/bash exists on the target hosts - confirm.
    executable: /bin/bash
  register: tailscale_install
  until: tailscale_install.rc == 0
  retries: 5
  delay: 15
  when:
    - tailscale_auth_key | length > 0
    - tailscale_binary.rc != 0
|
||||
@@ -117,6 +121,11 @@
|
||||
|
||||
# Join the node to the tailnet with `tailscale up`, retrying until the
# command exits 0. Runs only when an auth key is set and the node is not
# already reported as Running.
- name: Connect node to tailnet
  when:
    - tailscale_auth_key | length > 0
    - tailscale_status.rc != 0 or (tailscale_backend_state | default('')) != 'Running'
  # The command line carries the auth key, so task output is suppressed.
  no_log: true
  command:
    # Folded scalar: the lines below join with single spaces into one command.
    cmd: >-
      tailscale up
      --authkey {{ tailscale_auth_key }}
      --hostname {{ inventory_hostname }}
      --ssh={{ tailscale_ssh | ternary('true', 'false') }}
      --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
  register: tailscale_up
  retries: 5
  delay: 15
  until: tailscale_up.rc == 0
|
||||
|
||||
@@ -32,11 +32,22 @@
|
||||
url: https://get.k3s.io
|
||||
dest: /tmp/install-k3s.sh
|
||||
mode: "0755"
|
||||
register: k3s_agent_install_script
|
||||
until: k3s_agent_install_script is succeeded
|
||||
retries: 5
|
||||
delay: 10
|
||||
when: k3s_agent_install_needed
|
||||
|
||||
- name: Install k3s agent
|
||||
when: k3s_agent_install_needed
|
||||
block:
|
||||
# Wait up to 180 seconds for port 6443 to accept connections on the host
# taken from k3s_server_url.
- name: Wait for Kubernetes API endpoint before agent join
  wait_for:
    state: started
    timeout: 180
    port: 6443
    # Strip the scheme and anything after the host (port, path) from the URL.
    host: "{{ k3s_server_url | regex_replace('^https?://([^:/]+).*$', '\\1') }}"
|
||||
|
||||
- name: Run k3s agent install
|
||||
environment:
|
||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||
@@ -48,7 +59,9 @@
|
||||
--flannel-iface={{ k3s_flannel_iface }}
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
register: k3s_agent_install
|
||||
failed_when: false
|
||||
until: k3s_agent_install.rc == 0
|
||||
retries: 3
|
||||
delay: 20
|
||||
|
||||
- name: Wait for k3s agent to be ready
|
||||
command: systemctl is-active k3s-agent
|
||||
|
||||
@@ -62,6 +62,10 @@
|
||||
url: https://get.k3s.io
|
||||
dest: /tmp/install-k3s.sh
|
||||
mode: "0755"
|
||||
register: k3s_install_script
|
||||
until: k3s_install_script is succeeded
|
||||
retries: 5
|
||||
delay: 10
|
||||
when: k3s_install_needed
|
||||
|
||||
- name: Install k3s server (primary)
|
||||
@@ -82,7 +86,9 @@
|
||||
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
register: primary_install
|
||||
failed_when: false
|
||||
until: primary_install.rc == 0
|
||||
retries: 3
|
||||
delay: 20
|
||||
when:
|
||||
- k3s_install_needed
|
||||
- k3s_primary | default(false)
|
||||
@@ -106,7 +112,9 @@
|
||||
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||
register: secondary_install
|
||||
failed_when: false
|
||||
until: secondary_install.rc == 0
|
||||
retries: 3
|
||||
delay: 20
|
||||
|
||||
- name: Wait for k3s to be ready
|
||||
command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}"
|
||||
|
||||
@@ -40,17 +40,6 @@
|
||||
register: kube_vip_image_pull
|
||||
loop: "{{ kube_vip_prepull_images }}"
|
||||
changed_when: "'pulled image' in kube_vip_image_pull.stdout"
|
||||
failed_when: false
|
||||
|
||||
# Print one message per kube-vip image whose best-effort pre-pull ended
# with a non-zero return code. This task only reports via debug.
- name: Report kube-vip images that did not pre-pull after retries
  debug:
    # default(..., true) also substitutes the placeholder when stderr is
    # present but empty, so the message never ends in a blank reason.
    msg: >-
      Best-effort kube-vip image pre-pull did not complete for {{ item.item }} after
      3 attempt(s): {{ item.stderr | default('no stderr', true) }}
  loop: "{{ kube_vip_image_pull.results | default([]) }}"
  loop_control:
    label: "{{ item.item }}"
  # Results that carry no rc are ignored rather than reported.
  when: item.rc is defined and item.rc != 0
|
||||
|
||||
- name: Render kube-vip control plane manifest
|
||||
template:
|
||||
@@ -60,6 +49,10 @@
|
||||
|
||||
# Apply the kube-vip manifest at /tmp/kube-vip-control-plane.yaml with
# kubectl, retrying until the command exits 0.
- name: Apply kube-vip control plane manifest
  command:
    cmd: kubectl apply -f /tmp/kube-vip-control-plane.yaml
  register: kube_vip_apply
  # Always reported as changed, regardless of kubectl's output.
  changed_when: true
  retries: 3
  delay: 10
  until: kube_vip_apply.rc == 0
|
||||
|
||||
- name: Wait for local kube-vip pod to be ready
|
||||
|
||||
@@ -105,6 +105,11 @@
|
||||
register: grafana_loki_labels
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
until: >-
|
||||
grafana_loki_labels.rc != 0 or
|
||||
'"data":[]' not in (grafana_loki_labels.stdout | replace(' ', ''))
|
||||
retries: 30
|
||||
delay: 10
|
||||
when: loki_enabled
|
||||
|
||||
- name: Fail when Loki is reachable but has zero indexed labels
|
||||
|
||||
@@ -21,14 +21,3 @@
|
||||
register: rancher_image_pull
|
||||
loop: "{{ rancher_images_to_prepull }}"
|
||||
changed_when: "'pulled image' in rancher_image_pull.stdout"
|
||||
failed_when: false
|
||||
|
||||
# Print one message per Rancher image whose best-effort pre-pull ended
# with a non-zero return code. This task only reports via debug.
- name: Report Rancher images that did not pre-pull after retries
  debug:
    # default(..., true) also substitutes the placeholder when stderr is
    # present but empty, so the message never ends in a blank reason.
    msg: >-
      Best-effort Rancher image pre-pull did not complete for {{ item.item }} after
      3 attempt(s): {{ item.stderr | default('no stderr', true) }}
  loop: "{{ rancher_image_pull.results | default([]) }}"
  loop_control:
    label: "{{ item.item }}"
  # Results that carry no rc are ignored rather than reported.
  when: item.rc is defined and item.rc != 0
|
||||
|
||||
@@ -9,6 +9,9 @@
|
||||
Authorization: "Bearer {{ tailscale_api_key }}"
|
||||
return_content: true
|
||||
register: ts_devices
|
||||
until: ts_devices.status == 200
|
||||
retries: 5
|
||||
delay: 10
|
||||
|
||||
- name: Find stale devices matching reserved hostnames
|
||||
set_fact:
|
||||
@@ -34,6 +37,10 @@
|
||||
headers:
|
||||
Authorization: "Bearer {{ tailscale_api_key }}"
|
||||
status_code: 200
|
||||
register: ts_delete_device
|
||||
until: ts_delete_device.status == 200
|
||||
retries: 3
|
||||
delay: 5
|
||||
loop: "{{ stale_devices }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }} ({{ item.id }})"
|
||||
|
||||
Reference in New Issue
Block a user