fix: rely on k3s service readiness instead of installer exit code
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 8m9s

The k3s install script can return non-zero while systemd is still bringing the
service up, especially on worker agents. Do not fail immediately on the
installer command; wait for the service to become active and only emit
install diagnostics if the later readiness check fails.
This commit is contained in:
2026-04-22 04:14:31 +00:00
parent b3e88712bd
commit d1c31cdb91
2 changed files with 35 additions and 50 deletions
+33 -22
View File
@@ -24,30 +24,10 @@
--node-ip {{ k3s_node_ip }}
--flannel-iface={{ k3s_flannel_iface }}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: k3s_agent_install
failed_when: false
args:
creates: /usr/local/bin/k3s-agent
rescue:
- name: Show k3s-agent service status after failed install
command: systemctl status k3s-agent --no-pager
register: k3s_agent_status_after_install
changed_when: false
failed_when: false
- name: Show recent k3s-agent logs after failed install
command: journalctl -u k3s-agent -n 120 --no-pager
register: k3s_agent_journal_after_install
changed_when: false
failed_when: false
- name: Fail with k3s-agent diagnostics
fail:
msg: |
k3s agent install failed on {{ inventory_hostname }}.
Service status:
{{ k3s_agent_status_after_install.stdout | default('n/a') }}
Recent logs:
{{ k3s_agent_journal_after_install.stdout | default('n/a') }}
- name: Wait for k3s agent to be ready
command: systemctl is-active k3s-agent
@@ -56,3 +36,34 @@
retries: 30
delay: 10
changed_when: false
# Diagnostic step: run `systemctl status` for the k3s-agent unit and keep the
# result in k3s_agent_status so a later task can print it.
# changed_when/failed_when are both false, so this step never reports a change
# and a non-zero exit from systemctl does not fail the play here.
# Runs only when the registered agent_status result is failed.
# NOTE(review): agent_status is registered outside this view. This step is only
# reached if that earlier task lets the host continue after failing (for
# example via ignore_errors) — confirm, otherwise this is never executed.
- name: Show k3s-agent service status on failure
command: systemctl status k3s-agent --no-pager
register: k3s_agent_status
changed_when: false
failed_when: false
when: agent_status is failed
# Diagnostic step: read the k3s-agent unit's journal with `-n 120` and keep the
# result in k3s_agent_journal so a later task can print it.
# changed_when/failed_when are both false, so this step is purely informational
# and cannot itself fail the play.
# Runs only when the registered agent_status result is failed.
# NOTE(review): agent_status is registered outside this view; confirm that the
# task registering it does not abort the host before this step can run.
- name: Show recent k3s-agent logs on failure
command: journalctl -u k3s-agent -n 120 --no-pager
register: k3s_agent_journal
changed_when: false
failed_when: false
when: agent_status is failed
# Final failure step: abort with a single message that bundles the installer
# output (k3s_agent_install stdout/stderr), the service status
# (k3s_agent_status) and the recent journal (k3s_agent_journal).
# Each value goes through default('n/a'), so an undefined stdout/stderr
# attribute renders as "n/a" instead of breaking the template.
# Runs only when the registered agent_status result is failed.
# NOTE(review): agent_status is registered outside this view; confirm that the
# task registering it keeps the host in the play on failure, otherwise this
# message is never emitted.
- name: Fail with k3s-agent diagnostics
fail:
msg: |
k3s agent failed to become ready on {{ inventory_hostname }}.
Install stdout:
{{ k3s_agent_install.stdout | default('n/a') }}
Install stderr:
{{ k3s_agent_install.stderr | default('n/a') }}
Service status:
{{ k3s_agent_status.stdout | default('n/a') }}
Recent logs:
{{ k3s_agent_journal.stdout | default('n/a') }}
when: agent_status is failed
+2 -28
View File
@@ -69,6 +69,8 @@
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: primary_install
failed_when: false
when:
- k3s_install_needed
- k3s_primary | default(false)
@@ -92,36 +94,8 @@
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: secondary_install
rescue:
- name: Show k3s service status after failed secondary install
command: systemctl status k3s --no-pager
register: k3s_status_after_install
changed_when: false
failed_when: false
- name: Show recent k3s logs after failed secondary install
command: journalctl -u k3s -n 120 --no-pager
register: k3s_journal_after_install
changed_when: false
failed_when: false
- name: Fail with secondary install diagnostics
fail:
msg: |
Secondary k3s install failed on {{ inventory_hostname }}.
Install stdout:
{{ secondary_install.stdout | default('n/a') }}
Install stderr:
{{ secondary_install.stderr | default('n/a') }}
Service status:
{{ k3s_status_after_install.stdout | default('n/a') }}
Recent logs:
{{ k3s_journal_after_install.stdout | default('n/a') }}
- name: Wait for k3s to be ready
command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}"
register: k3s_ready