fix: surface CSI rollout diagnostics before failing
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m23s

This commit is contained in:
2026-03-01 18:42:03 +00:00
parent 6593adbea6
commit 4cefd7df40

View File

@@ -19,6 +19,7 @@
until: csi_controller_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Show CSI controller status on failure
@@ -26,28 +27,35 @@
register: csi_controller_deploy_status
changed_when: false
failed_when: false
when: csi_controller_rollout is failed
when: csi_controller_rollout.rc != 0
# Diagnostic step: capture the CSI controller pod listing (wide output) so it
# can be reported later via csi_controller_pods_status.
- name: Show CSI controller pods on failure
  command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide
  register: csi_controller_pods_status
  # Read-only query: never report "changed", and never let a failing
  # diagnostic command abort the play.
  changed_when: false
  failed_when: false
  # Single condition only. The previous duplicate `when: ... is failed` key was
  # dead: a task declared with `failed_when: false` is never marked failed, so
  # the return code is the signal that has to be checked.
  when: csi_controller_rollout.rc != 0
# Diagnostic step: record `kubectl describe` output for the CSI controller
# deployment in csi_controller_deploy_describe.
- name: Describe CSI controller deployment on failure
  when: csi_controller_rollout.rc != 0
  command:
    cmd: kubectl -n kube-system describe deployment hcloud-csi-controller
  register: csi_controller_deploy_describe
  # Best-effort and read-only: the task neither fails nor reports a change.
  failed_when: false
  changed_when: false
# Diagnostic step: capture the tail of the hcloud-csi-driver container logs
# from the controller deployment into csi_driver_logs. The number of lines is
# taken from csi_failure_log_tail_lines (defined outside this excerpt).
- name: Show CSI driver logs on failure
  # Folded scalar: the lines below are joined with single spaces into the same
  # one-line command as before.
  command: >-
    kubectl -n kube-system logs deployment/hcloud-csi-controller
    -c hcloud-csi-driver
    --tail={{ csi_failure_log_tail_lines }}
  register: csi_driver_logs
  # Read-only query: never report "changed", and never let a failing
  # diagnostic command abort the play.
  changed_when: false
  failed_when: false
  # Single condition only. The previous duplicate `when: ... is failed` key was
  # dead: a task declared with `failed_when: false` is never marked failed, so
  # the return code is the signal that has to be checked.
  when: csi_controller_rollout.rc != 0
# Diagnostic step: capture kube-system events sorted by .lastTimestamp into
# csi_recent_events.
- name: Show recent kube-system events on failure
  command: kubectl -n kube-system get events --sort-by=.lastTimestamp
  register: csi_recent_events
  # Read-only query: never report "changed", and never let a failing
  # diagnostic command abort the play.
  changed_when: false
  failed_when: false
  # Single condition only. The previous duplicate `when: ... is failed` key was
  # dead: a task declared with `failed_when: false` is never marked failed, so
  # the return code is the signal that has to be checked.
  when: csi_controller_rollout.rc != 0
- name: Fail with CSI controller diagnostics
fail:
@@ -59,12 +67,15 @@
Pods status:
{{ csi_controller_pods_status.stdout | default('n/a') }}
Deployment describe:
{{ csi_controller_deploy_describe.stdout | default('n/a') }}
hcloud-csi-driver logs:
{{ csi_driver_logs.stdout | default('n/a') }}
Recent kube-system events:
{{ csi_recent_events.stdout | default('n/a') }}
when: csi_controller_rollout is failed
when: csi_controller_rollout.rc != 0
- name: Wait for CSI node daemonset rollout
command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
@@ -72,4 +83,10 @@
until: csi_node_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
# Abort the play when the recorded csi_node_rollout result has a non-zero
# return code, echoing that command's stdout and stderr in the error.
- name: Fail when CSI node daemonset rollout does not complete
  when: csi_node_rollout.rc != 0
  fail:
    # Folded scalar: the three lines are joined with single spaces, producing
    # the same one-line message as the quoted form.
    msg: >-
      CSI node daemonset rollout failed:
      {{ csi_node_rollout.stdout | default('') }}
      {{ csi_node_rollout.stderr | default('') }}