diff --git a/ansible/roles/ccm/defaults/main.yml b/ansible/roles/ccm/defaults/main.yml
index af33d94..26d72ce 100644
--- a/ansible/roles/ccm/defaults/main.yml
+++ b/ansible/roles/ccm/defaults/main.yml
@@ -1,3 +1,4 @@
 ---
 hcloud_token: ""
 cluster_name: "k8s-cluster"
+hcloud_lb_location: "nbg1"
diff --git a/ansible/roles/ccm/tasks/main.yml b/ansible/roles/ccm/tasks/main.yml
index 49f10a7..2fcb60f 100644
--- a/ansible/roles/ccm/tasks/main.yml
+++ b/ansible/roles/ccm/tasks/main.yml
@@ -49,6 +49,31 @@
     delay: 10
   when: ccm_workload_kind.stdout == "daemonset"
 
+- name: Set default Hetzner load balancer location for Traefik service
+  command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite
+  register: traefik_annotation
+  changed_when: true
+  failed_when: false
+
+- name: Show Traefik service when annotation patch fails
+  command: kubectl -n kube-system get service traefik -o yaml
+  register: traefik_service_dump
+  changed_when: false
+  failed_when: false
+  when: traefik_annotation.rc != 0
+
+- name: Fail when Traefik load balancer annotation cannot be set
+  fail:
+    msg: |
+      Failed to set Hetzner load balancer location annotation on kube-system/traefik service.
+
+      Command output:
+      {{ traefik_annotation.stderr | default(traefik_annotation.stdout) }}
+
+      Service dump:
+      {{ traefik_service_dump.stdout | default('n/a') }}
+  when: traefik_annotation.rc != 0
+
 - name: Show CCM namespace objects when workload missing
   command: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true
   register: ccm_ns_objects
diff --git a/ansible/roles/csi/defaults/main.yml b/ansible/roles/csi/defaults/main.yml
index 77fa931..740fc19 100644
--- a/ansible/roles/csi/defaults/main.yml
+++ b/ansible/roles/csi/defaults/main.yml
@@ -6,3 +6,6 @@ csi_rollout_timeout_seconds: 30
 csi_rollout_retries: 8
 csi_rollout_delay_seconds: 5
 csi_failure_log_tail_lines: 120
+csi_smoke_test_enabled: true
+csi_smoke_test_storage_class: "hcloud-volumes"
+csi_smoke_test_size: "1Gi"
diff --git a/ansible/roles/csi/tasks/main.yml b/ansible/roles/csi/tasks/main.yml
index 6faa437..c360d4e 100644
--- a/ansible/roles/csi/tasks/main.yml
+++ b/ansible/roles/csi/tasks/main.yml
@@ -70,6 +70,31 @@
   failed_when: false
   when: csi_controller_rollout.rc != 0
 
+- name: Show CSI driver previous logs on failure
+  shell: |
+    pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
+    if [ -n "$pod" ]; then
+      kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
+    fi
+  register: csi_driver_previous_logs
+  changed_when: false
+  failed_when: false
+  when: csi_controller_rollout.rc != 0
+
+- name: Show sidecar previous logs on failure
+  shell: |
+    pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
+    if [ -n "$pod" ]; then
+      for container in csi-attacher csi-resizer csi-provisioner; do
+        echo "===== $container ====="
+        kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
+      done
+    fi
+  register: csi_sidecar_previous_logs
+  changed_when: false
+  failed_when: false
+  when: csi_controller_rollout.rc != 0
+
 - name: Show recent kube-system events on failure
   command: kubectl -n kube-system get events --sort-by=.lastTimestamp
   register: csi_recent_events
@@ -119,28 +144,118 @@
   fail:
     msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}"
   when: csi_node_rollout.rc != 0
 
-- name: Show CSI driver previous logs on failure
-  shell: |
-    pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
-    if [ -n "$pod" ]; then
-      kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
-    fi
-  register: csi_driver_previous_logs
-  changed_when: false
-  failed_when: false
-  when: csi_controller_rollout.rc != 0
-
-- name: Show sidecar previous logs on failure
-  shell: |
-    pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
-    if [ -n "$pod" ]; then
-      for container in csi-attacher csi-resizer csi-provisioner; do
-        echo "===== $container ====="
-        kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
-      done
-    fi
-  register: csi_sidecar_previous_logs
-  changed_when: false
-  failed_when: false
-  when: csi_controller_rollout.rc != 0
+- name: Apply CSI smoke test resources
+  shell: |
+    kubectl apply -f - <<'EOF'
+    apiVersion: v1
+    kind: PersistentVolumeClaim
+    metadata:
+      name: csi-smoke-pvc
+      namespace: kube-system
+    spec:
+      accessModes:
+        - ReadWriteOnce
+      resources:
+        requests:
+          storage: {{ csi_smoke_test_size }}
+      storageClassName: {{ csi_smoke_test_storage_class }}
+    ---
+    apiVersion: batch/v1
+    kind: Job
+    metadata:
+      name: csi-smoke-job
+      namespace: kube-system
+    spec:
+      backoffLimit: 0
+      template:
+        spec:
+          restartPolicy: Never
+          containers:
+            - name: write-and-read
+              image: busybox:1.36
+              command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"]
+              volumeMounts:
+                - name: data
+                  mountPath: /data
+          volumes:
+            - name: data
+              persistentVolumeClaim:
+                claimName: csi-smoke-pvc
+    EOF
+  changed_when: true
+  when: csi_smoke_test_enabled | bool
+
+- name: Wait for CSI smoke PVC to bind
+  command: kubectl -n kube-system wait --for=jsonpath={.status.phase}=Bound pvc/csi-smoke-pvc --timeout=120s
+  register: csi_smoke_pvc_wait
+  failed_when: false
+  changed_when: false
+  when: csi_smoke_test_enabled | bool
+
+- name: Wait for CSI smoke Job completion
+  command: kubectl -n kube-system wait --for=condition=complete job/csi-smoke-job --timeout=180s
+  register: csi_smoke_job_wait
+  failed_when: false
+  changed_when: false
+  when:
+    - csi_smoke_test_enabled | bool
+    - csi_smoke_pvc_wait.rc == 0
+
+- name: Show CSI smoke job logs
+  command: kubectl -n kube-system logs job/csi-smoke-job
+  register: csi_smoke_job_logs
+  failed_when: false
+  changed_when: false
+  when: csi_smoke_test_enabled | bool
+
+- name: Show CSI smoke resources on failure
+  shell: kubectl -n kube-system get pvc csi-smoke-pvc job csi-smoke-job -o wide; kubectl -n kube-system get pod -l job-name=csi-smoke-job -o wide
+  register: csi_smoke_status
+  failed_when: false
+  changed_when: false
+  when:
+    - csi_smoke_test_enabled | bool
+    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
+
+- name: Show CSI smoke pod describe on failure
+  shell: |
+    pod="$(kubectl -n kube-system get pods -l job-name=csi-smoke-job -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
+    if [ -n "$pod" ]; then
+      kubectl -n kube-system describe pod "$pod"
+    fi
+  register: csi_smoke_pod_describe
+  failed_when: false
+  changed_when: false
+  when:
+    - csi_smoke_test_enabled | bool
+    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
+
+- name: Fail when CSI smoke test fails
+  fail:
+    msg: |
+      CSI smoke test failed.
+
+      PVC wait:
+      {{ csi_smoke_pvc_wait.stdout | default(csi_smoke_pvc_wait.stderr | default('n/a')) }}
+
+      Job wait:
+      {{ csi_smoke_job_wait.stdout | default(csi_smoke_job_wait.stderr | default('n/a')) }}
+
+      Resources:
+      {{ csi_smoke_status.stdout | default('n/a') }}
+
+      Pod describe:
+      {{ csi_smoke_pod_describe.stdout | default('n/a') }}
+
+      Job logs:
+      {{ csi_smoke_job_logs.stdout | default('n/a') }}
+  when:
+    - csi_smoke_test_enabled | bool
+    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
+
+- name: Cleanup CSI smoke test resources
+  command: kubectl -n kube-system delete job csi-smoke-job pvc csi-smoke-pvc --ignore-not-found
+  failed_when: false
+  changed_when: false
+  when: csi_smoke_test_enabled | bool