diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 53e59d0..5b7f1be 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -88,12 +88,8 @@ jobs: } ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1' - ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2' - ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3' ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1' ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2' - ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3' - ensure_import 'hcloud_server.workers[3]' 'k8s-cluster-worker-4' - name: Terraform Plan id: plan diff --git a/README.md b/README.md index 13748b2..3dc23aa 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,20 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → ** This repo uses Flux for continuous reconciliation after Terraform + Ansible bootstrap. +### Stable private-only baseline + +The current default target is a deliberately simplified baseline: + +- `1` control plane node +- `2` worker nodes +- private Hetzner network only +- Tailscale for operator access +- Flux-managed core addons only + +Detailed phase gates and success criteria live in `STABLE_BASELINE.md`. + +This is the default until rebuilds are consistently green. High availability, public ingress, and app-layer expansion come later. + ### Runtime secrets Runtime cluster secrets are moving to Doppler + External Secrets Operator. @@ -222,6 +236,20 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed - Core infrastructure addons are Flux-managed from `infrastructure/addons/`. - Active Flux addons include `addon-ccm`, `addon-csi`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`, `addon-observability`, and `addon-observability-content`. 
- Ansible is limited to cluster bootstrap, private-access setup, and prerequisite secret creation for Flux-managed addons. +- `addon-flux-ui` is optional for the stable-baseline phase and is not a blocker for rebuild success. + +### Stable baseline acceptance + +A rebuild is considered successful only when all of the following pass without manual intervention: + +- Terraform apply succeeds for the default `1` control plane and `2` workers. +- Ansible bootstrap succeeds end-to-end. +- All nodes become `Ready`. +- `hcloud-cloud-controller-manager` and `hcloud-csi` are `Ready`. +- Required External Secrets sync successfully. +- Tailscale private access works. +- Grafana and Prometheus are reachable privately. +- Terraform destroy succeeds cleanly or succeeds after workflow retries. ## Observability Stack diff --git a/STABLE_BASELINE.md b/STABLE_BASELINE.md new file mode 100644 index 0000000..484a485 --- /dev/null +++ b/STABLE_BASELINE.md @@ -0,0 +1,47 @@ +# Stable Private-Only Baseline + +This document defines the current engineering target for this repository. + +## Topology + +- 1 control plane +- 2 workers +- private Hetzner network +- Tailscale for operator access + +## In Scope + +- Terraform infrastructure bootstrap +- Ansible k3s bootstrap +- Flux core reconciliation +- Hetzner CCM +- Hetzner CSI +- External Secrets Operator with Doppler +- Tailscale private access +- Observability stack + +## Out of Scope + +- HA control plane +- public ingress or DNS +- public TLS +- app workloads +- DR / backup strategy +- upgrade strategy + +## Phase Gates + +1. Terraform apply completes for the default topology. +2. k3s server bootstrap completes and kubeconfig works. +3. Workers join and all nodes are Ready. +4. Flux source and infrastructure reconciliation are healthy. +5. CCM is Ready. +6. CSI is Ready and a PVC can bind. +7. External Secrets sync required secrets. +8. Tailscale private access works. +9. Observability is healthy and reachable privately. +10. 
Terraform destroy succeeds cleanly or via workflow retry. + +## Success Criteria + +The baseline is considered stable only after two consecutive fresh rebuilds pass all phase gates with no manual fixes. diff --git a/terraform/variables.tf b/terraform/variables.tf index d21f797..abfc326 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -25,7 +25,7 @@ variable "cluster_name" { variable "control_plane_count" { description = "Number of control plane nodes" type = number - default = 3 + default = 1 } variable "control_plane_type" { @@ -37,7 +37,7 @@ variable "control_plane_type" { variable "worker_count" { description = "Number of worker nodes" type = number - default = 4 + default = 2 } variable "worker_type" {