Compare commits

1 commit

Author: MichaelFisher1997
SHA1: f90075f098
Message: terraform fmt
Checks: All checks were successful (Gitea Destroy Terraform / Terraform Destroy (push), successful in 20s)
Date: 2025-04-17 17:20:22 +01:00
34 changed files with 334 additions and 2878 deletions

View File

@@ -1,181 +0,0 @@
name: Kubeadm Bootstrap
run-name: ${{ gitea.actor }} requested kubeadm bootstrap
on:
  workflow_dispatch:
    inputs:
      confirm:
        description: "Type BOOTSTRAP to run rebuild + kubeadm bootstrap"
        required: true
        type: string
concurrency:
  group: kubeadm-bootstrap
  cancel-in-progress: false
jobs:
  bootstrap:
    name: "Rebuild and Bootstrap Cluster"
    runs-on: ubuntu-latest
    steps:
      - name: Validate confirmation phrase
        run: |
          if [ "${{ inputs.confirm }}" != "BOOTSTRAP" ]; then
            echo "Confirmation failed. You must type BOOTSTRAP."
            exit 1
          fi
      - name: Checkout repository
        uses: https://gitea.com/actions/checkout@v4
      - name: Create SSH key
        run: |
          install -m 0700 -d ~/.ssh
          KEY_SOURCE=""
          KEY_CONTENT=""
          KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
          if [ -n "$KEY_B64" ]; then
            KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
            KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
          else
            KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
            if [ -n "$KEY_CONTENT" ]; then
              KEY_SOURCE="SSH_KEY_PRIVATE"
            else
              KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
              KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
            fi
          fi
          if [ -z "$KEY_CONTENT" ]; then
            echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
            exit 1
          fi
          KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
          if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
            printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          else
            printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          fi
          chmod 0600 ~/.ssh/id_ed25519
          if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
            echo "Invalid private key content from $KEY_SOURCE"
            exit 1
          fi
      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.6.6
          terraform_wrapper: false
      - name: Build Terraform backend files
        working-directory: terraform
        run: |
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
          EOF
          cat > backend.hcl << EOF
          bucket = "${{ secrets.B2_TF_BUCKET }}"
          key = "terraform.tfstate"
          region = "us-east-005"
          endpoints = {
            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
          }
          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
          skip_credentials_validation = true
          skip_metadata_api_check = true
          skip_region_validation = true
          skip_requesting_account_id = true
          use_path_style = true
          EOF
      - name: Terraform init for state read
        working-directory: terraform
        run: terraform init -reconfigure -backend-config=backend.hcl
      - name: Create kubeadm inventory
        env:
          KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
        run: |
          set -euo pipefail
          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
      - name: Validate nix installation
        run: |
          if [ -x /nix/var/nix/profiles/default/bin/nix ]; then
            /nix/var/nix/profiles/default/bin/nix --version
            exit 0
          fi
          if command -v nix >/dev/null 2>&1; then
            nix --version
            exit 0
          fi
          echo "Nix missing; installing no-daemon Nix for this runner job"
          if [ "$(id -u)" -eq 0 ]; then
            mkdir -p /nix
            chown root:root /nix
            chmod 0755 /nix
            if ! getent group nixbld >/dev/null 2>&1; then
              groupadd --system nixbld
            fi
            for i in $(seq 1 10); do
              if ! id "nixbld$i" >/dev/null 2>&1; then
                useradd --system --create-home --home-dir /var/empty --shell /usr/sbin/nologin "nixbld$i"
              fi
              usermod -a -G nixbld "nixbld$i"
            done
          fi
          sh <(curl -L https://nixos.org/nix/install) --no-daemon
          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
            . /root/.nix-profile/etc/profile.d/nix.sh
          fi
          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"
          nix --version
      - name: Install nixos-rebuild tool
        env:
          NIX_CONFIG: experimental-features = nix-command flakes
        run: |
          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
            . /root/.nix-profile/etc/profile.d/nix.sh
          fi
          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"
          nix profile install nixpkgs#nixos-rebuild
      - name: Run cluster rebuild and bootstrap
        env:
          NIX_CONFIG: experimental-features = nix-command flakes
          FAST_MODE: "1"
          WORKER_PARALLELISM: "3"
          REBUILD_TIMEOUT: "45m"
          REBUILD_RETRIES: "2"
        run: |
          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
            . /root/.nix-profile/etc/profile.d/nix.sh
          fi
          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
          ./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh

View File

@@ -1,112 +0,0 @@
name: Kubeadm Reset
run-name: ${{ gitea.actor }} requested kubeadm reset
on:
  workflow_dispatch:
    inputs:
      confirm:
        description: "Type RESET to run kubeadm reset on all nodes"
        required: true
        type: string
concurrency:
  group: kubeadm-bootstrap
  cancel-in-progress: false
jobs:
  reset:
    name: "Reset Cluster Nodes"
    runs-on: ubuntu-latest
    steps:
      - name: Validate confirmation phrase
        run: |
          if [ "${{ inputs.confirm }}" != "RESET" ]; then
            echo "Confirmation failed. You must type RESET."
            exit 1
          fi
      - name: Checkout repository
        uses: https://gitea.com/actions/checkout@v4
      - name: Create SSH key
        run: |
          install -m 0700 -d ~/.ssh
          KEY_SOURCE=""
          KEY_CONTENT=""
          KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
          if [ -n "$KEY_B64" ]; then
            KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
            KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
          else
            KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
            if [ -n "$KEY_CONTENT" ]; then
              KEY_SOURCE="SSH_KEY_PRIVATE"
            else
              KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
              KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
            fi
          fi
          if [ -z "$KEY_CONTENT" ]; then
            echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
            exit 1
          fi
          KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
          if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
            printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          else
            printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          fi
          chmod 0600 ~/.ssh/id_ed25519
          if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
            echo "Invalid private key content from $KEY_SOURCE"
            exit 1
          fi
      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.6.6
          terraform_wrapper: false
      - name: Build Terraform backend files
        working-directory: terraform
        run: |
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
          EOF
          cat > backend.hcl << EOF
          bucket = "${{ secrets.B2_TF_BUCKET }}"
          key = "terraform.tfstate"
          region = "us-east-005"
          endpoints = {
            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
          }
          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
          skip_credentials_validation = true
          skip_metadata_api_check = true
          skip_region_validation = true
          skip_requesting_account_id = true
          use_path_style = true
          EOF
      - name: Terraform init for state read
        working-directory: terraform
        run: terraform init -reconfigure -backend-config=backend.hcl
      - name: Create kubeadm inventory
        env:
          KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
        run: |
          set -euo pipefail
          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
      - name: Run cluster reset
        run: |
          ./nixos/kubeadm/scripts/reset-cluster-nodes.sh

View File

@@ -1,209 +1,47 @@
-name: Terraform Apply
+name: Gitea Actions Demo
+run-name: ${{ gitea.actor }} is deploying with Terraform 🚀
 on:
   push:
     branches:
       - master
-concurrency:
-  group: terraform-global
-  cancel-in-progress: false
 jobs:
   terraform:
     name: "Terraform Apply"
     runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: write
-    env:
-      TF_VAR_TS_AUTHKEY: ${{ secrets.TAILSCALE_KEY }}
-      TF_VAR_ssh_key: ${{ secrets.SSH_PUBLIC_KEY }}
     steps:
       - name: Checkout repository
-        uses: https://gitea.com/actions/checkout@v4
+        uses: actions/checkout@v4
-      - name: Create secrets.tfvars
-        working-directory: terraform
-        run: |
-          cat > secrets.auto.tfvars << EOF
-          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
-          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
-          EOF
-          cat > backend.hcl << EOF
-          bucket = "${{ secrets.B2_TF_BUCKET }}"
-          key = "terraform.tfstate"
-          region = "us-east-005"
-          endpoints = {
-            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
-          }
-          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
-          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
-          skip_credentials_validation = true
-          skip_metadata_api_check = true
-          skip_region_validation = true
-          skip_requesting_account_id = true
-          use_path_style = true
-          EOF
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v2
         with:
           terraform_version: 1.6.6
-          terraform_wrapper: false
-      - name: Inject sensitive secrets
-        working-directory: terraform
-        run: |
-          echo 'proxmox_password = "${{ secrets.PROXMOX_PASSWORD }}"' >> terraform.tfvars
       - name: Terraform Init
         working-directory: terraform
-        run: terraform init -reconfigure -backend-config=backend.hcl
+        run: terraform init
       - name: Terraform Plan
         working-directory: terraform
-        run: |
-          set -euo pipefail
-          for attempt in 1 2; do
-            echo "Terraform plan attempt $attempt/2"
-            if timeout 20m terraform plan -refresh=false -parallelism=1 -out=tfplan; then
-              exit 0
-            fi
-            if [ "$attempt" -eq 1 ]; then
-              echo "Plan attempt failed or timed out; retrying in 20s"
-              sleep 20
-            fi
-          done
-          echo "Terraform plan failed after retries"
-          exit 1
+        run: terraform plan
-      - name: Block accidental destroy
-        env:
-          ALLOW_TF_DESTROY: ${{ secrets.ALLOW_TF_DESTROY }}
-        working-directory: terraform
-        run: |
-          terraform show -json -no-color tfplan > tfplan.json
-          DESTROY_COUNT=$(python3 -c 'import json; raw=open("tfplan.json","rb").read().decode("utf-8","ignore"); start=raw.find("{"); data=json.JSONDecoder().raw_decode(raw[start:])[0]; print(sum(1 for rc in data.get("resource_changes", []) if "delete" in rc.get("change", {}).get("actions", [])))')
-          echo "Planned deletes: $DESTROY_COUNT"
-          if [ "$DESTROY_COUNT" -gt 0 ] && [ "${ALLOW_TF_DESTROY}" != "true" ]; then
-            echo "Destroy actions detected. Set ALLOW_TF_DESTROY=true to allow."
-            exit 1
-          fi
       - name: Terraform Apply
         working-directory: terraform
-        run: terraform apply -parallelism=1 -auto-approve tfplan
+        run: terraform apply -auto-approve
-      - name: Create SSH key
-        run: |
-          install -m 0700 -d ~/.ssh
-          KEY_SOURCE=""
-          KEY_CONTENT=""
-          KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
-          if [ -n "$KEY_B64" ]; then
-            KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
-            KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
-          else
-            KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
-            if [ -n "$KEY_CONTENT" ]; then
-              KEY_SOURCE="SSH_KEY_PRIVATE"
-            else
-              KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
-              KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
-            fi
-          fi
-          if [ -z "$KEY_CONTENT" ]; then
-            echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
-            exit 1
-          fi
-          KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
-          if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
-            printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
-          else
-            printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
-          fi
-          chmod 0600 ~/.ssh/id_ed25519
-          if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
-            echo "Invalid private key content from $KEY_SOURCE"
-            exit 1
-          fi
-      - name: Verify SSH keypair match
-        run: |
-          if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/tmp/key.pub 2>/tmp/key.err; then
-            echo "Invalid private key content in SSH_KEY_PRIVATE/KUBEADM_SSH_PRIVATE_KEY"
-            cat /tmp/key.err
-            exit 1
-          fi
-          printf '%s\n' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r' > /tmp/secret.pub
-          if ! ssh-keygen -lf /tmp/secret.pub >/tmp/secret.fp 2>/tmp/secret.err; then
-            echo "Invalid SSH_KEY_PUBLIC format"
-            cat /tmp/secret.err
-            exit 1
-          fi
-          PRIV_FP="$(ssh-keygen -lf /tmp/key.pub | awk '{print $2}')"
-          PUB_FP="$(awk '{print $2}' /tmp/secret.fp)"
-          echo "private fingerprint: $PRIV_FP"
-          echo "public fingerprint: $PUB_FP"
-          if [ "$PRIV_FP" != "$PUB_FP" ]; then
-            echo "SSH_KEY_PRIVATE does not match SSH_KEY_PUBLIC. Update secrets with the same keypair."
-            exit 1
-          fi
-      - name: Create kubeadm inventory from Terraform outputs
-        env:
-          KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
-        run: |
-          set -euo pipefail
-          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
-      - name: Ensure nix and nixos-rebuild
-        env:
-          NIX_CONFIG: experimental-features = nix-command flakes
-        run: |
-          if [ ! -x /nix/var/nix/profiles/default/bin/nix ] && ! command -v nix >/dev/null 2>&1; then
-            if [ "$(id -u)" -eq 0 ]; then
-              mkdir -p /nix
-              chown root:root /nix
-              chmod 0755 /nix
-              if ! getent group nixbld >/dev/null 2>&1; then
-                groupadd --system nixbld
-              fi
-              for i in $(seq 1 10); do
-                if ! id "nixbld$i" >/dev/null 2>&1; then
-                  useradd --system --create-home --home-dir /var/empty --shell /usr/sbin/nologin "nixbld$i"
-                fi
-                usermod -a -G nixbld "nixbld$i"
-              done
-            fi
-            sh <(curl -L https://nixos.org/nix/install) --no-daemon
-          fi
-          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
-            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
-          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
-            . /root/.nix-profile/etc/profile.d/nix.sh
-          fi
-          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"
-          nix --version
-          nix profile install nixpkgs#nixos-rebuild
-      - name: Rebuild and bootstrap/reconcile kubeadm cluster
-        env:
-          NIX_CONFIG: experimental-features = nix-command flakes
-          FAST_MODE: "1"
-          WORKER_PARALLELISM: "3"
-          REBUILD_TIMEOUT: "45m"
-          REBUILD_RETRIES: "2"
-        run: |
-          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
-            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
-          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
-            . /root/.nix-profile/etc/profile.d/nix.sh
-          fi
-          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
-          ./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh

View File

@@ -1,126 +1,43 @@
-name: Terraform Destroy
+name: Gitea Destroy Terraform
-run-name: ${{ gitea.actor }} requested Terraform destroy
+run-name: ${{ gitea.actor }} triggered a Terraform Destroy 🧨
 on:
-  workflow_dispatch:
-    inputs:
-      confirm:
-        description: "Type NUKE to confirm destroy"
-        required: true
-        type: string
-      target:
-        description: "Destroy scope"
-        required: true
-        default: all
-        type: choice
-        options:
-          - all
-          - control-planes
-          - workers
-concurrency:
-  group: terraform-global
-  cancel-in-progress: false
+  push:
+    branches:
+      - destroy
 jobs:
   destroy:
     name: "Terraform Destroy"
     runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: write
-    env:
-      TF_VAR_TS_AUTHKEY: ${{ secrets.TAILSCALE_KEY }}
-      TF_VAR_ssh_key: ${{ secrets.SSH_PUBLIC_KEY }}
     steps:
-      - name: Validate confirmation phrase
-        run: |
-          if [ "${{ inputs.confirm }}" != "NUKE" ]; then
-            echo "Confirmation failed. You must type NUKE."
-            exit 1
-          fi
       - name: Checkout repository
-        uses: https://gitea.com/actions/checkout@v4
+        uses: actions/checkout@v4
-      - name: Create Terraform secret files
-        working-directory: terraform
-        run: |
-          cat > secrets.auto.tfvars << EOF
-          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
-          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
-          EOF
-          cat > backend.hcl << EOF
-          bucket = "${{ secrets.B2_TF_BUCKET }}"
-          key = "terraform.tfstate"
-          region = "us-east-005"
-          endpoints = {
-            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
-          }
-          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
-          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
-          skip_credentials_validation = true
-          skip_metadata_api_check = true
-          skip_region_validation = true
-          skip_requesting_account_id = true
-          use_path_style = true
-          EOF
       - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
         with:
           terraform_version: 1.6.6
-          terraform_wrapper: false
-      - name: Inject sensitive secrets
-        working-directory: terraform
-        run: |
-          echo 'proxmox_password = "${{ secrets.PROXMOX_PASSWORD }}"' >> terraform.tfvars
       - name: Terraform Init
         working-directory: terraform
-        run: terraform init -reconfigure -backend-config=backend.hcl
+        run: terraform init
-      - name: Terraform Destroy Plan
+      - name: Terraform Destroy
         working-directory: terraform
-        run: |
-          set -euo pipefail
-          case "${{ inputs.target }}" in
-            all)
-              TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -out=tfdestroy"
-              ;;
-            control-planes)
-              TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -target=proxmox_vm_qemu.control_planes -out=tfdestroy"
-              ;;
-            workers)
-              TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -target=proxmox_vm_qemu.workers -out=tfdestroy"
-              ;;
-            *)
-              echo "Invalid destroy target: ${{ inputs.target }}"
-              exit 1
-              ;;
-          esac
-          for attempt in 1 2; do
-            echo "Terraform destroy plan attempt $attempt/2"
-            if timeout 20m sh -c "$TF_PLAN_CMD"; then
-              exit 0
-            fi
-            if [ "$attempt" -eq 1 ]; then
-              echo "Destroy plan attempt failed or timed out; retrying in 20s"
-              sleep 20
-            fi
-          done
-          echo "Terraform destroy plan failed after retries"
-          exit 1
+        run: terraform destroy -auto-approve
-      - name: Terraform Destroy Apply
-        working-directory: terraform
-        run: |
-          set +e
-          terraform apply -auto-approve tfdestroy 2>&1 | tee destroy-apply.log
-          APPLY_EXIT=${PIPESTATUS[0]}
-          if [ "$APPLY_EXIT" -ne 0 ] && [ -f errored.tfstate ] && grep -q "Failed to persist state to backend" destroy-apply.log; then
-            echo "Detected backend state write failure after destroy; attempting recovery push..."
-            terraform state push errored.tfstate
-            PUSH_EXIT=$?
-            if [ "$PUSH_EXIT" -eq 0 ]; then
-              echo "Recovered by pushing errored.tfstate to backend."
-              exit 0
-            fi
-          fi
-          exit "$APPLY_EXIT"

View File

@@ -1,4 +1,5 @@
-name: Terraform Plan
+name: Gitea Actions Demo
+run-name: ${{ gitea.actor }} is testing out Gitea Actions 🚀
 on:
   push:
@@ -6,56 +7,38 @@ on:
       - stage
       - test
-concurrency:
-  group: terraform-global
-  cancel-in-progress: false
 jobs:
   terraform:
     name: "Terraform Plan"
     runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: write
-    env:
-      TF_VAR_TAILSCALE_KEY: ${{ secrets.TAILSCALE_KEY }}
-      TF_VAR_TS_AUTHKEY: ${{ secrets.TAILSCALE_KEY }}
-      TF_VAR_ssh_key: ${{ secrets.SSH_PUBLIC_KEY }}
     steps:
       - name: Checkout repository
-        uses: https://gitea.com/actions/checkout@v4
+        uses: actions/checkout@v4
-      - name: Create secrets.tfvars
-        working-directory: terraform
-        run: |
-          echo "PM_API_TOKEN_SECRET length: $(echo -n '${{ secrets.PM_API_TOKEN_SECRET }}' | wc -c)"
-          cat > secrets.auto.tfvars << EOF
-          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
-          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
-          EOF
-          cat > backend.hcl << EOF
-          bucket = "${{ secrets.B2_TF_BUCKET }}"
-          key = "terraform.tfstate"
-          region = "us-east-005"
-          endpoints = {
-            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
-          }
-          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
-          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
-          skip_credentials_validation = true
-          skip_metadata_api_check = true
-          skip_region_validation = true
-          skip_requesting_account_id = true
-          use_path_style = true
-          EOF
-          echo "Created secrets.auto.tfvars:"
-          cat secrets.auto.tfvars | sed 's/=.*/=***/'
-          echo "Using token ID from terraform.tfvars:"
-          grep '^pm_api_token_id' terraform.tfvars
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v2
         with:
           terraform_version: 1.6.6
-          terraform_wrapper: false
-      - name: Inject sensitive secrets
-        working-directory: terraform
-        run: |
-          echo 'proxmox_password = "${{ secrets.PROXMOX_PASSWORD }}"' >> terraform.tfvars
       - name: Terraform Init
         working-directory: terraform
-        run: terraform init -reconfigure -backend-config=backend.hcl
+        run: terraform init
       - name: Terraform Format Check
         working-directory: terraform
@@ -67,35 +50,11 @@ jobs:
       - name: Terraform Plan
         working-directory: terraform
-        run: |
-          set -euo pipefail
-          for attempt in 1 2; do
-            echo "Terraform plan attempt $attempt/2"
-            if timeout 20m terraform plan -refresh=false -parallelism=1 -out=tfplan; then
-              exit 0
-            fi
-            if [ "$attempt" -eq 1 ]; then
-              echo "Plan attempt failed or timed out; retrying in 20s"
-              sleep 20
-            fi
-          done
-          echo "Terraform plan failed after retries"
-          exit 1
+        run: terraform plan -out=tfplan
-      - name: Block accidental destroy
-        env:
-          ALLOW_TF_DESTROY: ${{ secrets.ALLOW_TF_DESTROY }}
-        working-directory: terraform
-        run: |
-          terraform show -json -no-color tfplan > tfplan.json
-          DESTROY_COUNT=$(python3 -c 'import json; raw=open("tfplan.json","rb").read().decode("utf-8","ignore"); start=raw.find("{"); data=json.JSONDecoder().raw_decode(raw[start:])[0]; print(sum(1 for rc in data.get("resource_changes", []) if "delete" in rc.get("change", {}).get("actions", [])))')
-          echo "Planned deletes: $DESTROY_COUNT"
-          if [ "$DESTROY_COUNT" -gt 0 ] && [ "${ALLOW_TF_DESTROY}" != "true" ]; then
-            echo "Destroy actions detected. Set ALLOW_TF_DESTROY=true to allow."
-            exit 1
-          fi
-      # NOTE: Disabled artifact upload for now.
-      # On this Gitea/act runner, post-job hooks from artifact actions can
-      # fail during "Complete job" even when all Terraform steps succeeded.
-      # Re-enable once runner/action compatibility is confirmed.
+      - name: Upload Terraform Plan
+        uses: actions/upload-artifact@v3
+        with:
+          name: terraform-plan
+          path: terraform/tfplan

.gitignore (vendored): 4 changes
View File

@@ -1,6 +1,2 @@
 ./terraform/.terraform
 terraform/.terraform/
-terraform/test-apply.sh
-terraform/test-plan.sh
-terraform/test-destroy.sh
-terraform/tfplan

View File

@@ -1,169 +0,0 @@
# Kubeadm Cluster Layout (NixOS)
This folder defines role-based NixOS configs for a kubeadm cluster.
## Topology
- Control planes: `cp-1`, `cp-2`, `cp-3`
- Workers: `wk-1`, `wk-2`, `wk-3`
## What this provides
- Shared Kubernetes/node prerequisites in `modules/k8s-common.nix`
- Shared cluster defaults in `modules/k8s-cluster-settings.nix`
- Role-specific settings for control planes and workers
- Generated per-node host configs from `flake.nix` (no duplicated host files)
- Bootstrap helper commands on each node:
- `th-kubeadm-init`
- `th-kubeadm-join-control-plane`
- `th-kubeadm-join-worker`
- `th-kubeadm-status`
- A Python bootstrap controller for orchestration (usage sketch after this list):
- `bootstrap/controller.py`
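The controller is a plain CLI driven by `argparse`; as a minimal usage sketch (stage names and the `--inventory` flag are taken from `main()` in `controller.py`):
```bash
# Run the full reconcile state machine against the default inventory path.
python3 bootstrap/controller.py reconcile

# Or run one stage at a time with an explicit inventory file.
python3 bootstrap/controller.py rebuild --inventory ./scripts/inventory.env
```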
## Layered architecture
- `terraform/`: VM lifecycle only
- `nixos/kubeadm/modules/`: declarative node OS config only
- `nixos/kubeadm/bootstrap/controller.py`: imperative cluster reconciliation state machine
## Hardware config files
The flake automatically imports `hosts/hardware/<host>.nix` if present.
Copy each node's generated hardware config into this folder:
```bash
sudo nixos-generate-config
sudo cp /etc/nixos/hardware-configuration.nix ./hosts/hardware/cp-1.nix
```
Repeat for each node (`cp-2`, `cp-3`, `wk-1`, `wk-2`, `wk-3`).
## Deploy approach
Start from one node at a time while experimenting:
```bash
sudo nixos-rebuild switch --flake .#cp-1
```
For remote target-host workflows, use your preferred deploy wrapper later
(`nixos-rebuild --target-host ...` or deploy-rs/colmena).
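As a sketch of that remote workflow, the following mirrors the invocation `bootstrap/controller.py` runs for each node (the IP is a placeholder; substitute your node's address):
```bash
# Rebuild wk-1 from this flake over SSH; --use-remote-sudo matches the controller.
NIX_SSHOPTS="-o IdentitiesOnly=yes -i ~/.ssh/id_ed25519" \
  nixos-rebuild switch --flake .#wk-1 \
  --target-host micqdf@192.168.1.31 \
  --use-remote-sudo
```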
## Bootstrap runbook (kubeadm + kube-vip + Flannel)
1. Apply Nix config on all nodes (`cp-*`, then `wk-*`).
2. On `cp-1`, run:
```bash
sudo th-kubeadm-init
```
This infers the control-plane VIP as `<node-subnet>.250` on `eth0`, creates the
kube-vip static pod manifest, and runs `kubeadm init`.
3. Install Flannel from `cp-1`:
```bash
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/v0.25.5/Documentation/kube-flannel.yml
```
4. Generate join commands on `cp-1`:
```bash
sudo kubeadm token create --print-join-command
sudo kubeadm init phase upload-certs --upload-certs
```
5. Join `cp-2` and `cp-3`:
```bash
sudo th-kubeadm-join-control-plane '<kubeadm join ... --control-plane --certificate-key ...>'
```
6. Join workers:
```bash
sudo th-kubeadm-join-worker '<kubeadm join ...>'
```
7. Validate from a control plane:
```bash
kubectl get nodes -o wide
kubectl -n kube-system get pods -o wide
```
## Fresh bootstrap flow (recommended)
1. Copy and edit inventory:
```bash
cp ./scripts/inventory.example.env ./scripts/inventory.env
$EDITOR ./scripts/inventory.env
```
2. Rebuild all nodes and bootstrap a fresh cluster:
```bash
./scripts/rebuild-and-bootstrap.sh
```
Optional tuning env vars:
```bash
FAST_MODE=1 WORKER_PARALLELISM=3 REBUILD_TIMEOUT=45m REBUILD_RETRIES=2 ./scripts/rebuild-and-bootstrap.sh
```
- `FAST_MODE=1` skips pre-rebuild remote GC cleanup to reduce wall-clock time.
- Set `FAST_MODE=0` for a slower but more aggressive space cleanup pass.
### Bootstrap controller state
The controller stores checkpoints in two places:
- Remote (source of truth): `/var/lib/terrahome/bootstrap-state.json` on `cp-1`
- Local copy (workflow/debug artifact): `nixos/kubeadm/bootstrap/bootstrap-state-last.json`
This makes retries resumable and keeps failure context visible from CI.
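Assuming the paths above, a quick way to inspect the last checkpoint from a workstation (host and user are placeholders for your `cp-1` entry):
```bash
# Pretty-print the remote source-of-truth state file.
ssh micqdf@cp-1 'sudo cat /var/lib/terrahome/bootstrap-state.json' | jq .
```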
3. If you only want to reset Kubernetes state on existing VMs:
```bash
./scripts/reset-cluster-nodes.sh
```
For a full nuke/recreate lifecycle:
- run Terraform destroy/apply for VMs first,
- then run `./scripts/rebuild-and-bootstrap.sh` again.
Node lists now come directly from static Terraform outputs, so bootstrap no longer
depends on Proxmox guest-agent IP discovery or SSH subnet scanning.
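For orientation, here is a minimal `inventory.env` in the shape the controller parses (node names match this repo; the IPs are placeholders):
```bash
# Space-separated name=ip pairs, consumed by bootstrap/controller.py.
CONTROL_PLANES="cp-1=192.168.1.21 cp-2=192.168.1.22 cp-3=192.168.1.23"
WORKERS="wk-1=192.168.1.31 wk-2=192.168.1.32 wk-3=192.168.1.33"
PRIMARY_CONTROL_PLANE=cp-1
SSH_USER=micqdf
# Optional knobs also read by the controller:
# SSH_KEY_PATH=$HOME/.ssh/id_ed25519
# REBUILD_TIMEOUT=45m
```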
## Optional Gitea workflow automation
Primary flow:
- Push to `master` triggers `.gitea/workflows/terraform-apply.yml`
- That workflow now does Terraform apply and then runs a fresh kubeadm bootstrap automatically
Manual dispatch workflows are available:
- `.gitea/workflows/kubeadm-bootstrap.yml`
- `.gitea/workflows/kubeadm-reset.yml`
Required repository secrets:
- Existing Terraform/backend secrets used by current workflows (`B2_*`, `PM_API_TOKEN_SECRET`, `SSH_KEY_PUBLIC`)
- SSH private key: prefer `KUBEADM_SSH_PRIVATE_KEY`, fallback to existing `SSH_KEY_PRIVATE`
Optional secrets:
- `KUBEADM_SSH_USER` (defaults to `micqdf`)
Node IPs are rendered directly from static Terraform outputs (`control_plane_vm_ipv4`, `worker_vm_ipv4`), so you do not need per-node IP secrets or SSH discovery fallbacks.
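To see exactly what the renderer consumes, you can inspect those outputs by hand (a sketch; assumes `jq` is installed and `terraform output -json` returns its usual `{"<name>": {"value": ...}}` shape):
```bash
# The same JSON stream CI pipes into render-inventory-from-tf-output.py.
terraform -chdir=terraform output -json | jq '.control_plane_vm_ipv4.value, .worker_vm_ipv4.value'
```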
## Notes
- Scripts are intentionally manual-triggered (predictable for homelab bring-up).
- If `.250` on the node subnet is already in use, change `controlPlaneVipSuffix`
in `modules/k8s-cluster-settings.nix` before bootstrap; the derivation is sketched below.
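To make the VIP inference concrete, this sketch repeats the derivation from `th-kubeadm-init` (the `192.168.1.x` addresses are just an example):
```bash
# Derive the kube-vip address the same way th-kubeadm-init does.
iface=eth0
local_ip_cidr=$(ip -4 -o addr show dev "$iface" | awk 'NR==1 {print $4}')  # e.g. 192.168.1.21/24
subnet_prefix=$(echo "$local_ip_cidr" | cut -d/ -f1 | awk -F. '{print $1"."$2"."$3}')
echo "VIP: $subnet_prefix.250"  # -> 192.168.1.250 in this example
```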

View File

@@ -1,446 +0,0 @@
#!/usr/bin/env python3
import argparse
import base64
import json
import os
import shlex
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path


def run_local(cmd, check=True, capture=False):
    if isinstance(cmd, str):
        shell = True
    else:
        shell = False
    return subprocess.run(
        cmd,
        shell=shell,
        check=check,
        text=True,
        capture_output=capture,
    )


def load_inventory(inventory_file):
    inventory_file = Path(inventory_file).resolve()
    if not inventory_file.exists():
        raise RuntimeError(f"Missing inventory file: {inventory_file}")
    cmd = (
        "set -a; "
        f"source {shlex.quote(str(inventory_file))}; "
        "python3 - <<'PY'\n"
        "import json, os\n"
        "print(json.dumps(dict(os.environ)))\n"
        "PY"
    )
    proc = run_local(["bash", "-lc", cmd], capture=True)
    env = json.loads(proc.stdout)
    node_ips = {}
    cp_names = []
    wk_names = []
    control_planes = env.get("CONTROL_PLANES", "").strip()
    workers = env.get("WORKERS", "").strip()
    if control_planes:
        for pair in control_planes.split():
            name, ip = pair.split("=", 1)
            node_ips[name] = ip
            cp_names.append(name)
    else:
        for key in sorted(k for k in env if k.startswith("CP_") and k[3:].isdigit()):
            idx = key.split("_", 1)[1]
            name = f"cp-{idx}"
            node_ips[name] = env[key]
            cp_names.append(name)
    if workers:
        for pair in workers.split():
            name, ip = pair.split("=", 1)
            node_ips[name] = ip
            wk_names.append(name)
    else:
        for key in sorted(k for k in env if k.startswith("WK_") and k[3:].isdigit()):
            idx = key.split("_", 1)[1]
            name = f"wk-{idx}"
            node_ips[name] = env[key]
            wk_names.append(name)
    if not cp_names or not wk_names:
        raise RuntimeError("Inventory must include control planes and workers")
    primary_cp = env.get("PRIMARY_CONTROL_PLANE", "cp-1")
    if primary_cp not in node_ips:
        primary_cp = cp_names[0]
    return {
        "env": env,
        "node_ips": node_ips,
        "cp_names": cp_names,
        "wk_names": wk_names,
        "primary_cp": primary_cp,
        "inventory_file": str(inventory_file),
    }


class Controller:
    def __init__(self, cfg):
        self.env = cfg["env"]
        self.node_ips = cfg["node_ips"]
        self.cp_names = cfg["cp_names"]
        self.wk_names = cfg["wk_names"]
        self.primary_cp = cfg["primary_cp"]
        self.primary_ip = self.node_ips[self.primary_cp]
        self.script_dir = Path(__file__).resolve().parent
        self.flake_dir = Path(self.env.get("FLAKE_DIR") or (self.script_dir.parent)).resolve()
        self.ssh_user = self.env.get("SSH_USER", "micqdf")
        self.ssh_candidates = self.env.get("SSH_USER_CANDIDATES", f"root {self.ssh_user}").split()
        self.active_ssh_user = self.ssh_user
        self.ssh_key = self.env.get("SSH_KEY_PATH", str(Path.home() / ".ssh" / "id_ed25519"))
        self.ssh_opts = [
            "-o",
            "BatchMode=yes",
            "-o",
            "IdentitiesOnly=yes",
            "-o",
            "StrictHostKeyChecking=accept-new",
            "-i",
            self.ssh_key,
        ]
        self.rebuild_timeout = self.env.get("REBUILD_TIMEOUT", "45m")
        self.rebuild_retries = int(self.env.get("REBUILD_RETRIES", "2"))
        self.worker_parallelism = int(self.env.get("WORKER_PARALLELISM", "3"))
        self.fast_mode = self.env.get("FAST_MODE", "1")
        self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
        self.force_reinit = True
        self.ssh_ready_retries = int(self.env.get("SSH_READY_RETRIES", "20"))
        self.ssh_ready_delay = int(self.env.get("SSH_READY_DELAY_SEC", "15"))

    def log(self, msg):
        print(f"==> {msg}")

    def _ssh(self, user, ip, cmd, check=True):
        full = ["ssh", *self.ssh_opts, f"{user}@{ip}", f"bash -lc {shlex.quote(cmd)}"]
        return run_local(full, check=check, capture=True)

    def detect_user(self, ip):
        for attempt in range(1, self.ssh_ready_retries + 1):
            for user in self.ssh_candidates:
                proc = self._ssh(user, ip, "true", check=False)
                if proc.returncode == 0:
                    self.active_ssh_user = user
                    self.log(f"Using SSH user '{user}' for {ip}")
                    return
            if attempt < self.ssh_ready_retries:
                self.log(
                    f"SSH not ready on {ip} yet; retrying in {self.ssh_ready_delay}s "
                    f"({attempt}/{self.ssh_ready_retries})"
                )
                time.sleep(self.ssh_ready_delay)
        raise RuntimeError(f"Unable to authenticate to {ip} with users: {', '.join(self.ssh_candidates)}")

    def remote(self, ip, cmd, check=True):
        ordered = [self.active_ssh_user] + [u for u in self.ssh_candidates if u != self.active_ssh_user]
        last = None
        for user in ordered:
            proc = self._ssh(user, ip, cmd, check=False)
            if proc.returncode == 0:
                self.active_ssh_user = user
                return proc
            if proc.returncode != 255:
                last = proc
                break
            last = proc
        if check:
            stdout = (last.stdout or "").strip()
            stderr = (last.stderr or "").strip()
            raise RuntimeError(f"Remote command failed on {ip}: {cmd}\n{stdout}\n{stderr}")
        return last

    def prepare_known_hosts(self):
        ssh_dir = Path.home() / ".ssh"
        ssh_dir.mkdir(parents=True, exist_ok=True)
        (ssh_dir / "known_hosts").touch()
        run_local(["chmod", "700", str(ssh_dir)])
        run_local(["chmod", "600", str(ssh_dir / "known_hosts")])
        for ip in self.node_ips.values():
            run_local(["ssh-keygen", "-R", ip], check=False)
            run_local(f"ssh-keyscan -H {shlex.quote(ip)} >> {shlex.quote(str(ssh_dir / 'known_hosts'))}", check=False)

    def prepare_remote_nix(self, ip):
        self.remote(ip, "sudo mkdir -p /etc/nix")
        self.remote(ip, "if [ -f /etc/nix/nix.conf ]; then sudo sed -i '/^trusted-users[[:space:]]*=/d' /etc/nix/nix.conf; fi")
        self.remote(ip, "echo 'trusted-users = root micqdf' | sudo tee -a /etc/nix/nix.conf >/dev/null")
        self.remote(ip, "sudo systemctl restart nix-daemon 2>/dev/null || true")

    def prepare_remote_kubelet(self, ip):
        self.remote(ip, "sudo systemctl stop kubelet >/dev/null 2>&1 || true")
        self.remote(ip, "sudo systemctl disable kubelet >/dev/null 2>&1 || true")
        self.remote(ip, "sudo systemctl mask kubelet >/dev/null 2>&1 || true")
        self.remote(ip, "sudo systemctl reset-failed kubelet >/dev/null 2>&1 || true")
        self.remote(ip, "sudo rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env || true")

    def prepare_remote_space(self, ip):
        self.remote(ip, "sudo nix-collect-garbage -d || true")
        self.remote(ip, "sudo nix --extra-experimental-features nix-command store gc || true")
        self.remote(ip, "sudo rm -rf /tmp/nix* /tmp/nixos-rebuild* || true")

    def rebuild_node_once(self, name, ip):
        self.detect_user(ip)
        cmd = [
            "timeout",
            self.rebuild_timeout,
            "nixos-rebuild",
            "switch",
            "--flake",
            f"{self.flake_dir}#{name}",
            "--target-host",
            f"{self.active_ssh_user}@{ip}",
            "--use-remote-sudo",
        ]
        env = os.environ.copy()
        env["NIX_SSHOPTS"] = " ".join(self.ssh_opts)
        proc = subprocess.run(cmd, text=True, env=env)
        return proc.returncode == 0

    def rebuild_with_retry(self, name, ip):
        max_attempts = self.rebuild_retries + 1
        for attempt in range(1, max_attempts + 1):
            self.log(f"Rebuild attempt {attempt}/{max_attempts} for {name}")
            if self.rebuild_node_once(name, ip):
                return
            if attempt < max_attempts:
                self.log(f"Rebuild failed for {name}, retrying in 20s")
                time.sleep(20)
        raise RuntimeError(f"Rebuild failed permanently for {name}")

    def stage_preflight(self):
        self.prepare_known_hosts()
        self.detect_user(self.primary_ip)

    def stage_rebuild(self):
        if self.skip_rebuild:
            self.log("Node rebuild already complete")
            return
        self.detect_user(self.primary_ip)
        for name in self.cp_names:
            ip = self.node_ips[name]
            self.log(f"Preparing and rebuilding {name} ({ip})")
            self.prepare_remote_nix(ip)
            self.prepare_remote_kubelet(ip)
            if self.fast_mode != "1":
                self.prepare_remote_space(ip)
            self.rebuild_with_retry(name, ip)
        for name in self.wk_names:
            ip = self.node_ips[name]
            self.log(f"Preparing {name} ({ip})")
            self.prepare_remote_nix(ip)
            self.prepare_remote_kubelet(ip)
            if self.fast_mode != "1":
                self.prepare_remote_space(ip)
        failures = []
        with ThreadPoolExecutor(max_workers=self.worker_parallelism) as pool:
            futures = {pool.submit(self.rebuild_with_retry, name, self.node_ips[name]): name for name in self.wk_names}
            for fut in as_completed(futures):
                name = futures[fut]
                try:
                    fut.result()
                except Exception as exc:
                    failures.append((name, str(exc)))
        if failures:
            raise RuntimeError(f"Worker rebuild failures: {failures}")

    def has_admin_conf(self):
        return self.remote(self.primary_ip, "sudo test -f /etc/kubernetes/admin.conf", check=False).returncode == 0

    def cluster_ready(self):
        cmd = "sudo test -f /etc/kubernetes/admin.conf && sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get --raw=/readyz >/dev/null 2>&1"
        return self.remote(self.primary_ip, cmd, check=False).returncode == 0

    def stage_init_primary(self):
        self.log(f"Initializing primary control plane on {self.primary_cp}")
        self.remote(self.primary_ip, "sudo th-kubeadm-init")

    def stage_install_cni(self):
        self.log("Installing Flannel")
        manifest_path = self.script_dir.parent / "manifests" / "kube-flannel.yml"
        manifest_b64 = base64.b64encode(manifest_path.read_bytes()).decode()
        self.remote(
            self.primary_ip,
            (
                "sudo mkdir -p /var/lib/terrahome && "
                f"echo {shlex.quote(manifest_b64)} | base64 -d | sudo tee /var/lib/terrahome/kube-flannel.yml >/dev/null"
            ),
        )
        self.log("Waiting for API readiness before applying Flannel")
        ready = False
        for _ in range(30):
            if self.cluster_ready():
                ready = True
                break
            time.sleep(10)
        if not ready:
            raise RuntimeError("API server did not become ready before Flannel install")
        last_error = None
        for attempt in range(1, 6):
            proc = self.remote(
                self.primary_ip,
                "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f /var/lib/terrahome/kube-flannel.yml",
                check=False,
            )
            if proc.returncode == 0:
                return
            last_error = (proc.stdout or "") + ("\n" if proc.stdout and proc.stderr else "") + (proc.stderr or "")
            self.log(f"Flannel apply attempt {attempt}/5 failed; retrying in 15s")
            time.sleep(15)
        raise RuntimeError(f"Flannel apply failed after retries\n{last_error or ''}")

    def cluster_has_node(self, name):
        cmd = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node {shlex.quote(name)} >/dev/null 2>&1"
        return self.remote(self.primary_ip, cmd, check=False).returncode == 0

    def build_join_cmds(self):
        join_cmd = self.remote(
            self.primary_ip,
            "sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm token create --print-join-command",
        ).stdout.strip()
        cert_key = self.remote(
            self.primary_ip,
            "sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm init phase upload-certs --upload-certs | tail -n 1",
        ).stdout.strip()
        cp_join = f"{join_cmd} --control-plane --certificate-key {cert_key}"
        return join_cmd, cp_join

    def stage_join_control_planes(self):
        _, cp_join = self.build_join_cmds()
        for node in self.cp_names:
            if node == self.primary_cp:
                continue
            if self.cluster_has_node(node):
                self.log(f"{node} already joined")
                continue
            self.log(f"Joining control plane {node}")
            ip = self.node_ips[node]
            node_join = f"{cp_join} --node-name {node} --ignore-preflight-errors=NumCPU,HTTPProxyCIDR"
            self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")

    def stage_join_workers(self):
        join_cmd, _ = self.build_join_cmds()
        for node in self.wk_names:
            if self.cluster_has_node(node):
                self.log(f"{node} already joined")
                continue
            self.log(f"Joining worker {node}")
            ip = self.node_ips[node]
            node_join = f"{join_cmd} --node-name {node} --ignore-preflight-errors=HTTPProxyCIDR"
            self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}")

    def stage_verify(self):
        self.log("Final node verification")
        try:
            self.remote(
                self.primary_ip,
                "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel rollout status ds/kube-flannel-ds --timeout=10m",
            )
        except Exception:
            self.log("Flannel rollout failed; collecting diagnostics")
            proc = self.remote(
                self.primary_ip,
                "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get ds -o wide || true",
                check=False,
            )
            print(proc.stdout)
            proc = self.remote(
                self.primary_ip,
                "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o wide || true",
                check=False,
            )
            print(proc.stdout)
            proc = self.remote(
                self.primary_ip,
                "for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do echo \"--- describe $p ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel describe $p || true; done",
                check=False,
            )
            print(proc.stdout)
            proc = self.remote(
                self.primary_ip,
                "for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do echo \"--- logs $p kube-flannel ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c kube-flannel --tail=120 || true; echo \"--- logs $p install-cni-plugin ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c install-cni-plugin --tail=120 || true; echo \"--- logs $p install-cni ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c install-cni --tail=120 || true; done",
                check=False,
            )
            print(proc.stdout)
            proc = self.remote(
                self.primary_ip,
                "for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs --tail=120 $p || true; done",
                check=False,
            )
            print(proc.stdout)
            raise
        self.remote(
            self.primary_ip,
            "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf wait --for=condition=Ready nodes --all --timeout=10m",
        )
        proc = self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes -o wide")
        print(proc.stdout)

    def reconcile(self):
        self.stage_preflight()
        self.stage_rebuild()
        self.stage_init_primary()
        self.stage_install_cni()
        self.stage_join_control_planes()
        self.stage_join_workers()
        self.stage_verify()


def main():
    parser = argparse.ArgumentParser(description="TerraHome kubeadm bootstrap controller")
    parser.add_argument("command", choices=[
        "reconcile",
        "preflight",
        "rebuild",
        "init-primary",
        "install-cni",
        "join-control-planes",
        "join-workers",
        "verify",
    ])
    parser.add_argument("--inventory", default=str(Path(__file__).resolve().parent.parent / "scripts" / "inventory.env"))
    args = parser.parse_args()
    cfg = load_inventory(args.inventory)
    ctl = Controller(cfg)
    dispatch = {
        "reconcile": ctl.reconcile,
        "preflight": ctl.stage_preflight,
        "rebuild": ctl.stage_rebuild,
        "init-primary": ctl.stage_init_primary,
        "install-cni": ctl.stage_install_cni,
        "join-control-planes": ctl.stage_join_control_planes,
        "join-workers": ctl.stage_join_workers,
        "verify": ctl.stage_verify,
    }
    try:
        dispatch[args.command]()
    except Exception as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -1,27 +0,0 @@
{
  "nodes": {
    "nixpkgs": {
      "locked": {
        "lastModified": 1767313136,
        "narHash": "sha256-16KkgfdYqjaeRGBaYsNrhPRRENs0qzkQVUooNHtoy2w=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "ac62194c3917d5f474c1a844b6fd6da2db95077d",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-25.05",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}

View File

@@ -1,77 +0,0 @@
{
  description = "NixOS kubeadm cluster configs";
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
  };
  outputs = { nixpkgs, ... }:
    let
      system = "x86_64-linux";
      lib = nixpkgs.lib;
      pkgs = nixpkgs.legacyPackages.${system};
      nodeNames = [ "cp-1" "cp-2" "cp-3" "wk-1" "wk-2" "wk-3" ];
      mkNode = {
        name,
        role,
        extraModules ? [ ],
      }:
        let
          roleModule = if role == "control-plane" then ./modules/k8s-control-plane.nix else ./modules/k8s-worker.nix;
          hardwarePath = ./hosts/hardware + "/${name}.nix";
        in
        nixpkgs.lib.nixosSystem {
          inherit system;
          modules = [
            ./modules/k8s-cluster-settings.nix
            ./modules/k8s-common.nix
            roleModule
            ({ lib, ... }: {
              imports = lib.optional (builtins.pathExists hardwarePath) hardwarePath;
              networking.hostName = name;
              system.stateVersion = "25.05";
              boot.loader.grub.devices = lib.mkDefault [ "/dev/sda" ];
              fileSystems."/" = lib.mkDefault {
                device = "/dev/disk/by-label/nixos";
                fsType = "ext4";
              };
            })
          ] ++ extraModules;
        };
      mkNodeByName = name:
        mkNode {
          inherit name;
          role = if lib.hasPrefix "cp-" name then "control-plane" else "worker";
        };
      mkEvalCheck = name:
        let
          cfg = mkNode {
            inherit name;
            role = if lib.hasPrefix "cp-" name then "control-plane" else "worker";
            extraModules = [
              ({ lib, ... }: {
                boot.loader.grub.devices = lib.mkDefault [ "/dev/sda" ];
                fileSystems."/" = lib.mkDefault {
                  device = "/dev/disk/by-label/nixos";
                  fsType = "ext4";
                };
              })
            ];
          };
        in
        pkgs.runCommand "eval-${name}" { } ''
          cat > "$out" <<'EOF'
          host=${cfg.config.networking.hostName}
          role=${if lib.hasPrefix "cp-" name then "control-plane" else "worker"}
          stateVersion=${cfg.config.system.stateVersion}
          EOF
        '';
    in {
      nixosConfigurations = lib.genAttrs nodeNames mkNodeByName;
      checks.${system} = lib.genAttrs nodeNames mkEvalCheck;
    };
}

View File

@@ -1,212 +0,0 @@
---
kind: Namespace
apiVersion: v1
metadata:
  name: kube-flannel
  labels:
    k8s-app: flannel
    pod-security.kubernetes.io/enforce: privileged
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  labels:
    k8s-app: flannel
  name: flannel
rules:
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - patch
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  labels:
    k8s-app: flannel
  name: flannel
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: flannel
subjects:
  - kind: ServiceAccount
    name: flannel
    namespace: kube-flannel
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    k8s-app: flannel
  name: flannel
  namespace: kube-flannel
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: kube-flannel-cfg
  namespace: kube-flannel
  labels:
    tier: node
    k8s-app: flannel
    app: flannel
data:
  cni-conf.json: |
    {
      "name": "cbr0",
      "cniVersion": "0.3.1",
      "plugins": [
        {
          "type": "flannel",
          "delegate": {
            "hairpinMode": true,
            "isDefaultGateway": true
          }
        },
        {
          "type": "portmap",
          "capabilities": {
            "portMappings": true
          }
        }
      ]
    }
  net-conf.json: |
    {
      "Network": "10.244.0.0/16",
      "EnableNFTables": false,
      "Backend": {
        "Type": "vxlan"
      }
    }
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kube-flannel-ds
  namespace: kube-flannel
  labels:
    tier: node
    app: flannel
    k8s-app: flannel
spec:
  selector:
    matchLabels:
      app: flannel
  template:
    metadata:
      labels:
        tier: node
        app: flannel
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/os
                    operator: In
                    values:
                      - linux
      hostNetwork: true
      priorityClassName: system-node-critical
      tolerations:
        - operator: Exists
          effect: NoSchedule
      serviceAccountName: flannel
      initContainers:
        - name: install-cni-plugin
          image: docker.io/flannel/flannel-cni-plugin:v1.5.1-flannel1
          command:
            - cp
          args:
            - -f
            - /flannel
            - /opt/cni/bin/flannel
          volumeMounts:
            - name: cni-plugin
              mountPath: /opt/cni/bin
        - name: install-cni
          image: docker.io/flannel/flannel:v0.25.5
          command:
            - cp
          args:
            - -f
            - /etc/kube-flannel/cni-conf.json
            - /etc/cni/net.d/10-flannel.conflist
          volumeMounts:
            - name: cni
              mountPath: /etc/cni/net.d
            - name: flannel-cfg
              mountPath: /etc/kube-flannel/
      containers:
        - name: kube-flannel
          image: docker.io/flannel/flannel:v0.25.5
          command:
            - /opt/bin/flanneld
          args:
            - --ip-masq
            - --kube-subnet-mgr
          resources:
            requests:
              cpu: "100m"
              memory: "50Mi"
          securityContext:
            privileged: false
            capabilities:
              add: ["NET_ADMIN", "NET_RAW"]
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: EVENT_QUEUE_DEPTH
              value: "5000"
          volumeMounts:
            - name: run
              mountPath: /run/flannel
            - name: flannel-cfg
              mountPath: /etc/kube-flannel/
            - name: xtables-lock
              mountPath: /run/xtables.lock
      volumes:
        - name: run
          hostPath:
            path: /run/flannel
            type: DirectoryOrCreate
        - name: cni-plugin
          hostPath:
            path: /opt/cni/bin
            type: DirectoryOrCreate
        - name: cni
          hostPath:
            path: /etc/cni/net.d
            type: DirectoryOrCreate
        - name: flannel-cfg
          configMap:
            name: kube-flannel-cfg
        - name: xtables-lock
          hostPath:
            path: /run/xtables.lock
            type: FileOrCreate

View File

@@ -1,12 +0,0 @@
{ ... }:
{
  terrahome.kubeadm = {
    k8sMinor = "1.31";
    controlPlaneInterface = "eth0";
    controlPlaneVipSuffix = 250;
    podSubnet = "10.244.0.0/16";
    serviceSubnet = "10.96.0.0/12";
    clusterDomain = "cluster.local";
  };
}

View File

@@ -1,420 +0,0 @@
{ config, lib, pkgs, ... }:
let
pinnedK8s = lib.attrByPath [ "kubernetes_1_31" ] pkgs.kubernetes pkgs;
kubeVipImage = "ghcr.io/kube-vip/kube-vip:v0.8.9";
in
{
options.terrahome.kubeadm = {
k8sMinor = lib.mkOption {
type = lib.types.str;
default = "1.31";
};
controlPlaneInterface = lib.mkOption {
type = lib.types.str;
default = "eth0";
};
controlPlaneVipSuffix = lib.mkOption {
type = lib.types.int;
default = 250;
};
podSubnet = lib.mkOption {
type = lib.types.str;
default = "10.244.0.0/16";
};
serviceSubnet = lib.mkOption {
type = lib.types.str;
default = "10.96.0.0/12";
};
clusterDomain = lib.mkOption {
type = lib.types.str;
default = "cluster.local";
};
};
config = {
boot.kernelModules = [ "overlay" "br_netfilter" ];
boot.kernel.sysctl = {
"net.ipv4.ip_forward" = 1;
"net.bridge.bridge-nf-call-iptables" = 1;
"net.bridge.bridge-nf-call-ip6tables" = 1;
};
virtualisation.containerd.enable = true;
virtualisation.containerd.settings = {
plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options.SystemdCgroup = true;
};
swapDevices = lib.mkForce [ ];
services.openssh.enable = true;
services.openssh.settings = {
PasswordAuthentication = false;
KbdInteractiveAuthentication = false;
};
users.users.micqdf = {
isNormalUser = true;
extraGroups = [ "wheel" ];
};
security.sudo.wheelNeedsPassword = false;
nix.settings.trusted-users = [ "root" "micqdf" ];
nix.gc = {
automatic = true;
dates = "daily";
options = "--delete-older-than 3d";
};
nix.settings.auto-optimise-store = true;
environment.variables = {
KUBECONFIG = "/etc/kubernetes/admin.conf";
KUBE_VIP_IMAGE = kubeVipImage;
};
environment.systemPackages = (with pkgs; [
containerd
cri-tools
cni-plugins
pinnedK8s
kubernetes-helm
conntrack-tools
socat
ethtool
ipvsadm
iproute2
iptables
ebtables
jq
curl
vim
gawk
]) ++ [
(pkgs.writeShellScriptBin "th-kubeadm-init" ''
set -euo pipefail
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
iface="${config.terrahome.kubeadm.controlPlaneInterface}"
if ! ip link show "$iface" >/dev/null 2>&1; then
iface="$(ip -o -4 route show to default | awk 'NR==1 {print $5}')"
fi
if [ -z "''${iface:-}" ]; then
echo "Could not determine network interface for kube-vip"
exit 1
fi
suffix="${toString config.terrahome.kubeadm.controlPlaneVipSuffix}"
pod_subnet="${config.terrahome.kubeadm.podSubnet}"
service_subnet="${config.terrahome.kubeadm.serviceSubnet}"
domain="${config.terrahome.kubeadm.clusterDomain}"
node_name="${config.networking.hostName}"
local_ip_cidr=$(ip -4 -o addr show dev "$iface" | awk 'NR==1 {print $4}')
if [ -z "''${local_ip_cidr:-}" ]; then
echo "Could not determine IPv4 CIDR on interface $iface"
exit 1
fi
subnet_prefix=$(echo "$local_ip_cidr" | cut -d/ -f1 | awk -F. '{print $1"."$2"."$3}')
vip="$subnet_prefix.$suffix"
echo "Using control-plane endpoint: $vip:6443"
echo "Using kube-vip interface: $iface"
echo "Using kubeadm node name: $node_name"
hostname "$node_name" || true
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
systemctl unmask kubelet || true
systemctl stop kubelet || true
systemctl reset-failed kubelet || true
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
rm -rf /var/lib/kubelet/pki
systemctl daemon-reload
systemctl unmask kubelet || true
systemctl enable kubelet || true
echo "==> Ensuring containerd is running"
systemctl start containerd || true
sleep 2
if ! systemctl is-active containerd; then
echo "ERROR: containerd not running"
journalctl -xeu containerd --no-pager -n 30
exit 1
fi
mkdir -p /etc/kubernetes/manifests
mkdir -p /tmp/kubeadm
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
apiVersion: kubeadm.k8s.io/v1beta4
kind: InitConfiguration
nodeRegistration:
name: "KUBEADM_NODE_NAME"
criSocket: unix:///run/containerd/containerd.sock
kubeletExtraArgs:
- name: hostname-override
value: "KUBEADM_NODE_NAME"
---
apiVersion: kubeadm.k8s.io/v1beta4
kind: ClusterConfiguration
controlPlaneEndpoint: "KUBEADM_ENDPOINT"
networking:
podSubnet: "KUBEADM_POD_SUBNET"
serviceSubnet: "KUBEADM_SERVICE_SUBNET"
dnsDomain: "KUBEADM_DNS_DOMAIN"
KUBEADMCONFIG
sed -i "s|KUBEADM_ENDPOINT|$vip:6443|g" /tmp/kubeadm/init-config.yaml
sed -i "s|KUBEADM_POD_SUBNET|$pod_subnet|g" /tmp/kubeadm/init-config.yaml
sed -i "s|KUBEADM_SERVICE_SUBNET|$service_subnet|g" /tmp/kubeadm/init-config.yaml
sed -i "s|KUBEADM_DNS_DOMAIN|$domain|g" /tmp/kubeadm/init-config.yaml
sed -i "s|KUBEADM_NODE_NAME|$node_name|g" /tmp/kubeadm/init-config.yaml
echo "==> Pre-pulling kubeadm images"
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
echo "==> Creating kube-vip static pod manifest"
ctr image pull "${kubeVipImage}"
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
--log 4 \
--interface "$iface" \
--address "$vip" \
--controlplane \
--arp \
> /etc/kubernetes/manifests/kube-vip.yaml
# kube-vip bootstrap workaround for Kubernetes >=1.29.
# During early kubeadm phases, super-admin.conf is available before admin.conf is fully usable.
sed -i 's#path: /etc/kubernetes/admin.conf#path: /etc/kubernetes/super-admin.conf#' /etc/kubernetes/manifests/kube-vip.yaml || true
echo "==> kube-vip manifest kubeconfig mount"
grep -E 'mountPath:|path:' /etc/kubernetes/manifests/kube-vip.yaml | grep -E 'kubernetes/(admin|super-admin)\.conf' || true
KUBEADM_INIT_LOG=/tmp/kubeadm-init.log
if ! env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
--config /tmp/kubeadm/init-config.yaml \
--upload-certs \
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 2>&1 | tee "$KUBEADM_INIT_LOG"; then
if grep -q "error writing CRISocket for this node: nodes" "$KUBEADM_INIT_LOG" && [ -f /etc/kubernetes/admin.conf ]; then
echo "==> kubeadm hit CRISocket race; waiting for node registration"
echo "==> forcing kubelet restart to pick bootstrap flags"
systemctl daemon-reload || true
systemctl restart kubelet || true
sleep 3
echo "==> kubelet bootstrap flags"
cat /var/lib/kubelet/kubeadm-flags.env || true
registered=0
for i in $(seq 1 60); do
if KUBECONFIG=/etc/kubernetes/admin.conf kubectl get node "$node_name" >/dev/null 2>&1; then
echo "==> node $node_name registered; uploading kubelet config"
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init phase upload-config kubelet --config /tmp/kubeadm/init-config.yaml
registered=1
break
fi
sleep 2
done
if [ "$registered" -ne 1 ]; then
echo "==> node $node_name did not register after kubeadm init failure"
KUBECONFIG=/etc/kubernetes/admin.conf kubectl get nodes -o wide || true
echo "==> kubelet logs (registration hints)"
journalctl -u kubelet --no-pager -n 120 | grep -Ei "register|node|bootstrap|certificate|forbidden|unauthorized|refused|x509" || true
exit 1
fi
else
echo "==> kubeadm init failed, checking pod status:"
crictl pods || true
crictl ps -a || true
echo "==> kube-vip containers:"
crictl ps -a --name kube-vip || true
echo "==> kube-vip logs:"
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
crictl inspect "$container_id" 2>/dev/null | jq -r '.status | "exitCode=\(.exitCode) reason=\(.reason // "") message=\(.message // "")"' || true
done
echo "==> Checking if VIP is bound:"
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
echo "==> kubelet logs:"
journalctl -xeu kubelet --no-pager -n 50
exit 1
fi
fi
echo "==> Waiting for kube-vip to claim VIP $vip"
for i in $(seq 1 90); do
if ip -4 addr show | grep -q "$vip"; then
echo "==> VIP $vip is bound"
break
fi
if [ "$i" -eq 90 ]; then
echo "==> ERROR: VIP not bound after 3 minutes"
crictl ps -a --name kube-vip || true
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
done
exit 1
fi
sleep 2
done
echo "==> Waiting for API server to be ready"
for i in $(seq 1 60); do
if curl -sk "https://$vip:6443/healthz" 2>/dev/null | grep -q "ok"; then
echo "==> API server is healthy"
break
fi
if [ "$i" -eq 60 ]; then
echo "==> ERROR: API server not healthy after 2 minutes"
crictl pods || true
crictl ps -a || true
exit 1
fi
sleep 2
done
# Switch kube-vip to normal admin.conf after bootstrap finishes.
sed -i 's#path: /etc/kubernetes/super-admin.conf#path: /etc/kubernetes/admin.conf#' /etc/kubernetes/manifests/kube-vip.yaml || true
mkdir -p /root/.kube
cp /etc/kubernetes/admin.conf /root/.kube/config
chmod 600 /root/.kube/config
echo
echo "Next: install Cilium, then generate join commands:"
echo " kubeadm token create --print-join-command"
echo " kubeadm token create --print-join-command --certificate-key <key>"
'')
(pkgs.writeShellScriptBin "th-kubeadm-join-control-plane" ''
set -euo pipefail
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
if [ "$#" -lt 1 ]; then
echo "Usage: th-kubeadm-join-control-plane '<kubeadm join ... --control-plane --certificate-key ...>'"
exit 1
fi
iface="${config.terrahome.kubeadm.controlPlaneInterface}"
if ! ip link show "$iface" >/dev/null 2>&1; then
iface="$(ip -o -4 route show to default | awk 'NR==1 {print $5}')"
fi
if [ -z "''${iface:-}" ]; then
echo "Could not determine network interface for kube-vip"
exit 1
fi
suffix="${toString config.terrahome.kubeadm.controlPlaneVipSuffix}"
local_ip_cidr=$(ip -4 -o addr show dev "$iface" | awk 'NR==1 {print $4}')
if [ -z "''${local_ip_cidr:-}" ]; then
echo "Could not determine IPv4 CIDR on interface $iface"
exit 1
fi
subnet_prefix=$(echo "$local_ip_cidr" | cut -d/ -f1 | awk -F. '{print $1"."$2"."$3}')
vip="$subnet_prefix.$suffix"
# Clean stale kubelet state and reset first: kubeadm reset empties
# /etc/kubernetes/manifests, so the kube-vip manifest must be rendered after it.
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
rm -rf /var/lib/kubelet/pki
systemctl unmask kubelet || true
systemctl stop kubelet || true
systemctl enable kubelet || true
systemctl reset-failed kubelet || true
systemctl daemon-reload
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
# Render the kube-vip static pod manifest (ARP mode with leader election).
mkdir -p /etc/kubernetes/manifests
ctr image pull "${kubeVipImage}"
ctr run --rm --net-host "${kubeVipImage}" kube-vip /kube-vip manifest pod \
  --log 4 \
  --interface "$iface" \
  --address "$vip" \
  --controlplane \
  --arp \
  --leaderElection \
  > /etc/kubernetes/manifests/kube-vip.yaml
eval "$1"
'')
(pkgs.writeShellScriptBin "th-kubeadm-join-worker" ''
set -euo pipefail
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
if [ "$#" -lt 1 ]; then
echo "Usage: th-kubeadm-join-worker '<kubeadm join ...>'"
exit 1
fi
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
rm -rf /var/lib/kubelet/pki
systemctl unmask kubelet || true
systemctl stop kubelet || true
systemctl enable kubelet || true
systemctl reset-failed kubelet || true
systemctl daemon-reload
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
eval "$1"
'')
(pkgs.writeShellScriptBin "th-kubeadm-status" ''
set -euo pipefail
systemctl is-active containerd || true
systemctl is-active kubelet || true
crictl info >/dev/null && echo "crictl: ok" || echo "crictl: not-ready"
'')
];
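  # Standalone kubelet unit in the spirit of kubeadm's 10-kubeadm.conf drop-in;
  # the Condition* checks below keep kubelet inert until kubeadm writes its config.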
systemd.services.kubelet = {
description = "Kubernetes Kubelet";
wantedBy = [ "multi-user.target" ];
path = [ pkgs.util-linux ];
wants = [ "network-online.target" ];
after = [ "containerd.service" "network-online.target" ];
serviceConfig = {
Environment = [
"KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
"KUBELET_KUBEADM_ARGS="
"KUBELET_EXTRA_ARGS="
];
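        # kubeadm writes kubeadm-flags.env at init/join time; systemd reads it after
        # Environment=, so it overrides the empty defaults above. The "-" prefix
        # makes a missing file non-fatal.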
EnvironmentFile = [
"-/var/lib/kubelet/kubeadm-flags.env"
"-/etc/default/kubelet"
];
ExecStart = "${pinnedK8s}/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf \$KUBELET_CONFIG_ARGS \$KUBELET_KUBEADM_ARGS \$KUBELET_EXTRA_ARGS";
Restart = "on-failure";
RestartSec = "10";
};
unitConfig = {
ConditionPathExists = "/var/lib/kubelet/config.yaml";
ConditionPathExistsGlob = "/etc/kubernetes/*kubelet.conf";
};
};
systemd.tmpfiles.rules = [
"d /etc/kubernetes 0755 root root -"
"d /etc/kubernetes/manifests 0755 root root -"
"d /etc/cni/net.d 0755 root root -"
"d /opt/cni/bin 0755 root root -"
"d /run/flannel 0755 root root -"
"d /var/lib/kubelet 0755 root root -"
"d /var/lib/kubelet/pki 0755 root root -"
];
};
}

View File

@@ -1,14 +0,0 @@
{
networking.firewall.allowedTCPPorts = [
6443
2379
2380
10250
10257
10259
];
networking.firewall.allowedUDPPorts = [
8472
];
}

View File

@@ -1,11 +0,0 @@
{
networking.firewall.allowedTCPPorts = [
10250
30000
32767
];
networking.firewall.allowedUDPPorts = [
8472
];
}

View File

@@ -1,182 +0,0 @@
#!/usr/bin/env python3
import concurrent.futures
import ipaddress
import json
import os
import subprocess
import sys
from typing import Dict, Set, Tuple
def derive_prefix(payload: dict) -> str:
explicit = os.environ.get("KUBEADM_SUBNET_PREFIX", "").strip()
if explicit:
return explicit
for key in ("control_plane_vm_ipv4", "worker_vm_ipv4"):
values = payload.get(key, {}).get("value", {})
for ip in values.values():
if ip:
parts = ip.split(".")
if len(parts) == 4:
return ".".join(parts[:3])
return "10.27.27"
def ssh_probe(ip: str, users: list[str], key_path: str, timeout_sec: int) -> Tuple[str, str, str] | None:
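    """Probe one IP: try each candidate SSH user; return (hostname, ip, dmi_serial) on first success, else None."""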
cmd_tail = [
"-o",
"BatchMode=yes",
"-o",
"IdentitiesOnly=yes",
"-o",
"StrictHostKeyChecking=accept-new",
"-o",
f"ConnectTimeout={timeout_sec}",
"-i",
key_path,
]
for user in users:
cmd = [
"ssh",
*cmd_tail,
f"{user}@{ip}",
"hn=$(hostnamectl --static 2>/dev/null || hostname); serial=$(cat /sys/class/dmi/id/product_serial 2>/dev/null || true); printf '%s|%s\n' \"$hn\" \"$serial\"",
]
try:
out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True, timeout=timeout_sec + 2).strip()
except Exception:
continue
if out:
line = out.splitlines()[0].strip()
if "|" in line:
host, serial = line.split("|", 1)
else:
host, serial = line, ""
return host.strip(), ip, serial.strip()
return None
def build_inventory(names: Set[str], found: Dict[str, str], ssh_user: str) -> str:
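    """Render inventory.env text, e.g. CONTROL_PLANES="cp-1=10.27.27.50 cp-2=10.27.27.51 ..." (addresses illustrative)."""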
cp = sorted([n for n in names if n.startswith("cp-")], key=lambda x: int(x.split("-")[1]))
wk = sorted([n for n in names if n.startswith("wk-")], key=lambda x: int(x.split("-")[1]))
cp_pairs = " ".join(f"{n}={found[n]}" for n in cp)
wk_pairs = " ".join(f"{n}={found[n]}" for n in wk)
primary = cp[0] if cp else "cp-1"
return "\n".join(
[
f"SSH_USER={ssh_user}",
f"PRIMARY_CONTROL_PLANE={primary}",
f'CONTROL_PLANES="{cp_pairs}"',
f'WORKERS="{wk_pairs}"',
"",
]
)
def main() -> int:
payload = json.load(sys.stdin)
cp_names = set(payload.get("control_plane_vm_ids", {}).get("value", {}).keys())
wk_names = set(payload.get("worker_vm_ids", {}).get("value", {}).keys())
target_names = cp_names | wk_names
if not target_names:
raise SystemExit("Could not determine target node names from Terraform outputs")
ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
users = [u for u in os.environ.get("SSH_USER_CANDIDATES", f"{ssh_user} root").split() if u]
key_path = os.environ.get("SSH_KEY_PATH", os.path.expanduser("~/.ssh/id_ed25519"))
timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "6"))
max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "32"))
prefix = derive_prefix(payload)
start = int(os.environ.get("KUBEADM_SUBNET_START", "2"))
end = int(os.environ.get("KUBEADM_SUBNET_END", "254"))
vip_suffix = int(os.environ.get("KUBEADM_CONTROL_PLANE_VIP_SUFFIX", "250"))
def is_vip_ip(ip: str) -> bool:
try:
return int(ip.split(".")[-1]) == vip_suffix
except Exception:
return False
scan_ips = [
str(ipaddress.IPv4Address(f"{prefix}.{i}"))
for i in range(start, end + 1)
if i != vip_suffix
]
found: Dict[str, str] = {}
vmid_to_name: Dict[str, str] = {}
for name, vmid in payload.get("control_plane_vm_ids", {}).get("value", {}).items():
vmid_to_name[str(vmid)] = name
for name, vmid in payload.get("worker_vm_ids", {}).get("value", {}).items():
vmid_to_name[str(vmid)] = name
seen_hostnames: Dict[str, str] = {}
seen_ips: Dict[str, Tuple[str, str]] = {}
def run_pass(pass_timeout: int, pass_workers: int) -> None:
with concurrent.futures.ThreadPoolExecutor(max_workers=pass_workers) as pool:
futures = [pool.submit(ssh_probe, ip, users, key_path, pass_timeout) for ip in scan_ips]
for fut in concurrent.futures.as_completed(futures):
result = fut.result()
if not result:
continue
host, ip, serial = result
if host not in seen_hostnames:
seen_hostnames[host] = ip
if ip not in seen_ips:
seen_ips[ip] = (host, serial)
target = None
if serial in vmid_to_name:
inferred = vmid_to_name[serial]
target = inferred
elif host in target_names:
target = host
if target:
existing = found.get(target)
if existing is None or (is_vip_ip(existing) and not is_vip_ip(ip)):
found[target] = ip
if all(name in found for name in target_names):
return
run_pass(timeout_sec, max_workers)
if not all(name in found for name in target_names):
# Slower second pass for busy runners/networks.
run_pass(max(timeout_sec + 2, 8), max(8, max_workers // 2))
# Heuristic fallback: if nodes still missing, assign from remaining SSH-reachable
# IPs not already used, ordered by IP. This helps when cloned nodes temporarily
# share a generic hostname (e.g. "flex") and DMI serial mapping is unavailable.
missing = sorted([n for n in target_names if n not in found])
if missing:
used_ips = set(found.values())
candidates = sorted(ip for ip in seen_ips.keys() if ip not in used_ips)
if len(candidates) >= len(missing):
for name, ip in zip(missing, candidates):
found[name] = ip
missing = sorted([n for n in target_names if n not in found])
if missing:
discovered = ", ".join(sorted(seen_hostnames.keys())[:20])
if discovered:
sys.stderr.write(f"Discovered hostnames during scan: {discovered}\n")
if seen_ips:
sample = ", ".join(f"{ip}={meta[0]}" for ip, meta in list(sorted(seen_ips.items()))[:20])
sys.stderr.write(f"SSH-reachable IPs: {sample}\n")
raise SystemExit(
"Failed SSH-based IP discovery for nodes: " + ", ".join(missing) +
f" (scanned {prefix}.{start}-{prefix}.{end})"
)
sys.stdout.write(build_inventory(target_names, found, ssh_user))
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,7 +0,0 @@
SSH_USER=micqdf
PRIMARY_CONTROL_PLANE=cp-1
# Name=IP pairs (space-separated)
CONTROL_PLANES="cp-1=192.168.1.101 cp-2=192.168.1.102 cp-3=192.168.1.103"
WORKERS="wk-1=192.168.1.111 wk-2=192.168.1.112 wk-3=192.168.1.113"

View File

@@ -1,14 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
CONTROLLER="$SCRIPT_DIR/../bootstrap/controller.py"
if [ ! -f "$INVENTORY_FILE" ]; then
echo "Missing inventory file: $INVENTORY_FILE"
echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings."
exit 1
fi
python3 "$CONTROLLER" reconcile --inventory "$INVENTORY_FILE"

View File

@@ -1,65 +0,0 @@
#!/usr/bin/env python3
import json
import os
import re
import sys
def natural_key(name: str):
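    """Sort key so names order numerically: cp-2 before cp-10."""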
m = re.match(r"^([a-zA-Z-]+)-(\d+)$", name)
if m:
return (m.group(1), int(m.group(2)))
return (name, 0)
def map_to_pairs(items: dict[str, str]) -> str:
ordered = sorted(items.items(), key=lambda kv: natural_key(kv[0]))
return " ".join(f"{k}={v}" for k, v in ordered)
def require_non_empty_ips(label: str, items: dict[str, str]) -> dict[str, str]:
cleaned: dict[str, str] = {}
missing: list[str] = []
for name, ip in items.items():
ip_value = (ip or "").strip()
if not ip_value:
missing.append(name)
continue
cleaned[name] = ip_value
if missing:
names = ", ".join(sorted(missing, key=natural_key))
raise SystemExit(
f"Missing IPv4 addresses for {label}: {names}. "
"Terraform outputs are present but empty. "
"This usually means Proxmox guest IP discovery is unavailable for these VMs yet."
)
return cleaned
def main() -> int:
payload = json.load(sys.stdin)
cp_map = payload.get("control_plane_vm_ipv4", {}).get("value", {})
wk_map = payload.get("worker_vm_ipv4", {}).get("value", {})
if not cp_map or not wk_map:
raise SystemExit("Missing control_plane_vm_ipv4 or worker_vm_ipv4 in terraform output")
cp_map = require_non_empty_ips("control planes", cp_map)
wk_map = require_non_empty_ips("workers", wk_map)
ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
print(f"SSH_USER={ssh_user}")
print("PRIMARY_CONTROL_PLANE=cp-1")
print(f"CONTROL_PLANES=\"{map_to_pairs(cp_map)}\"")
print(f"WORKERS=\"{map_to_pairs(wk_map)}\"")
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,106 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
if [ ! -f "$INVENTORY_FILE" ]; then
echo "Missing inventory file: $INVENTORY_FILE"
echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings."
exit 1
fi
# shellcheck disable=SC1090
source "$INVENTORY_FILE"
SSH_USER="${SSH_USER:-micqdf}"
SSH_KEY_PATH="${SSH_KEY_PATH:-$HOME/.ssh/id_ed25519}"
SSH_OPTS="${SSH_OPTS:--o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
SSH_USER_CANDIDATES="${SSH_USER_CANDIDATES:-root $SSH_USER}"
declare -A NODE_IPS=()
add_pair() {
local pair="$1"
local name="${pair%%=*}"
local ip="${pair#*=}"
if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then
echo "Invalid node pair '$pair' (expected name=ip)."
exit 1
fi
NODE_IPS["$name"]="$ip"
}
if [ -n "${CONTROL_PLANES:-}" ]; then
for pair in $CONTROL_PLANES; do
add_pair "$pair"
done
else
while IFS= read -r var_name; do
idx="${var_name#CP_}"
add_pair "cp-$idx=${!var_name}"
done < <(compgen -A variable | grep -E '^CP_[0-9]+$' | sort -V)
fi
if [ -n "${WORKERS:-}" ]; then
for pair in $WORKERS; do
add_pair "$pair"
done
else
while IFS= read -r var_name; do
idx="${var_name#WK_}"
add_pair "wk-$idx=${!var_name}"
done < <(compgen -A variable | grep -E '^WK_[0-9]+$' | sort -V)
fi
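# The compgen fallbacks above accept flat variables instead of the paired lists,
# e.g. CP_1=10.27.27.50 or WK_2=10.27.27.46 (hypothetical addresses).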
if [ "${#NODE_IPS[@]}" -eq 0 ]; then
echo "No nodes found in inventory."
exit 1
fi
detect_ssh_user() {
local probe_ip="$1"
local candidate
for candidate in $SSH_USER_CANDIDATES; do
if ssh $SSH_OPTS "$candidate@$probe_ip" "true" >/dev/null 2>&1; then
ACTIVE_SSH_USER="$candidate"
echo "==> Using SSH user '$ACTIVE_SSH_USER'"
return 0
fi
done
echo "Unable to authenticate to $probe_ip with candidates: $SSH_USER_CANDIDATES"
return 1
}
mkdir -p "$HOME/.ssh"
chmod 700 "$HOME/.ssh"
touch "$HOME/.ssh/known_hosts"
chmod 600 "$HOME/.ssh/known_hosts"
for node_name in "${!NODE_IPS[@]}"; do
ssh-keygen -R "${NODE_IPS[$node_name]}" >/dev/null 2>&1 || true
ssh-keyscan -H "${NODE_IPS[$node_name]}" >> "$HOME/.ssh/known_hosts" 2>/dev/null || true
done
reset_node() {
local node_name="$1"
local node_ip="$2"
echo "==> Resetting $node_name ($node_ip)"
local cmd="sudo kubeadm reset -f && sudo systemctl stop kubelet && sudo rm -rf /etc/kubernetes /var/lib/etcd /var/lib/cni /etc/cni/net.d"
local quoted_cmd
quoted_cmd="$(printf '%q' "$cmd")"
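  # %q-quote the whole command so it survives word-splitting inside the remote "bash -lc".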
ssh $SSH_OPTS "$ACTIVE_SSH_USER@$node_ip" "bash -lc $quoted_cmd"
}
FIRST_NODE_IP="${NODE_IPS[$(printf '%s\n' "${!NODE_IPS[@]}" | sort -V | head -n1)]}"
ACTIVE_SSH_USER="$SSH_USER"
detect_ssh_user "$FIRST_NODE_IP"
while IFS= read -r node_name; do
reset_node "$node_name" "${NODE_IPS[$node_name]}"
done < <(printf '%s\n' "${!NODE_IPS[@]}" | sort -V)
echo "Cluster components reset on all listed nodes."

View File

@@ -1,27 +0,0 @@
# NixOS Proxmox k8s-base Template
This folder contains a Kubernetes-ready NixOS base config for your Proxmox
template VM build.
## Files
- `flake.nix`: pins `nixos-25.05` and exposes one host config.
- `configuration.nix`: k8s-base settings for Proxmox guests.
## Before first apply
1. Add `hardware-configuration.nix` from the VM install:
- `nixos-generate-config --root /`
- copy `/etc/nixos/hardware-configuration.nix` next to `configuration.nix`
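For example, a minimal sketch (assuming a default install, run from this folder inside the VM):
```bash
sudo nixos-generate-config --root /
sudo cp /etc/nixos/hardware-configuration.nix ./hardware-configuration.nix
```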
## Build/apply example inside the VM
```bash
sudo nixos-rebuild switch --flake .#template
```
## Notes
- This pre-installs heavy shared Kubernetes dependencies (containerd + kube tools)
to reduce per-node bootstrap time.
- Cloud-init still injects the runtime SSH key and per-node hostname/IP.

View File

@@ -1,99 +0,0 @@
{ lib, pkgs, ... }:
let
pinnedK8s = lib.attrByPath [ "kubernetes_1_31" ] pkgs.kubernetes pkgs;
in
{
imports =
lib.optional (builtins.pathExists ./hardware-configuration.nix)
./hardware-configuration.nix;
networking.hostName = "k8s-base-template";
networking.useDHCP = false;
networking.useNetworkd = true;
networking.nameservers = [ "1.1.1.1" "8.8.8.8" ];
boot.loader.systemd-boot.enable = lib.mkForce false;
boot.loader.grub = {
enable = true;
device = "/dev/sda";
};
services.qemuGuest.enable = true;
services.cloud-init.enable = true;
services.cloud-init.network.enable = true;
services.openssh.enable = true;
services.openssh.settings = {
PasswordAuthentication = false;
KbdInteractiveAuthentication = false;
PermitRootLogin = "prohibit-password";
};
boot.kernelModules = [ "overlay" "br_netfilter" ];
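  # overlay/br_netfilter plus the sysctls below let iptables see bridged pod
  # traffic; kubeadm preflight checks these settings.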
boot.kernel.sysctl = {
"net.ipv4.ip_forward" = 1;
"net.bridge.bridge-nf-call-iptables" = 1;
"net.bridge.bridge-nf-call-ip6tables" = 1;
};
virtualisation.containerd.enable = true;
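  # Run runc under the systemd cgroup driver so containerd matches kubelet's default.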
virtualisation.containerd.settings = {
plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options.SystemdCgroup = true;
};
swapDevices = lib.mkForce [ ];
nix.settings = {
trusted-users = [ "root" "micqdf" ];
auto-optimise-store = true;
};
nix.gc = {
automatic = true;
dates = "daily";
options = "--delete-older-than 3d";
};
programs.fish.enable = true;
users.users.micqdf = {
isNormalUser = true;
extraGroups = [ "wheel" ];
shell = pkgs.fish;
};
security.sudo.wheelNeedsPassword = false;
environment.systemPackages = with pkgs; [
btop
cni-plugins
conntrack-tools
containerd
cri-tools
curl
dig
ebtables
ethtool
eza
fd
fzf
git
htop
iproute2
iptables
ipvsadm
jq
kubernetes-helm
pinnedK8s
ripgrep
socat
tree
unzip
vim
neovim
wget
];
system.stateVersion = "25.05";
}

View File

@@ -1,27 +0,0 @@
{
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1767313136,
"narHash": "sha256-16KkgfdYqjaeRGBaYsNrhPRRENs0qzkQVUooNHtoy2w=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "ac62194c3917d5f474c1a844b6fd6da2db95077d",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.05",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

View File

@@ -1,14 +0,0 @@
{
description = "Kubernetes-ready NixOS base template";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
};
outputs = { nixpkgs, ... }: {
nixosConfigurations.template = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
modules = [ ./configuration.nix ];
};
};
}

View File

@@ -1,24 +1,79 @@
 # This file is maintained automatically by "terraform init".
 # Manual edits may be lost in future updates.

-provider "registry.terraform.io/telmate/proxmox" {
-  version     = "3.0.2-rc07"
-  constraints = "3.0.2-rc07"
-  hashes = [
-    "h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=",
-    "zh:2ee860cd0a368b3eaa53f4a9ea46f16dab8a97929e813ea6ef55183f8112c2ca",
-    "zh:415965fd915bae2040d7f79e45f64d6e3ae61149c10114efeac1b34687d7296c",
-    "zh:6584b2055df0e32062561c615e3b6b2c291ca8c959440adda09ef3ec1e1436bd",
-    "zh:65dcfad71928e0a8dd9befc22524ed686be5020b0024dc5cca5184c7420eeb6b",
-    "zh:7253dc29bd265d33f2791ac4f779c5413f16720bb717de8e6c5fcb2c858648ea",
-    "zh:7ec8993da10a47606670f9f67cfd10719a7580641d11c7aa761121c4a2bd66fb",
-    "zh:999a3f7a9dcf517967fc537e6ec930a8172203642fb01b8e1f78f908373db210",
-    "zh:a50e6df7280eb6584a5fd2456e3f5b6df13b2ec8a7fa4605511e438e1863be42",
-    "zh:b25b329a1e42681c509d027fee0365414f0cc5062b65690cfc3386aab16132ae",
-    "zh:c028877fdb438ece48f7bc02b65bbae9ca7b7befbd260e519ccab6c0cbb39f26",
-    "zh:cf0eaa3ea9fcc6d62793637947f1b8d7c885b6ad74695ab47e134e4ff132190f",
-    "zh:d5ade3fae031cc629b7c512a7b60e46570f4c41665e88a595d7efd943dde5ab2",
-    "zh:f388c15ad1ecfc09e7361e3b98bae9b627a3a85f7b908c9f40650969c949901c",
-    "zh:f415cc6f735a3971faae6ac24034afdb9ee83373ef8de19a9631c187d5adc7db",
+provider "registry.terraform.io/hashicorp/local" {
+  version = "2.5.2"
+  hashes = [
+    "h1:JlMZD6nYqJ8sSrFfEAH0Vk/SL8WLZRmFaMUF9PJK5wM=",
+    "zh:136299545178ce281c56f36965bf91c35407c11897f7082b3b983d86cb79b511",
+    "zh:3b4486858aa9cb8163378722b642c57c529b6c64bfbfc9461d940a84cd66ebea",
+    "zh:4855ee628ead847741aa4f4fc9bed50cfdbf197f2912775dd9fe7bc43fa077c0",
+    "zh:4b8cd2583d1edcac4011caafe8afb7a95e8110a607a1d5fb87d921178074a69b",
+    "zh:52084ddaff8c8cd3f9e7bcb7ce4dc1eab00602912c96da43c29b4762dc376038",
+    "zh:71562d330d3f92d79b2952ffdda0dad167e952e46200c767dd30c6af8d7c0ed3",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:805f81ade06ff68fa8b908d31892eaed5c180ae031c77ad35f82cb7a74b97cf4",
+    "zh:8b6b3ebeaaa8e38dd04e56996abe80db9be6f4c1df75ac3cccc77642899bd464",
+    "zh:ad07750576b99248037b897de71113cc19b1a8d0bc235eb99173cc83d0de3b1b",
+    "zh:b9f1c3bfadb74068f5c205292badb0661e17ac05eb23bfe8bd809691e4583d0e",
+    "zh:cc4cbcd67414fefb111c1bf7ab0bc4beb8c0b553d01719ad17de9a047adff4d1",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/null" {
+  version = "3.2.3"
+  hashes = [
+    "h1:+AnORRgFbRO6qqcfaQyeX80W0eX3VmjadjnUFUJTiXo=",
+    "zh:22d062e5278d872fe7aed834f5577ba0a5afe34a3bdac2b81f828d8d3e6706d2",
+    "zh:23dead00493ad863729495dc212fd6c29b8293e707b055ce5ba21ee453ce552d",
+    "zh:28299accf21763ca1ca144d8f660688d7c2ad0b105b7202554ca60b02a3856d3",
+    "zh:55c9e8a9ac25a7652df8c51a8a9a422bd67d784061b1de2dc9fe6c3cb4e77f2f",
+    "zh:756586535d11698a216291c06b9ed8a5cc6a4ec43eee1ee09ecd5c6a9e297ac1",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:9d5eea62fdb587eeb96a8c4d782459f4e6b73baeece4d04b4a40e44faaee9301",
+    "zh:a6355f596a3fb8fc85c2fb054ab14e722991533f87f928e7169a486462c74670",
+    "zh:b5a65a789cff4ada58a5baffc76cb9767dc26ec6b45c00d2ec8b1b027f6db4ed",
+    "zh:db5ab669cf11d0e9f81dc380a6fdfcac437aea3d69109c7aef1a5426639d2d65",
+    "zh:de655d251c470197bcbb5ac45d289595295acb8f829f6c781d4a75c8c8b7c7dd",
+    "zh:f5c68199f2e6076bce92a12230434782bf768103a427e9bb9abee99b116af7b5",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/template" {
+  version = "2.2.0"
+  hashes = [
+    "h1:94qn780bi1qjrbC3uQtjJh3Wkfwd5+tTtJHOb7KTg9w=",
+    "zh:01702196f0a0492ec07917db7aaa595843d8f171dc195f4c988d2ffca2a06386",
+    "zh:09aae3da826ba3d7df69efeb25d146a1de0d03e951d35019a0f80e4f58c89b53",
+    "zh:09ba83c0625b6fe0a954da6fbd0c355ac0b7f07f86c91a2a97849140fea49603",
+    "zh:0e3a6c8e16f17f19010accd0844187d524580d9fdb0731f675ffcf4afba03d16",
+    "zh:45f2c594b6f2f34ea663704cc72048b212fe7d16fb4cfd959365fa997228a776",
+    "zh:77ea3e5a0446784d77114b5e851c970a3dde1e08fa6de38210b8385d7605d451",
+    "zh:8a154388f3708e3df5a69122a23bdfaf760a523788a5081976b3d5616f7d30ae",
+    "zh:992843002f2db5a11e626b3fc23dc0c87ad3729b3b3cff08e32ffb3df97edbde",
+    "zh:ad906f4cebd3ec5e43d5cd6dc8f4c5c9cc3b33d2243c89c5fc18f97f7277b51d",
+    "zh:c979425ddb256511137ecd093e23283234da0154b7fa8b21c2687182d9aea8b2",
+  ]
+}
+
+provider "registry.terraform.io/telmate/proxmox" {
+  version     = "3.0.1-rc8"
+  constraints = "3.0.1-rc8"
+  hashes = [
+    "h1:W5X4T5AZUaqO++aAequNECUKJaXLC5upcws6Vp7mkBk=",
+    "zh:0272f1600251abf9b139c2683f83cde0a907ac762f5ead058b84de18ddc1d78e",
+    "zh:328e708a8063a133516612b17c8983a9372fa42766530925d1d37aeb1daa30ec",
+    "zh:3449150e4d57f79af6f9583e93e3a5ab84fb475bc594de75b968534f57af2871",
+    "zh:58d803a0203241214f673c80350d43ce1a5ce57b21b83ba08d0d08e8c389dcc4",
+    "zh:59e3e99afc1ea404e530100725403c1610d682cfd27eeeaf35190c119b76a4db",
+    "zh:666cb7d299824152714202e8fda000c2e37346f2ae6d0a0e3c6f6bd68ef5d9ca",
+    "zh:6a1290b85e7bf953664b21b2a1ea554923a060f2a8347d8d5bb3d2b5157f85d2",
+    "zh:72230960c49fe7050a5e80ee10fa24cdac94dbab82744bccb6aa251741eb5aa9",
+    "zh:91f655c41f5af9a9fdcf6104c3d0a553eaa0fb3390af81051e744f30accd5b52",
+    "zh:aa08a22bf737d5840573bb6030617ab6bba2a292f4b9c88b20477cdcfb9676a9",
+    "zh:b72012cc284cad488207532b6668c58999c972d837b5f486db1d7466d686d5fd",
+    "zh:e24f934249a6ab4d3705c1398226d4d9df1e81ef8a36592389be02bc35cc661f",
+    "zh:e9e6bcef8b6a6b5ff2317168c2c23e4c55ae23f883ba158d2c4fd6324a0413e5",
+    "zh:ffa1e742a8c50babd8dbfcd6884740f9bea8453ec4d832717ff006a4fbfffa91",
   ]
 }

terraform/cloud-init.tf Normal file
View File

@@ -0,0 +1,70 @@
### Alpaca cloud-init template
data "template_file" "cloud_init_alpaca" {
count = var.alpaca_vm_count
template = file("${path.module}/files/cloud_init.yaml")
vars = {
ssh_key = var.ssh_key
hostname = "alpaca-${count.index + 1}"
domain = "home.arpa"
TS_AUTHKEY = var.TS_AUTHKEY
}
}
resource "local_file" "cloud_init_alpaca" {
count = var.alpaca_vm_count
content = data.template_file.cloud_init_alpaca[count.index].rendered
filename = "${path.module}/files/cloud_init_alpaca_${count.index + 1}.yaml"
}
resource "null_resource" "upload_cloud_init_alpaca" {
count = var.alpaca_vm_count
connection {
type = "ssh"
user = "root"
host = var.target_node
}
provisioner "file" {
source = local_file.cloud_init_alpaca[count.index].filename
destination = "/var/lib/vz/snippets/cloud_init_alpaca_${count.index + 1}.yaml"
}
}
### Llama cloud-init template
data "template_file" "cloud_init_llama" {
count = var.llama_vm_count
template = file("${path.module}/files/cloud_init.yaml")
vars = {
ssh_key = var.ssh_key
hostname = "llama-${count.index + 1}"
domain = "home.arpa"
TS_AUTHKEY = var.TS_AUTHKEY
}
}
resource "local_file" "cloud_init_llama" {
count = var.llama_vm_count
content = data.template_file.cloud_init_llama[count.index].rendered
filename = "${path.module}/files/cloud_init_llama_${count.index + 1}.yaml"
}
resource "null_resource" "upload_cloud_init_llama" {
count = var.llama_vm_count
connection {
type = "ssh"
user = "root"
host = var.target_node
}
provisioner "file" {
source = local_file.cloud_init_llama[count.index].filename
destination = "/var/lib/vz/snippets/cloud_init_llama_${count.index + 1}.yaml"
}
}

View File

@@ -1,9 +1,10 @@
 #cloud-config
 hostname: ${hostname}
 fqdn: ${hostname}.${domain}
+ssh_authorized_keys:
+  - ${ssh_key}
 runcmd:
   - curl -fsSL https://tailscale.com/install.sh | sh
   - tailscale up --auth-key=${TS_AUTHKEY}
+  - tailscale set --ssh

View File

@@ -1,6 +0,0 @@
#cloud-config
runcmd:
- curl -fsSL https://tailscale.com/install.sh | sh
- tailscale up --auth-key=${TS_AUTHKEY}
- tailscale set --ssh

View File

@@ -1,15 +0,0 @@
#cloud-config
hostname: ${hostname}
manage_etc_hosts: true
resolv_conf:
nameservers:
- 8.8.8.8
- 1.1.1.1
preserve_hostname: false
fqdn: ${hostname}.${domain}
users:
- name: micqdf
ssh_authorized_keys:
- ${SSH_KEY_PUBLIC}

View File

@@ -1,71 +1,42 @@
 terraform {
-  backend "s3" {}
   required_providers {
     proxmox = {
       source  = "Telmate/proxmox"
-      version = "3.0.2-rc07"
+      version = "3.0.1-rc8"
     }
   }
 }

-locals {
-  control_plane_ipconfig = [
-    for ip in var.control_plane_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}"
-  ]
-  worker_ipconfig = [
-    for ip in var.worker_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}"
-  ]
-}
-
 provider "proxmox" {
-  pm_api_url          = var.pm_api_url
-  pm_api_token_id     = var.pm_api_token_id
-  pm_api_token_secret = var.pm_api_token_secret
-  pm_tls_insecure     = true
+  pm_api_url      = var.pm_api_url
+  pm_user         = var.pm_user
+  pm_password     = var.proxmox_password
+  pm_tls_insecure = true
 }

-resource "proxmox_vm_qemu" "control_planes" {
-  count       = var.control_plane_count
-  name        = "cp-${count.index + 1}"
-  vmid        = var.control_plane_vmid_start + count.index
+resource "proxmox_vm_qemu" "alpacas" {
+  count       = var.alpaca_vm_count
+  name        = "alpaca-${count.index + 1}"
+  vmid        = 500 + count.index + 1
   target_node = var.target_node
   clone       = var.clone_template
-  full_clone  = true
-  os_type     = "cloud-init"
-  agent       = var.qemu_agent_enabled ? 1 : 0
-  automatic_reboot = true
-  cpu {
-    sockets = 1
-    cores   = var.control_plane_cores
-  }
-  memory    = var.control_plane_memory_mb
-  scsihw    = "virtio-scsi-pci"
-  boot      = "order=scsi0"
-  bootdisk  = "scsi0"
-  ipconfig0 = local.control_plane_ipconfig[count.index]
-  ciuser    = "micqdf"
-  sshkeys   = var.SSH_KEY_PUBLIC
-  disks {
-    scsi {
-      scsi0 {
-        disk {
-          size    = var.control_plane_disk_size
-          storage = var.storage
-        }
-      }
-    }
-    ide {
-      ide2 {
-        cloudinit {
-          storage = var.storage
-        }
-      }
-    }
-  }
+  full_clone  = false
+  agent       = 1
+  sockets     = var.sockets
+  cores       = var.cores
+  memory      = var.memory
+  scsihw      = "virtio-scsi-pci"
+  boot        = "order=scsi0"
+  ipconfig0   = "ip=dhcp"
+  cicustom    = "user=local:snippets/cloud_init_alpaca_${count.index + 1}.yaml"
+  depends_on  = [null_resource.upload_cloud_init_alpaca]
+  disk {
+    slot    = "scsi0"
+    type    = "disk"
+    storage = var.storage
+    size    = var.disk_size
+  }
@@ -73,63 +44,38 @@ resource "proxmox_vm_qemu" "control_planes" {
model = "virtio" model = "virtio"
bridge = var.bridge bridge = var.bridge
} }
lifecycle {
ignore_changes = all
}
} }
resource "proxmox_vm_qemu" "workers" { resource "proxmox_vm_qemu" "llamas" {
count = var.worker_count count = var.llama_vm_count
name = "wk-${count.index + 1}" name = "llama-${count.index + 1}"
vmid = var.worker_vmid_start + count.index vmid = 600 + count.index + 1
target_node = var.target_node target_node = var.target_node
clone = var.clone_template clone = var.clone_template
full_clone = true full_clone = false
os_type = "cloud-init" agent = 1
agent = var.qemu_agent_enabled ? 1 : 0
automatic_reboot = true
cpu { sockets = var.sockets
sockets = 1 cores = var.cores
cores = var.worker_cores[count.index] memory = var.memory
scsihw = "virtio-scsi-pci"
boot = "order=scsi0"
ipconfig0 = "ip=dhcp"
cicustom = "user=local:snippets/cloud_init_llama_${count.index + 1}.yaml"
depends_on = [null_resource.upload_cloud_init_llama]
disk {
slot = "scsi0"
type = "disk"
storage = var.storage
size = var.disk_size
} }
memory = var.worker_memory_mb[count.index]
scsihw = "virtio-scsi-pci"
boot = "order=scsi0"
bootdisk = "scsi0"
ipconfig0 = local.worker_ipconfig[count.index]
ciuser = "micqdf"
sshkeys = var.SSH_KEY_PUBLIC
disks {
scsi {
scsi0 {
disk {
size = var.worker_disk_size
storage = var.storage
}
}
}
ide {
ide2 {
cloudinit {
storage = var.storage
}
}
}
}
network { network {
id = 0 id = 0
model = "virtio" model = "virtio"
bridge = var.bridge bridge = var.bridge
} }
lifecycle {
ignore_changes = all
}
} }

View File

@@ -1,35 +1,22 @@
output "control_plane_vm_ids" { output "alpaca_vm_ids" {
value = { value = {
for i in range(var.control_plane_count) : for i in range(var.alpaca_count) :
"cp-${i + 1}" => proxmox_vm_qemu.control_planes[i].vmid "alpaca-${i + 1}" => proxmox_vm_qemu.alpacas[i].vmid
} }
} }
output "control_plane_vm_names" { output "alpaca_vm_names" {
value = [for vm in proxmox_vm_qemu.control_planes : vm.name] value = [for vm in proxmox_vm_qemu.alpacas : vm.name]
} }
output "control_plane_vm_ipv4" { output "llama_vm_ids" {
value = { value = {
for i in range(var.control_plane_count) : for i in range(var.llama_count) :
proxmox_vm_qemu.control_planes[i].name => var.control_plane_ips[i] "llama-${i + 1}" => proxmox_vm_qemu.llamas[i].vmid
} }
} }
output "worker_vm_ids" { output "llama_vm_names" {
value = { value = [for vm in proxmox_vm_qemu.llamas : vm.name]
for i in range(var.worker_count) :
"wk-${i + 1}" => proxmox_vm_qemu.workers[i].vmid
}
} }
output "worker_vm_names" {
value = [for vm in proxmox_vm_qemu.workers : vm.name]
}
output "worker_vm_ipv4" {
value = {
for i in range(var.worker_count) :
proxmox_vm_qemu.workers[i].name => var.worker_ips[i]
}
}

View File

@@ -1,25 +1,13 @@
target_node = "flex" target_node = "flex"
clone_template = "k8s-base-template" clone_template = "Alpine-TemplateV2"
bridge = "vmbr0" vm_name = "alpine-vm"
storage = "Flash" cores = 2
pm_api_url = "https://100.105.0.115:8006/api2/json" memory = 2048
pm_api_token_id = "terraform-prov@pve!mytoken" disk_size = "15G"
sockets = 1
bridge = "vmbr0"
disk_type = "scsi"
storage = "Flash"
pm_api_url = "https://100.105.0.115:8006/api2/json"
pm_user = "terraform-prov@pve"
control_plane_count = 3
worker_count = 3
control_plane_vmid_start = 701
worker_vmid_start = 711
control_plane_cores = 1
control_plane_memory_mb = 4096
control_plane_disk_size = "80G"
worker_cores = [4, 4, 4]
worker_memory_mb = [12288, 12288, 12288]
worker_disk_size = "120G"
network_prefix_length = 10
network_gateway = "10.27.27.1"
control_plane_ips = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
worker_ips = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]

View File

@@ -1,22 +1,5 @@
variable "pm_api_token_id" { variable "proxmox_password" {
type = string type = string
description = "Proxmox API token ID (format: user@realm!tokenid)"
validation {
condition = can(regex(".+!.+", trimspace(var.pm_api_token_id)))
error_message = "pm_api_token_id must be in format user@realm!tokenid."
}
}
variable "pm_api_token_secret" {
type = string
sensitive = true
description = "Proxmox API token secret"
validation {
condition = length(trimspace(var.pm_api_token_secret)) > 0
error_message = "pm_api_token_secret cannot be empty. Check your Gitea secret PM_API_TOKEN_SECRET."
}
} }
variable "target_node" { variable "target_node" {
@@ -27,104 +10,34 @@ variable "clone_template" {
   type = string
 }

-variable "control_plane_count" {
-  type        = number
-  default     = 3
-  description = "Number of control plane VMs"
-}
-
-variable "worker_count" {
-  type        = number
-  default     = 3
-  description = "Number of worker VMs"
-}
-
-variable "control_plane_vmid_start" {
-  type        = number
-  default     = 701
-  description = "Starting VMID for control plane VMs"
-}
-
-variable "worker_vmid_start" {
-  type        = number
-  default     = 711
-  description = "Starting VMID for worker VMs"
-}
-
-variable "control_plane_cores" {
-  type        = number
-  default     = 1
-  description = "vCPU cores per control plane VM"
-}
-
-variable "control_plane_memory_mb" {
-  type        = number
-  default     = 4096
-  description = "Memory in MB per control plane VM"
-}
-
-variable "worker_cores" {
-  type        = list(number)
-  default     = [4, 4, 4]
-  description = "vCPU cores for each worker VM"
-}
-
-variable "worker_memory_mb" {
-  type        = list(number)
-  default     = [12288, 12288, 12288]
-  description = "Memory in MB for each worker VM"
-}
-
-variable "control_plane_disk_size" {
-  type        = string
-  default     = "80G"
-  description = "Disk size for control plane VMs"
-}
-
-variable "worker_disk_size" {
-  type        = string
-  default     = "120G"
-  description = "Disk size for worker VMs"
-}
-
-variable "network_prefix_length" {
-  type        = number
-  default     = 10
-  description = "CIDR prefix length for static VM addresses"
-}
-
-variable "network_gateway" {
-  type        = string
-  default     = "10.27.27.1"
-  description = "Gateway for static VM addresses"
-}
-
-variable "control_plane_ips" {
-  type        = list(string)
-  default     = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
-  description = "Static IPv4 addresses for control plane VMs"
-  validation {
-    condition     = length(var.control_plane_ips) == 3
-    error_message = "control_plane_ips must contain exactly 3 IPs."
-  }
-}
-
-variable "worker_ips" {
-  type        = list(string)
-  default     = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
-  description = "Static IPv4 addresses for worker VMs"
-  validation {
-    condition     = length(var.worker_ips) == 3
-    error_message = "worker_ips must contain exactly 3 IPs."
-  }
-}
+variable "vm_name" {
+  type = string
+}
+
+variable "cores" {
+  type = number
+}
+
+variable "memory" {
+  type = number
+}
+
+variable "disk_size" {
+  type = string
+}
+
+variable "sockets" {
+  type = number
+}

 variable "bridge" {
   type = string
 }

+variable "disk_type" {
+  type = string
+}
+
 variable "storage" {
   type = string
 }
@@ -133,13 +46,42 @@ variable "pm_api_url" {
   type = string
 }

-variable "qemu_agent_enabled" {
-  type        = bool
-  default     = false
-  description = "Enable QEMU guest agent integration in Proxmox resources"
+variable "pm_user" {
+  type = string
 }

-variable "SSH_KEY_PUBLIC" {
-  type        = string
-  description = "Public SSH key injected via cloud-init"
+variable "alpaca_count" {
+  type        = number
+  default     = 1
+  description = "How many Alpaca VMs to create"
 }
+
+variable "llama_count" {
+  type        = number
+  default     = 1
+  description = "How many Llama VMs to create"
+}
+
+variable "alpaca_vm_count" {
+  type        = number
+  default     = 1
+  description = "How many Alpaca VMs to create"
+}
+
+variable "llama_vm_count" {
+  type        = number
+  default     = 1
+  description = "How many Llama VMs to create"
+}
+
+variable "TS_AUTHKEY" {
+  type        = string
+  description = "Tailscale auth key used in cloud-init"
+}
+
+variable "ssh_key" {
+  type        = string
+  description = "Public SSH key used by cloud-init"
+}