Compare commits
262 Commits
17834b3aa7
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 5bfc135350 | |||
| 63213a4bc3 | |||
| e4243c7667 | |||
| 33bb0ffb17 | |||
| 7434a65590 | |||
| cd8e538c51 | |||
| 808c290c71 | |||
| 15e6471e7e | |||
| 79a4c941e5 | |||
| e9bac70cae | |||
| 4c167f618a | |||
| 97295a7071 | |||
| 7bc861b3e8 | |||
| 6ca189b32c | |||
| b7b364a112 | |||
| 2aa9950f59 | |||
| bd866f7dac | |||
| c1f86483ad | |||
| 0cce4bcf72 | |||
| 065567210e | |||
| c5f0b1ac37 | |||
| e740d47011 | |||
| d9d3976c4c | |||
| a0b07816b9 | |||
| d964ff8b50 | |||
| e06b2c692e | |||
| c48bbddef3 | |||
| ca54c44fa4 | |||
| 8bda08be07 | |||
| 0778de9719 | |||
| 92f0658995 | |||
| fc4eb1bc6e | |||
| 4b017364c8 | |||
| a70de061b0 | |||
| 9d98f56725 | |||
| 5ddd00f711 | |||
| 5af4021228 | |||
| 034869347a | |||
| 50d0d99332 | |||
| f0093deedc | |||
| 6b6ca021c9 | |||
| c034f7975c | |||
| 90ef0ec33f | |||
| ba6cf42c04 | |||
| 3cd0c70727 | |||
| 3281ebd216 | |||
| d2dd6105a6 | |||
| 981afc509a | |||
| b3c975bd73 | |||
| 8aab666fad | |||
| 308a2fd4b7 | |||
| 3fd7ed48b1 | |||
| 0cc0de2aea | |||
| 99458ca829 | |||
| 422b7d7f23 | |||
| adc8a620f4 | |||
| 3ebeb121b4 | |||
| f11aadf79c | |||
| b4265a649e | |||
| 09d2f56967 | |||
| 9ae8eb6134 | |||
| f2b9da8a59 | |||
| a66ae788f6 | |||
| 5fa96e27d7 | |||
| cbb8358ce6 | |||
| 31017b5c3e | |||
| a16112a87a | |||
| f53d087c9c | |||
| 51b56e562e | |||
| 0e0643a6fc | |||
| 6fecfb3ee6 | |||
| 7a0016b003 | |||
| 355273add5 | |||
| e5162c220c | |||
| 262e9eb4d7 | |||
| 84513f4bb8 | |||
| c445638d4a | |||
| 678b383063 | |||
| 880bbcceca | |||
| 190dc2e095 | |||
| d86b0a32a2 | |||
| a81799a2b5 | |||
| 6c7182b8f5 | |||
| 46c0786e57 | |||
| 8b15f061bc | |||
| 1af45ca51e | |||
| c91d28a5dc | |||
| 533f5a91e0 | |||
| cfdfab3ec0 | |||
| c061dda31d | |||
| cec60c003c | |||
| fb21fbef4f | |||
| 6cc57f8b0e | |||
| 1b76e07326 | |||
| 9d17dd17cc | |||
| db72dcab75 | |||
| 23d61a6308 | |||
| d42e83358c | |||
| 198c147b79 | |||
| 93e43a546f | |||
| 3b03e68f3e | |||
| ab5cc8b01d | |||
| 92759407a6 | |||
| f65a414959 | |||
| 03c6d0454a | |||
| 7c849ed019 | |||
| b8bd9686d3 | |||
| 388b0c4f5d | |||
| cfd72fa750 | |||
| d810547675 | |||
| 3ed3381140 | |||
| 9426968cd4 | |||
| 4569fcd2ea | |||
| 02a6bca60b | |||
| f7f3c7df3e | |||
| a098c0aa29 | |||
| 766cd5db4f | |||
| 9b03cec23e | |||
| 5fe36d0963 | |||
| c794e07ab2 | |||
| 8103b02883 | |||
| fd7be1a428 | |||
| 6262f61506 | |||
| c0b820c92a | |||
| f9e7356f94 | |||
| 27185ed17a | |||
| 9baf35d886 | |||
| a5f0f0a420 | |||
| 310d273378 | |||
| 661fbc2ff4 | |||
| 3b0219f211 | |||
| 3fa227d7c9 | |||
| 61db9a26d9 | |||
| 8f915201e3 | |||
| a933341c28 | |||
| f90e971fab | |||
| 920c0c10b8 | |||
| 718a9930e8 | |||
| a9f6153623 | |||
| 9edb8f807d | |||
| 7ec1ce92cf | |||
| 198f0e2910 | |||
| 88db11292d | |||
| 8bd064c828 | |||
| 364d407fb7 | |||
| c8771b897c | |||
| 68c896d629 | |||
| 39f1e44f9b | |||
| 760d0e8b5b | |||
| e48726934f | |||
| 92a0908ff5 | |||
| 3bdf3f8d84 | |||
| 42b931668f | |||
| dad409a5b7 | |||
| 4d6ac7d9dd | |||
| 0a51dfc0e1 | |||
| 92084c3e1a | |||
| 6a77c96ad9 | |||
| 45e818b113 | |||
| 47ec65a7fd | |||
| 97795fe376 | |||
| 24c3f56399 | |||
| f5d9eba9d0 | |||
| 3e720f1d58 | |||
| 23a85cc099 | |||
| 824e3c09d1 | |||
| 327c07314c | |||
| 21425c363d | |||
| f6805f8a39 | |||
| 3b5d04dda2 | |||
| f5675d2a84 | |||
| cf98bdf229 | |||
| ba912810d1 | |||
| 727c21e43b | |||
| 70ff5ccef9 | |||
| 5c037d9a99 | |||
| 244887e9c2 | |||
| 129c639e4d | |||
| 6105a314b7 | |||
| 89bc2242cb | |||
| fce8f9c70c | |||
| c1c1b3d7f7 | |||
| cc40dff49a | |||
| 812fcb8066 | |||
| d190f64181 | |||
| 2126cf5004 | |||
| 2a5ecebd99 | |||
| 17ac3fad4c | |||
| 3ee5cfa823 | |||
| 2078afa8a3 | |||
| 2d9d6cdcd5 | |||
| 8b363497b7 | |||
| 03fff813ac | |||
| a8195f97dc | |||
| c94c1f61d8 | |||
| 7cdb0bb00b | |||
| 046de9b3d4 | |||
| b75e6b0124 | |||
| b6ce31ad6c | |||
| 6f2fa0ef06 | |||
| 71890c00c0 | |||
| f8379e6d08 | |||
| 8d809355eb | |||
| 0f171a668b | |||
| 7759c47fea | |||
| 8b83bb9d3a | |||
| 9e922dd62c | |||
| 3539ae9b50 | |||
| 5669305e59 | |||
| f341816112 | |||
| c04ef106a3 | |||
| c154ff4d15 | |||
| 8bcc162956 | |||
| b0779c51c0 | |||
| 9fe845b53d | |||
| 885a92f494 | |||
| 91dd20e60e | |||
| abac6300ca | |||
| 7206d8cd41 | |||
| a42d44bb27 | |||
| a99516a2a3 | |||
| 5c69abf9ff | |||
| 5fc8bcc406 | |||
| 16d5a87586 | |||
| 9a02c05983 | |||
| 1304afd793 | |||
| d1dcbe0feb | |||
| df4740071a | |||
| 54c0b684c8 | |||
| 2577669e12 | |||
| dd3a37dfd1 | |||
| 35f0a0dccb | |||
| 583d5c3591 | |||
| 77626ed93c | |||
| a5d5ddb618 | |||
| a5f8d72bff | |||
| 335254b7b2 | |||
| 21be01346b | |||
| ba1884bbc5 | |||
| c516c8ba35 | |||
| 8b8bab77b0 | |||
| 93bba9fbfc | |||
| 6ef807e59c | |||
| 8887a8bb87 | |||
| 32b1fcec58 | |||
| c87bb16f10 | |||
| a891109ee9 | |||
| 0ea9888854 | |||
| 3261b18f37 | |||
| 2d455929bd | |||
| 9740e9c6fb | |||
| f12e15e566 | |||
| b3521d6c02 | |||
| 017d5ce00d | |||
| a2d61d6972 | |||
| 5acb8370cc | |||
| 1a309cbe4f | |||
| 5e1fd2e9f3 | |||
| 9ce06671c9 | |||
| 5fc58dfc98 | |||
| 1c4a27bca3 | |||
| 735e9df9f1 |
181
.gitea/workflows/kubeadm-bootstrap.yml
Normal file
181
.gitea/workflows/kubeadm-bootstrap.yml
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
name: Kubeadm Bootstrap
|
||||||
|
run-name: ${{ gitea.actor }} requested kubeadm bootstrap
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
confirm:
|
||||||
|
description: "Type BOOTSTRAP to run rebuild + kubeadm bootstrap"
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: kubeadm-bootstrap
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
bootstrap:
|
||||||
|
name: "Rebuild and Bootstrap Cluster"
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Validate confirmation phrase
|
||||||
|
run: |
|
||||||
|
if [ "${{ inputs.confirm }}" != "BOOTSTRAP" ]; then
|
||||||
|
echo "Confirmation failed. You must type BOOTSTRAP."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Create SSH key
|
||||||
|
run: |
|
||||||
|
install -m 0700 -d ~/.ssh
|
||||||
|
KEY_SOURCE=""
|
||||||
|
KEY_CONTENT=""
|
||||||
|
KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
|
||||||
|
if [ -n "$KEY_B64" ]; then
|
||||||
|
KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
|
||||||
|
KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
|
||||||
|
else
|
||||||
|
KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
|
||||||
|
if [ -n "$KEY_CONTENT" ]; then
|
||||||
|
KEY_SOURCE="SSH_KEY_PRIVATE"
|
||||||
|
else
|
||||||
|
KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
|
||||||
|
KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$KEY_CONTENT" ]; then
|
||||||
|
echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
|
||||||
|
if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
|
||||||
|
printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
|
||||||
|
else
|
||||||
|
printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
|
||||||
|
fi
|
||||||
|
chmod 0600 ~/.ssh/id_ed25519
|
||||||
|
|
||||||
|
if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
|
||||||
|
echo "Invalid private key content from $KEY_SOURCE"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Set up Terraform
|
||||||
|
uses: hashicorp/setup-terraform@v2
|
||||||
|
with:
|
||||||
|
terraform_version: 1.6.6
|
||||||
|
terraform_wrapper: false
|
||||||
|
|
||||||
|
- name: Build Terraform backend files
|
||||||
|
working-directory: terraform
|
||||||
|
run: |
|
||||||
|
cat > secrets.auto.tfvars << EOF
|
||||||
|
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
||||||
|
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
||||||
|
EOF
|
||||||
|
|
||||||
|
cat > backend.hcl << EOF
|
||||||
|
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
||||||
|
key = "terraform.tfstate"
|
||||||
|
region = "us-east-005"
|
||||||
|
endpoints = {
|
||||||
|
s3 = "${{ secrets.B2_TF_ENDPOINT }}"
|
||||||
|
}
|
||||||
|
access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
|
||||||
|
secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
|
||||||
|
skip_credentials_validation = true
|
||||||
|
skip_metadata_api_check = true
|
||||||
|
skip_region_validation = true
|
||||||
|
skip_requesting_account_id = true
|
||||||
|
use_path_style = true
|
||||||
|
EOF
|
||||||
|
|
||||||
|
- name: Terraform init for state read
|
||||||
|
working-directory: terraform
|
||||||
|
run: terraform init -reconfigure -backend-config=backend.hcl
|
||||||
|
|
||||||
|
- name: Create kubeadm inventory
|
||||||
|
env:
|
||||||
|
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
|
||||||
|
|
||||||
|
- name: Validate nix installation
|
||||||
|
run: |
|
||||||
|
if [ -x /nix/var/nix/profiles/default/bin/nix ]; then
|
||||||
|
/nix/var/nix/profiles/default/bin/nix --version
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if command -v nix >/dev/null 2>&1; then
|
||||||
|
nix --version
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Nix missing; installing no-daemon Nix for this runner job"
|
||||||
|
if [ "$(id -u)" -eq 0 ]; then
|
||||||
|
mkdir -p /nix
|
||||||
|
chown root:root /nix
|
||||||
|
chmod 0755 /nix
|
||||||
|
|
||||||
|
if ! getent group nixbld >/dev/null 2>&1; then
|
||||||
|
groupadd --system nixbld
|
||||||
|
fi
|
||||||
|
|
||||||
|
for i in $(seq 1 10); do
|
||||||
|
if ! id "nixbld$i" >/dev/null 2>&1; then
|
||||||
|
useradd --system --create-home --home-dir /var/empty --shell /usr/sbin/nologin "nixbld$i"
|
||||||
|
fi
|
||||||
|
usermod -a -G nixbld "nixbld$i"
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
sh <(curl -L https://nixos.org/nix/install) --no-daemon
|
||||||
|
|
||||||
|
if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
|
. "$HOME/.nix-profile/etc/profile.d/nix.sh"
|
||||||
|
elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
|
. /root/.nix-profile/etc/profile.d/nix.sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"
|
||||||
|
|
||||||
|
nix --version
|
||||||
|
|
||||||
|
- name: Install nixos-rebuild tool
|
||||||
|
env:
|
||||||
|
NIX_CONFIG: experimental-features = nix-command flakes
|
||||||
|
run: |
|
||||||
|
if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
|
. "$HOME/.nix-profile/etc/profile.d/nix.sh"
|
||||||
|
elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
|
. /root/.nix-profile/etc/profile.d/nix.sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"
|
||||||
|
|
||||||
|
nix profile install nixpkgs#nixos-rebuild
|
||||||
|
|
||||||
|
- name: Run cluster rebuild and bootstrap
|
||||||
|
env:
|
||||||
|
NIX_CONFIG: experimental-features = nix-command flakes
|
||||||
|
FAST_MODE: "1"
|
||||||
|
WORKER_PARALLELISM: "3"
|
||||||
|
REBUILD_TIMEOUT: "45m"
|
||||||
|
REBUILD_RETRIES: "2"
|
||||||
|
run: |
|
||||||
|
if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
|
. "$HOME/.nix-profile/etc/profile.d/nix.sh"
|
||||||
|
elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
|
. /root/.nix-profile/etc/profile.d/nix.sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
|
||||||
|
|
||||||
|
./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
|
||||||
112
.gitea/workflows/kubeadm-reset.yml
Normal file
112
.gitea/workflows/kubeadm-reset.yml
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
name: Kubeadm Reset
|
||||||
|
run-name: ${{ gitea.actor }} requested kubeadm reset
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
confirm:
|
||||||
|
description: "Type RESET to run kubeadm reset on all nodes"
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: kubeadm-bootstrap
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
reset:
|
||||||
|
name: "Reset Cluster Nodes"
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Validate confirmation phrase
|
||||||
|
run: |
|
||||||
|
if [ "${{ inputs.confirm }}" != "RESET" ]; then
|
||||||
|
echo "Confirmation failed. You must type RESET."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Create SSH key
|
||||||
|
run: |
|
||||||
|
install -m 0700 -d ~/.ssh
|
||||||
|
KEY_SOURCE=""
|
||||||
|
KEY_CONTENT=""
|
||||||
|
KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
|
||||||
|
if [ -n "$KEY_B64" ]; then
|
||||||
|
KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
|
||||||
|
KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
|
||||||
|
else
|
||||||
|
KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
|
||||||
|
if [ -n "$KEY_CONTENT" ]; then
|
||||||
|
KEY_SOURCE="SSH_KEY_PRIVATE"
|
||||||
|
else
|
||||||
|
KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
|
||||||
|
KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$KEY_CONTENT" ]; then
|
||||||
|
echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
|
||||||
|
if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
|
||||||
|
printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
|
||||||
|
else
|
||||||
|
printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
|
||||||
|
fi
|
||||||
|
chmod 0600 ~/.ssh/id_ed25519
|
||||||
|
|
||||||
|
if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
|
||||||
|
echo "Invalid private key content from $KEY_SOURCE"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Set up Terraform
|
||||||
|
uses: hashicorp/setup-terraform@v2
|
||||||
|
with:
|
||||||
|
terraform_version: 1.6.6
|
||||||
|
terraform_wrapper: false
|
||||||
|
|
||||||
|
- name: Build Terraform backend files
|
||||||
|
working-directory: terraform
|
||||||
|
run: |
|
||||||
|
cat > secrets.auto.tfvars << EOF
|
||||||
|
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
||||||
|
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
||||||
|
EOF
|
||||||
|
|
||||||
|
cat > backend.hcl << EOF
|
||||||
|
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
||||||
|
key = "terraform.tfstate"
|
||||||
|
region = "us-east-005"
|
||||||
|
endpoints = {
|
||||||
|
s3 = "${{ secrets.B2_TF_ENDPOINT }}"
|
||||||
|
}
|
||||||
|
access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
|
||||||
|
secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
|
||||||
|
skip_credentials_validation = true
|
||||||
|
skip_metadata_api_check = true
|
||||||
|
skip_region_validation = true
|
||||||
|
skip_requesting_account_id = true
|
||||||
|
use_path_style = true
|
||||||
|
EOF
|
||||||
|
|
||||||
|
- name: Terraform init for state read
|
||||||
|
working-directory: terraform
|
||||||
|
run: terraform init -reconfigure -backend-config=backend.hcl
|
||||||
|
|
||||||
|
- name: Create kubeadm inventory
|
||||||
|
env:
|
||||||
|
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
|
||||||
|
|
||||||
|
- name: Run cluster reset
|
||||||
|
run: |
|
||||||
|
./nixos/kubeadm/scripts/reset-cluster-nodes.sh
|
||||||
@@ -23,7 +23,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
cat > secrets.auto.tfvars << EOF
|
cat > secrets.auto.tfvars << EOF
|
||||||
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
||||||
SSH_KEY_PUBLIC = "${{ secrets.SSH_KEY_PUBLIC }}"
|
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
||||||
EOF
|
EOF
|
||||||
cat > backend.hcl << EOF
|
cat > backend.hcl << EOF
|
||||||
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
||||||
@@ -45,6 +45,7 @@ jobs:
|
|||||||
uses: hashicorp/setup-terraform@v2
|
uses: hashicorp/setup-terraform@v2
|
||||||
with:
|
with:
|
||||||
terraform_version: 1.6.6
|
terraform_version: 1.6.6
|
||||||
|
terraform_wrapper: false
|
||||||
|
|
||||||
- name: Terraform Init
|
- name: Terraform Init
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
@@ -52,7 +53,20 @@ jobs:
|
|||||||
|
|
||||||
- name: Terraform Plan
|
- name: Terraform Plan
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: terraform plan -out=tfplan
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
for attempt in 1 2; do
|
||||||
|
echo "Terraform plan attempt $attempt/2"
|
||||||
|
if timeout 20m terraform plan -refresh=false -parallelism=1 -out=tfplan; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [ "$attempt" -eq 1 ]; then
|
||||||
|
echo "Plan attempt failed or timed out; retrying in 20s"
|
||||||
|
sleep 20
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
echo "Terraform plan failed after retries"
|
||||||
|
exit 1
|
||||||
|
|
||||||
- name: Block accidental destroy
|
- name: Block accidental destroy
|
||||||
env:
|
env:
|
||||||
@@ -69,168 +83,127 @@ jobs:
|
|||||||
|
|
||||||
- name: Terraform Apply
|
- name: Terraform Apply
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: terraform apply -auto-approve tfplan
|
run: terraform apply -parallelism=1 -auto-approve tfplan
|
||||||
|
|
||||||
- name: Enroll VMs in Tailscale
|
- name: Create SSH key
|
||||||
env:
|
|
||||||
TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }}
|
|
||||||
PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
|
|
||||||
TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }}
|
|
||||||
working-directory: terraform
|
|
||||||
run: |
|
run: |
|
||||||
if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then
|
install -m 0700 -d ~/.ssh
|
||||||
echo "Skipping Tailscale enrollment (missing TS_AUTHKEY or PM_API_TOKEN_SECRET)."
|
KEY_SOURCE=""
|
||||||
exit 0
|
KEY_CONTENT=""
|
||||||
|
KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
|
||||||
|
if [ -n "$KEY_B64" ]; then
|
||||||
|
KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
|
||||||
|
KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
|
||||||
|
else
|
||||||
|
KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
|
||||||
|
if [ -n "$KEY_CONTENT" ]; then
|
||||||
|
KEY_SOURCE="SSH_KEY_PRIVATE"
|
||||||
|
else
|
||||||
|
KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
|
||||||
|
KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
PM_API_URL=$(awk -F'"' '/^pm_api_url/{print $2}' terraform.tfvars)
|
if [ -z "$KEY_CONTENT" ]; then
|
||||||
PM_API_TOKEN_ID=$(awk -F'"' '/^pm_api_token_id/{print $2}' terraform.tfvars)
|
echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
|
||||||
TARGET_NODE=$(awk -F'"' '/^target_node/{print $2}' terraform.tfvars)
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
export PM_API_URL PM_API_TOKEN_ID TARGET_NODE
|
KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
|
||||||
|
if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
|
||||||
|
printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
|
||||||
|
else
|
||||||
|
printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
|
||||||
|
fi
|
||||||
|
chmod 0600 ~/.ssh/id_ed25519
|
||||||
|
|
||||||
terraform output -json > tfoutputs.json
|
if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
|
||||||
cat > enroll_tailscale.py <<'PY'
|
echo "Invalid private key content from $KEY_SOURCE"
|
||||||
import json
|
exit 1
|
||||||
import os
|
fi
|
||||||
import ssl
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
|
|
||||||
api_url = os.environ["PM_API_URL"].rstrip("/")
|
- name: Verify SSH keypair match
|
||||||
if api_url.endswith("/api2/json"):
|
run: |
|
||||||
api_url = api_url[: -len("/api2/json")]
|
if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/tmp/key.pub 2>/tmp/key.err; then
|
||||||
token_id = os.environ["PM_API_TOKEN_ID"].strip()
|
echo "Invalid private key content in SSH_KEY_PRIVATE/KUBEADM_SSH_PRIVATE_KEY"
|
||||||
token_secret = os.environ["PM_API_TOKEN_SECRET"].strip()
|
cat /tmp/key.err
|
||||||
target_node = os.environ["TARGET_NODE"].strip()
|
exit 1
|
||||||
ts_authkey = os.environ["TS_AUTHKEY"]
|
fi
|
||||||
enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true"
|
|
||||||
|
|
||||||
if not token_id or not token_secret:
|
printf '%s\n' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r' > /tmp/secret.pub
|
||||||
raise SystemExit("Missing Proxmox token id/secret")
|
if ! ssh-keygen -lf /tmp/secret.pub >/tmp/secret.fp 2>/tmp/secret.err; then
|
||||||
|
echo "Invalid SSH_KEY_PUBLIC format"
|
||||||
|
cat /tmp/secret.err
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
raw_outputs = open("tfoutputs.json", "rb").read().decode("utf-8", "ignore")
|
PRIV_FP="$(ssh-keygen -lf /tmp/key.pub | awk '{print $2}')"
|
||||||
start = raw_outputs.find("{")
|
PUB_FP="$(awk '{print $2}' /tmp/secret.fp)"
|
||||||
if start == -1:
|
|
||||||
raise SystemExit("Could not find JSON payload in terraform output")
|
|
||||||
outputs = json.JSONDecoder().raw_decode(raw_outputs[start:])[0]
|
|
||||||
|
|
||||||
targets = []
|
echo "private fingerprint: $PRIV_FP"
|
||||||
for output_name in ("alpaca_vm_ids", "llama_vm_ids"):
|
echo "public fingerprint: $PUB_FP"
|
||||||
mapping = outputs.get(output_name, {}).get("value", {})
|
|
||||||
if isinstance(mapping, dict):
|
|
||||||
for hostname, vmid in mapping.items():
|
|
||||||
targets.append((str(hostname), int(vmid)))
|
|
||||||
|
|
||||||
if not targets:
|
if [ "$PRIV_FP" != "$PUB_FP" ]; then
|
||||||
print("No VMs found in terraform outputs; skipping tailscale enrollment")
|
echo "SSH_KEY_PRIVATE does not match SSH_KEY_PUBLIC. Update secrets with the same keypair."
|
||||||
raise SystemExit(0)
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
print("Tailscale enrollment targets:", ", ".join(f"{h}:{v}" for h, v in targets))
|
- name: Create kubeadm inventory from Terraform outputs
|
||||||
|
env:
|
||||||
|
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
|
||||||
|
|
||||||
ssl_ctx = ssl._create_unverified_context()
|
- name: Ensure nix and nixos-rebuild
|
||||||
auth_header = f"PVEAPIToken={token_id}={token_secret}"
|
env:
|
||||||
|
NIX_CONFIG: experimental-features = nix-command flakes
|
||||||
|
run: |
|
||||||
|
if [ ! -x /nix/var/nix/profiles/default/bin/nix ] && ! command -v nix >/dev/null 2>&1; then
|
||||||
|
if [ "$(id -u)" -eq 0 ]; then
|
||||||
|
mkdir -p /nix
|
||||||
|
chown root:root /nix
|
||||||
|
chmod 0755 /nix
|
||||||
|
|
||||||
def api_request(method, path, data=None):
|
if ! getent group nixbld >/dev/null 2>&1; then
|
||||||
url = f"{api_url}{path}"
|
groupadd --system nixbld
|
||||||
headers = {"Authorization": auth_header}
|
fi
|
||||||
body = None
|
|
||||||
if data is not None:
|
|
||||||
body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")
|
|
||||||
headers["Content-Type"] = "application/x-www-form-urlencoded"
|
|
||||||
req = urllib.request.Request(url, data=body, headers=headers, method=method)
|
|
||||||
with urllib.request.urlopen(req, context=ssl_ctx, timeout=30) as resp:
|
|
||||||
payload = resp.read().decode("utf-8")
|
|
||||||
return json.loads(payload)
|
|
||||||
|
|
||||||
def wait_for_guest_agent(vmid, timeout_seconds=300):
|
for i in $(seq 1 10); do
|
||||||
deadline = time.time() + timeout_seconds
|
if ! id "nixbld$i" >/dev/null 2>&1; then
|
||||||
tries = 0
|
useradd --system --create-home --home-dir /var/empty --shell /usr/sbin/nologin "nixbld$i"
|
||||||
while time.time() < deadline:
|
fi
|
||||||
tries += 1
|
usermod -a -G nixbld "nixbld$i"
|
||||||
try:
|
done
|
||||||
res = api_request("GET", f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/ping")
|
fi
|
||||||
if res.get("data") == "pong":
|
sh <(curl -L https://nixos.org/nix/install) --no-daemon
|
||||||
print(f"Guest agent ready for vmid {vmid}", flush=True)
|
fi
|
||||||
return True
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
if tries % 6 == 0:
|
|
||||||
remaining = int(deadline - time.time())
|
|
||||||
print(f"Waiting for guest agent on vmid {vmid} ({remaining}s left)", flush=True)
|
|
||||||
time.sleep(5)
|
|
||||||
return False
|
|
||||||
|
|
||||||
def exec_guest(vmid, command):
|
if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
res = api_request(
|
. "$HOME/.nix-profile/etc/profile.d/nix.sh"
|
||||||
"POST",
|
elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/exec",
|
. /root/.nix-profile/etc/profile.d/nix.sh
|
||||||
{
|
fi
|
||||||
"command": "/run/current-system/sw/bin/sh",
|
|
||||||
"extra-args": ["-lc", command],
|
|
||||||
},
|
|
||||||
)
|
|
||||||
pid = res["data"]["pid"]
|
|
||||||
for _ in range(120):
|
|
||||||
status = api_request(
|
|
||||||
"GET",
|
|
||||||
f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/exec-status?pid={pid}",
|
|
||||||
).get("data", {})
|
|
||||||
if status.get("exited"):
|
|
||||||
return (
|
|
||||||
int(status.get("exitcode", 1)),
|
|
||||||
status.get("out-data", ""),
|
|
||||||
status.get("err-data", ""),
|
|
||||||
)
|
|
||||||
time.sleep(2)
|
|
||||||
return (124, "", "Timed out waiting for guest command")
|
|
||||||
|
|
||||||
failures = []
|
export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"
|
||||||
safe_key = ts_authkey.replace("'", "'\"'\"'")
|
|
||||||
|
|
||||||
for hostname, vmid in targets:
|
nix --version
|
||||||
print(f"\n== Enrolling {hostname} (vmid {vmid}) ==")
|
nix profile install nixpkgs#nixos-rebuild
|
||||||
if not wait_for_guest_agent(vmid):
|
|
||||||
failures.append((hostname, "agent_not_ready", "guest agent not ready"))
|
|
||||||
print(f"ERROR: guest agent not ready for vmid {vmid}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
safe_hostname = hostname.replace("'", "'\"'\"'")
|
- name: Rebuild and bootstrap/reconcile kubeadm cluster
|
||||||
cmd = (
|
env:
|
||||||
"set -e; "
|
NIX_CONFIG: experimental-features = nix-command flakes
|
||||||
f"hostnamectl set-hostname '{safe_hostname}' || true; "
|
FAST_MODE: "1"
|
||||||
"install -d -m 700 /var/lib/tailscale; "
|
WORKER_PARALLELISM: "3"
|
||||||
"rm -f /var/lib/tailscale/tailscaled.state; "
|
REBUILD_TIMEOUT: "45m"
|
||||||
"systemctl restart tailscaled; "
|
REBUILD_RETRIES: "2"
|
||||||
f"/run/current-system/sw/bin/tailscale up --reset --auth-key='{safe_key}' --hostname='{safe_hostname}'; "
|
run: |
|
||||||
"/run/current-system/sw/bin/tailscale status || true"
|
if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
)
|
. "$HOME/.nix-profile/etc/profile.d/nix.sh"
|
||||||
|
elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
|
||||||
|
. /root/.nix-profile/etc/profile.d/nix.sh
|
||||||
|
fi
|
||||||
|
|
||||||
exitcode, stdout, stderr = exec_guest(vmid, cmd)
|
export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
|
||||||
if stdout:
|
|
||||||
print(stdout)
|
|
||||||
if stderr:
|
|
||||||
print(stderr, file=sys.stderr)
|
|
||||||
|
|
||||||
if exitcode != 0:
|
./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
|
||||||
failures.append((hostname, "command_failed", f"command failed exit {exitcode}"))
|
|
||||||
print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})")
|
|
||||||
|
|
||||||
if failures:
|
|
||||||
print("\nEnrollment failures:")
|
|
||||||
for hostname, kind, detail in failures:
|
|
||||||
print(f"- {hostname}: {detail}")
|
|
||||||
|
|
||||||
only_agent_ready_failures = all(kind == "agent_not_ready" for _, kind, _ in failures)
|
|
||||||
if only_agent_ready_failures and not enroll_strict:
|
|
||||||
print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.")
|
|
||||||
raise SystemExit(0)
|
|
||||||
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
print("\nTailscale enrollment completed for all managed VMs")
|
|
||||||
PY
|
|
||||||
|
|
||||||
python3 -u enroll_tailscale.py
|
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ on:
|
|||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- all
|
- all
|
||||||
- alpacas
|
- control-planes
|
||||||
- llamas
|
- workers
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: terraform-global
|
group: terraform-global
|
||||||
@@ -43,6 +43,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
cat > secrets.auto.tfvars << EOF
|
cat > secrets.auto.tfvars << EOF
|
||||||
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
||||||
|
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
||||||
EOF
|
EOF
|
||||||
cat > backend.hcl << EOF
|
cat > backend.hcl << EOF
|
||||||
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
||||||
@@ -64,6 +65,7 @@ jobs:
|
|||||||
uses: hashicorp/setup-terraform@v2
|
uses: hashicorp/setup-terraform@v2
|
||||||
with:
|
with:
|
||||||
terraform_version: 1.6.6
|
terraform_version: 1.6.6
|
||||||
|
terraform_wrapper: false
|
||||||
|
|
||||||
- name: Terraform Init
|
- name: Terraform Init
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
@@ -72,15 +74,16 @@ jobs:
|
|||||||
- name: Terraform Destroy Plan
|
- name: Terraform Destroy Plan
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
case "${{ inputs.target }}" in
|
case "${{ inputs.target }}" in
|
||||||
all)
|
all)
|
||||||
terraform plan -destroy -out=tfdestroy
|
TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -out=tfdestroy"
|
||||||
;;
|
;;
|
||||||
alpacas)
|
control-planes)
|
||||||
terraform plan -destroy -target=proxmox_vm_qemu.alpacas -out=tfdestroy
|
TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -target=proxmox_vm_qemu.control_planes -out=tfdestroy"
|
||||||
;;
|
;;
|
||||||
llamas)
|
workers)
|
||||||
terraform plan -destroy -target=proxmox_vm_qemu.llamas -out=tfdestroy
|
TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -target=proxmox_vm_qemu.workers -out=tfdestroy"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Invalid destroy target: ${{ inputs.target }}"
|
echo "Invalid destroy target: ${{ inputs.target }}"
|
||||||
@@ -88,6 +91,36 @@ jobs:
|
|||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
for attempt in 1 2; do
|
||||||
|
echo "Terraform destroy plan attempt $attempt/2"
|
||||||
|
if timeout 20m sh -c "$TF_PLAN_CMD"; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [ "$attempt" -eq 1 ]; then
|
||||||
|
echo "Destroy plan attempt failed or timed out; retrying in 20s"
|
||||||
|
sleep 20
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Terraform destroy plan failed after retries"
|
||||||
|
exit 1
|
||||||
|
|
||||||
- name: Terraform Destroy Apply
|
- name: Terraform Destroy Apply
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: terraform apply -auto-approve tfdestroy
|
run: |
|
||||||
|
set +e
|
||||||
|
terraform apply -auto-approve tfdestroy 2>&1 | tee destroy-apply.log
|
||||||
|
APPLY_EXIT=${PIPESTATUS[0]}
|
||||||
|
|
||||||
|
if [ "$APPLY_EXIT" -ne 0 ] && [ -f errored.tfstate ] && grep -q "Failed to persist state to backend" destroy-apply.log; then
|
||||||
|
echo "Detected backend state write failure after destroy; attempting recovery push..."
|
||||||
|
terraform state push errored.tfstate
|
||||||
|
PUSH_EXIT=$?
|
||||||
|
|
||||||
|
if [ "$PUSH_EXIT" -eq 0 ]; then
|
||||||
|
echo "Recovered by pushing errored.tfstate to backend."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
exit "$APPLY_EXIT"
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ jobs:
|
|||||||
echo "PM_API_TOKEN_SECRET length: $(echo -n '${{ secrets.PM_API_TOKEN_SECRET }}' | wc -c)"
|
echo "PM_API_TOKEN_SECRET length: $(echo -n '${{ secrets.PM_API_TOKEN_SECRET }}' | wc -c)"
|
||||||
cat > secrets.auto.tfvars << EOF
|
cat > secrets.auto.tfvars << EOF
|
||||||
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
||||||
SSH_KEY_PUBLIC = "${{ secrets.SSH_KEY_PUBLIC }}"
|
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
||||||
EOF
|
EOF
|
||||||
cat > backend.hcl << EOF
|
cat > backend.hcl << EOF
|
||||||
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
||||||
@@ -51,6 +51,7 @@ jobs:
|
|||||||
uses: hashicorp/setup-terraform@v2
|
uses: hashicorp/setup-terraform@v2
|
||||||
with:
|
with:
|
||||||
terraform_version: 1.6.6
|
terraform_version: 1.6.6
|
||||||
|
terraform_wrapper: false
|
||||||
|
|
||||||
- name: Terraform Init
|
- name: Terraform Init
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
@@ -66,7 +67,20 @@ jobs:
|
|||||||
|
|
||||||
- name: Terraform Plan
|
- name: Terraform Plan
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: terraform plan -out=tfplan
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
for attempt in 1 2; do
|
||||||
|
echo "Terraform plan attempt $attempt/2"
|
||||||
|
if timeout 20m terraform plan -refresh=false -parallelism=1 -out=tfplan; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [ "$attempt" -eq 1 ]; then
|
||||||
|
echo "Plan attempt failed or timed out; retrying in 20s"
|
||||||
|
sleep 20
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
echo "Terraform plan failed after retries"
|
||||||
|
exit 1
|
||||||
|
|
||||||
- name: Block accidental destroy
|
- name: Block accidental destroy
|
||||||
env:
|
env:
|
||||||
@@ -81,8 +95,7 @@ jobs:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
- name: Upload Terraform Plan
|
# NOTE: Disabled artifact upload for now.
|
||||||
uses: actions/upload-artifact@v3
|
# On this Gitea/act runner, post-job hooks from artifact actions can
|
||||||
with:
|
# fail during "Complete job" even when all Terraform steps succeeded.
|
||||||
name: terraform-plan
|
# Re-enable once runner/action compatibility is confirmed.
|
||||||
path: terraform/tfplan
|
|
||||||
|
|||||||
169
nixos/kubeadm/README.md
Normal file
169
nixos/kubeadm/README.md
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
# Kubeadm Cluster Layout (NixOS)
|
||||||
|
|
||||||
|
This folder defines role-based NixOS configs for a kubeadm cluster.
|
||||||
|
|
||||||
|
## Topology
|
||||||
|
|
||||||
|
- Control planes: `cp-1`, `cp-2`, `cp-3`
|
||||||
|
- Workers: `wk-1`, `wk-2`, `wk-3`
|
||||||
|
|
||||||
|
## What this provides
|
||||||
|
|
||||||
|
- Shared Kubernetes/node prerequisites in `modules/k8s-common.nix`
|
||||||
|
- Shared cluster defaults in `modules/k8s-cluster-settings.nix`
|
||||||
|
- Role-specific settings for control planes and workers
|
||||||
|
- Generated per-node host configs from `flake.nix` (no duplicated host files)
|
||||||
|
- Bootstrap helper commands on each node:
|
||||||
|
- `th-kubeadm-init`
|
||||||
|
- `th-kubeadm-join-control-plane`
|
||||||
|
- `th-kubeadm-join-worker`
|
||||||
|
- `th-kubeadm-status`
|
||||||
|
- A Python bootstrap controller for orchestration:
|
||||||
|
- `bootstrap/controller.py`
|
||||||
|
|
||||||
|
## Layered architecture
|
||||||
|
|
||||||
|
- `terraform/`: VM lifecycle only
|
||||||
|
- `nixos/kubeadm/modules/`: declarative node OS config only
|
||||||
|
- `nixos/kubeadm/bootstrap/controller.py`: imperative cluster reconciliation state machine
|
||||||
|
|
||||||
|
## Hardware config files
|
||||||
|
|
||||||
|
The flake automatically imports `hosts/hardware/<host>.nix` if present.
|
||||||
|
Copy each node's generated hardware config into this folder:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nixos-generate-config
|
||||||
|
sudo cp /etc/nixos/hardware-configuration.nix ./hosts/hardware/cp-1.nix
|
||||||
|
```
|
||||||
|
|
||||||
|
Repeat for each node (`cp-2`, `cp-3`, `wk-1`, `wk-2`, `wk-3`).
|
||||||
|
|
||||||
|
## Deploy approach
|
||||||
|
|
||||||
|
Start from one node at a time while experimenting:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nixos-rebuild switch --flake .#cp-1
|
||||||
|
```
|
||||||
|
|
||||||
|
For remote target-host workflows, use your preferred deploy wrapper later
|
||||||
|
(`nixos-rebuild --target-host ...` or deploy-rs/colmena).
|
||||||
|
|
||||||
|
## Bootstrap runbook (kubeadm + kube-vip + Flannel)
|
||||||
|
|
||||||
|
1. Apply Nix config on all nodes (`cp-*`, then `wk-*`).
|
||||||
|
2. On `cp-1`, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo th-kubeadm-init
|
||||||
|
```
|
||||||
|
|
||||||
|
This infers the control-plane VIP as `<node-subnet>.250` on `eth0`, creates the
|
||||||
|
kube-vip static pod manifest, and runs `kubeadm init`.
|
||||||
|
|
||||||
|
3. Install Flannel from `cp-1`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/v0.25.5/Documentation/kube-flannel.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Generate join commands on `cp-1`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo kubeadm token create --print-join-command
|
||||||
|
sudo kubeadm init phase upload-certs --upload-certs
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Join `cp-2` and `cp-3`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo th-kubeadm-join-control-plane '<kubeadm join ... --control-plane --certificate-key ...>'
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Join workers:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo th-kubeadm-join-worker '<kubeadm join ...>'
|
||||||
|
```
|
||||||
|
|
||||||
|
7. Validate from a control plane:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl get nodes -o wide
|
||||||
|
kubectl -n kube-system get pods -o wide
|
||||||
|
```
|
||||||
|
|
||||||
|
## Fresh bootstrap flow (recommended)
|
||||||
|
|
||||||
|
1. Copy and edit inventory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp ./scripts/inventory.example.env ./scripts/inventory.env
|
||||||
|
$EDITOR ./scripts/inventory.env
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Rebuild all nodes and bootstrap a fresh cluster:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/rebuild-and-bootstrap.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Optional tuning env vars:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
FAST_MODE=1 WORKER_PARALLELISM=3 REBUILD_TIMEOUT=45m REBUILD_RETRIES=2 ./scripts/rebuild-and-bootstrap.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
- `FAST_MODE=1` skips pre-rebuild remote GC cleanup to reduce wall-clock time.
|
||||||
|
- Set `FAST_MODE=0` for a slower but more aggressive space cleanup pass.
|
||||||
|
|
||||||
|
### Bootstrap controller state
|
||||||
|
|
||||||
|
The controller stores checkpoints in both places:
|
||||||
|
|
||||||
|
- Remote (source of truth): `/var/lib/terrahome/bootstrap-state.json` on `cp-1`
|
||||||
|
- Local copy (workflow/debug artifact): `nixos/kubeadm/bootstrap/bootstrap-state-last.json`
|
||||||
|
|
||||||
|
This makes retries resumable and keeps failure context visible from CI.
|
||||||
|
|
||||||
|
3. If you only want to reset Kubernetes state on existing VMs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/reset-cluster-nodes.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
For a full nuke/recreate lifecycle:
|
||||||
|
- run Terraform destroy/apply for VMs first,
|
||||||
|
- then run `./scripts/rebuild-and-bootstrap.sh` again.
|
||||||
|
|
||||||
|
Node lists now come directly from static Terraform outputs, so bootstrap no longer
|
||||||
|
depends on Proxmox guest-agent IP discovery or SSH subnet scanning.
|
||||||
|
|
||||||
|
## Optional Gitea workflow automation
|
||||||
|
|
||||||
|
Primary flow:
|
||||||
|
|
||||||
|
- Push to `master` triggers `.gitea/workflows/terraform-apply.yml`
|
||||||
|
- That workflow now does Terraform apply and then runs a fresh kubeadm bootstrap automatically
|
||||||
|
|
||||||
|
Manual dispatch workflows are available:
|
||||||
|
|
||||||
|
- `.gitea/workflows/kubeadm-bootstrap.yml`
|
||||||
|
- `.gitea/workflows/kubeadm-reset.yml`
|
||||||
|
|
||||||
|
Required repository secrets:
|
||||||
|
|
||||||
|
- Existing Terraform/backend secrets used by current workflows (`B2_*`, `PM_API_TOKEN_SECRET`, `SSH_KEY_PUBLIC`)
|
||||||
|
- SSH private key: prefer `KUBEADM_SSH_PRIVATE_KEY`, fallback to existing `SSH_KEY_PRIVATE`
|
||||||
|
|
||||||
|
Optional secrets:
|
||||||
|
|
||||||
|
- `KUBEADM_SSH_USER` (defaults to `micqdf`)
|
||||||
|
Node IPs are rendered directly from static Terraform outputs (`control_plane_vm_ipv4`, `worker_vm_ipv4`), so you do not need per-node IP secrets or SSH discovery fallbacks.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Scripts are intentionally manual-triggered (predictable for homelab bring-up).
|
||||||
|
- If `.250` on the node subnet is already in use, change `controlPlaneVipSuffix`
|
||||||
|
in `modules/k8s-cluster-settings.nix` before bootstrap.
|
||||||
447
nixos/kubeadm/bootstrap/controller.py
Executable file
447
nixos/kubeadm/bootstrap/controller.py
Executable file
@@ -0,0 +1,447 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shlex
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def run_local(cmd, check=True, capture=False):
|
||||||
|
if isinstance(cmd, str):
|
||||||
|
shell = True
|
||||||
|
else:
|
||||||
|
shell = False
|
||||||
|
return subprocess.run(
|
||||||
|
cmd,
|
||||||
|
shell=shell,
|
||||||
|
check=check,
|
||||||
|
text=True,
|
||||||
|
capture_output=capture,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_inventory(inventory_file):
|
||||||
|
inventory_file = Path(inventory_file).resolve()
|
||||||
|
if not inventory_file.exists():
|
||||||
|
raise RuntimeError(f"Missing inventory file: {inventory_file}")
|
||||||
|
cmd = (
|
||||||
|
"set -a; "
|
||||||
|
f"source {shlex.quote(str(inventory_file))}; "
|
||||||
|
"python3 - <<'PY'\n"
|
||||||
|
"import json, os\n"
|
||||||
|
"print(json.dumps(dict(os.environ)))\n"
|
||||||
|
"PY"
|
||||||
|
)
|
||||||
|
proc = run_local(["bash", "-lc", cmd], capture=True)
|
||||||
|
env = json.loads(proc.stdout)
|
||||||
|
|
||||||
|
node_ips = {}
|
||||||
|
cp_names = []
|
||||||
|
wk_names = []
|
||||||
|
|
||||||
|
control_planes = env.get("CONTROL_PLANES", "").strip()
|
||||||
|
workers = env.get("WORKERS", "").strip()
|
||||||
|
|
||||||
|
if control_planes:
|
||||||
|
for pair in control_planes.split():
|
||||||
|
name, ip = pair.split("=", 1)
|
||||||
|
node_ips[name] = ip
|
||||||
|
cp_names.append(name)
|
||||||
|
else:
|
||||||
|
for key in sorted(k for k in env if k.startswith("CP_") and k[3:].isdigit()):
|
||||||
|
idx = key.split("_", 1)[1]
|
||||||
|
name = f"cp-{idx}"
|
||||||
|
node_ips[name] = env[key]
|
||||||
|
cp_names.append(name)
|
||||||
|
|
||||||
|
if workers:
|
||||||
|
for pair in workers.split():
|
||||||
|
name, ip = pair.split("=", 1)
|
||||||
|
node_ips[name] = ip
|
||||||
|
wk_names.append(name)
|
||||||
|
else:
|
||||||
|
for key in sorted(k for k in env if k.startswith("WK_") and k[3:].isdigit()):
|
||||||
|
idx = key.split("_", 1)[1]
|
||||||
|
name = f"wk-{idx}"
|
||||||
|
node_ips[name] = env[key]
|
||||||
|
wk_names.append(name)
|
||||||
|
|
||||||
|
if not cp_names or not wk_names:
|
||||||
|
raise RuntimeError("Inventory must include control planes and workers")
|
||||||
|
|
||||||
|
primary_cp = env.get("PRIMARY_CONTROL_PLANE", "cp-1")
|
||||||
|
if primary_cp not in node_ips:
|
||||||
|
primary_cp = cp_names[0]
|
||||||
|
|
||||||
|
return {
|
||||||
|
"env": env,
|
||||||
|
"node_ips": node_ips,
|
||||||
|
"cp_names": cp_names,
|
||||||
|
"wk_names": wk_names,
|
||||||
|
"primary_cp": primary_cp,
|
||||||
|
"inventory_file": str(inventory_file),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class Controller:
|
||||||
|
def __init__(self, cfg):
|
||||||
|
self.env = cfg["env"]
|
||||||
|
self.node_ips = cfg["node_ips"]
|
||||||
|
self.cp_names = cfg["cp_names"]
|
||||||
|
self.wk_names = cfg["wk_names"]
|
||||||
|
self.primary_cp = cfg["primary_cp"]
|
||||||
|
self.primary_ip = self.node_ips[self.primary_cp]
|
||||||
|
|
||||||
|
self.script_dir = Path(__file__).resolve().parent
|
||||||
|
self.flake_dir = Path(self.env.get("FLAKE_DIR") or (self.script_dir.parent)).resolve()
|
||||||
|
|
||||||
|
self.ssh_user = self.env.get("SSH_USER", "micqdf")
|
||||||
|
self.ssh_candidates = self.env.get("SSH_USER_CANDIDATES", f"root {self.ssh_user}").split()
|
||||||
|
self.active_ssh_user = self.ssh_user
|
||||||
|
self.ssh_key = self.env.get("SSH_KEY_PATH", str(Path.home() / ".ssh" / "id_ed25519"))
|
||||||
|
self.ssh_opts = [
|
||||||
|
"-o",
|
||||||
|
"BatchMode=yes",
|
||||||
|
"-o",
|
||||||
|
"IdentitiesOnly=yes",
|
||||||
|
"-o",
|
||||||
|
"StrictHostKeyChecking=no",
|
||||||
|
"-o",
|
||||||
|
"UserKnownHostsFile=/dev/null",
|
||||||
|
"-i",
|
||||||
|
self.ssh_key,
|
||||||
|
]
|
||||||
|
|
||||||
|
self.rebuild_timeout = self.env.get("REBUILD_TIMEOUT", "45m")
|
||||||
|
self.rebuild_retries = int(self.env.get("REBUILD_RETRIES", "2"))
|
||||||
|
self.worker_parallelism = int(self.env.get("WORKER_PARALLELISM", "3"))
|
||||||
|
self.fast_mode = self.env.get("FAST_MODE", "1")
|
||||||
|
self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
|
||||||
|
self.force_reinit = True
|
||||||
|
self.ssh_ready_retries = int(self.env.get("SSH_READY_RETRIES", "20"))
|
||||||
|
self.ssh_ready_delay = int(self.env.get("SSH_READY_DELAY_SEC", "15"))
|
||||||
|
|
||||||
|
def log(self, msg):
|
||||||
|
print(f"==> {msg}")
|
||||||
|
|
||||||
|
def _ssh(self, user, ip, cmd, check=True):
|
||||||
|
full = ["ssh", *self.ssh_opts, f"{user}@{ip}", f"bash -lc {shlex.quote(cmd)}"]
|
||||||
|
return run_local(full, check=check, capture=True)
|
||||||
|
|
||||||
|
def detect_user(self, ip):
|
||||||
|
for attempt in range(1, self.ssh_ready_retries + 1):
|
||||||
|
for user in self.ssh_candidates:
|
||||||
|
proc = self._ssh(user, ip, "true", check=False)
|
||||||
|
if proc.returncode == 0:
|
||||||
|
self.active_ssh_user = user
|
||||||
|
self.log(f"Using SSH user '{user}' for {ip}")
|
||||||
|
return
|
||||||
|
if attempt < self.ssh_ready_retries:
|
||||||
|
self.log(
|
||||||
|
f"SSH not ready on {ip} yet; retrying in {self.ssh_ready_delay}s "
|
||||||
|
f"({attempt}/{self.ssh_ready_retries})"
|
||||||
|
)
|
||||||
|
time.sleep(self.ssh_ready_delay)
|
||||||
|
raise RuntimeError(
|
||||||
|
"Unable to authenticate to "
|
||||||
|
f"{ip} with users: {', '.join(self.ssh_candidates)}. "
|
||||||
|
"If this is a freshly cloned VM, the Proxmox source template likely does not yet include the "
|
||||||
|
"current cloud-init-capable NixOS template configuration from nixos/template-base. "
|
||||||
|
"Terraform can only clone what exists in Proxmox; it cannot retrofit cloud-init support into an old template."
|
||||||
|
)
|
||||||
|
|
||||||
|
def remote(self, ip, cmd, check=True):
|
||||||
|
ordered = [self.active_ssh_user] + [u for u in self.ssh_candidates if u != self.active_ssh_user]
|
||||||
|
last = None
|
||||||
|
for user in ordered:
|
||||||
|
proc = self._ssh(user, ip, cmd, check=False)
|
||||||
|
if proc.returncode == 0:
|
||||||
|
self.active_ssh_user = user
|
||||||
|
return proc
|
||||||
|
if proc.returncode != 255:
|
||||||
|
last = proc
|
||||||
|
break
|
||||||
|
last = proc
|
||||||
|
if check:
|
||||||
|
stdout = (last.stdout or "").strip()
|
||||||
|
stderr = (last.stderr or "").strip()
|
||||||
|
raise RuntimeError(f"Remote command failed on {ip}: {cmd}\n{stdout}\n{stderr}")
|
||||||
|
return last
|
||||||
|
|
||||||
|
def prepare_known_hosts(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def prepare_remote_nix(self, ip):
|
||||||
|
self.remote(ip, "sudo mkdir -p /etc/nix")
|
||||||
|
self.remote(ip, "if [ -f /etc/nix/nix.conf ]; then sudo sed -i '/^trusted-users[[:space:]]*=/d' /etc/nix/nix.conf; fi")
|
||||||
|
self.remote(ip, "echo 'trusted-users = root micqdf' | sudo tee -a /etc/nix/nix.conf >/dev/null")
|
||||||
|
self.remote(ip, "sudo systemctl restart nix-daemon 2>/dev/null || true")
|
||||||
|
|
||||||
|
def prepare_remote_kubelet(self, ip):
|
||||||
|
self.remote(ip, "sudo systemctl stop kubelet >/dev/null 2>&1 || true")
|
||||||
|
self.remote(ip, "sudo systemctl disable kubelet >/dev/null 2>&1 || true")
|
||||||
|
self.remote(ip, "sudo systemctl mask kubelet >/dev/null 2>&1 || true")
|
||||||
|
self.remote(ip, "sudo systemctl reset-failed kubelet >/dev/null 2>&1 || true")
|
||||||
|
self.remote(ip, "sudo rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env || true")
|
||||||
|
|
||||||
|
def prepare_remote_space(self, ip):
|
||||||
|
self.remote(ip, "sudo nix-collect-garbage -d || true")
|
||||||
|
self.remote(ip, "sudo nix --extra-experimental-features nix-command store gc || true")
|
||||||
|
self.remote(ip, "sudo rm -rf /tmp/nix* /tmp/nixos-rebuild* || true")
|
||||||
|
|
||||||
|
def rebuild_node_once(self, name, ip):
|
||||||
|
self.detect_user(ip)
|
||||||
|
cmd = [
|
||||||
|
"timeout",
|
||||||
|
self.rebuild_timeout,
|
||||||
|
"nixos-rebuild",
|
||||||
|
"switch",
|
||||||
|
"--flake",
|
||||||
|
f"{self.flake_dir}#{name}",
|
||||||
|
"--target-host",
|
||||||
|
f"{self.active_ssh_user}@{ip}",
|
||||||
|
"--use-remote-sudo",
|
||||||
|
]
|
||||||
|
env = os.environ.copy()
|
||||||
|
env["NIX_SSHOPTS"] = " ".join(self.ssh_opts)
|
||||||
|
proc = subprocess.run(cmd, text=True, env=env)
|
||||||
|
return proc.returncode == 0
|
||||||
|
|
||||||
|
def rebuild_with_retry(self, name, ip):
|
||||||
|
max_attempts = self.rebuild_retries + 1
|
||||||
|
for attempt in range(1, max_attempts + 1):
|
||||||
|
self.log(f"Rebuild attempt {attempt}/{max_attempts} for {name}")
|
||||||
|
if self.rebuild_node_once(name, ip):
|
||||||
|
return
|
||||||
|
if attempt < max_attempts:
|
||||||
|
self.log(f"Rebuild failed for {name}, retrying in 20s")
|
||||||
|
time.sleep(20)
|
||||||
|
raise RuntimeError(f"Rebuild failed permanently for {name}")
|
||||||
|
|
||||||
|
def stage_preflight(self):
|
||||||
|
self.prepare_known_hosts()
|
||||||
|
self.detect_user(self.primary_ip)
|
||||||
|
|
||||||
|
def stage_rebuild(self):
|
||||||
|
if self.skip_rebuild:
|
||||||
|
self.log("Node rebuild already complete")
|
||||||
|
return
|
||||||
|
|
||||||
|
self.detect_user(self.primary_ip)
|
||||||
|
for name in self.cp_names:
|
||||||
|
ip = self.node_ips[name]
|
||||||
|
self.log(f"Preparing and rebuilding {name} ({ip})")
|
||||||
|
self.prepare_remote_nix(ip)
|
||||||
|
self.prepare_remote_kubelet(ip)
|
||||||
|
if self.fast_mode != "1":
|
||||||
|
self.prepare_remote_space(ip)
|
||||||
|
self.rebuild_with_retry(name, ip)
|
||||||
|
|
||||||
|
for name in self.wk_names:
|
||||||
|
ip = self.node_ips[name]
|
||||||
|
self.log(f"Preparing {name} ({ip})")
|
||||||
|
self.prepare_remote_nix(ip)
|
||||||
|
self.prepare_remote_kubelet(ip)
|
||||||
|
if self.fast_mode != "1":
|
||||||
|
self.prepare_remote_space(ip)
|
||||||
|
|
||||||
|
failures = []
|
||||||
|
with ThreadPoolExecutor(max_workers=self.worker_parallelism) as pool:
|
||||||
|
futures = {pool.submit(self.rebuild_with_retry, name, self.node_ips[name]): name for name in self.wk_names}
|
||||||
|
for fut in as_completed(futures):
|
||||||
|
name = futures[fut]
|
||||||
|
try:
|
||||||
|
fut.result()
|
||||||
|
except Exception as exc:
|
||||||
|
failures.append((name, str(exc)))
|
||||||
|
if failures:
|
||||||
|
raise RuntimeError(f"Worker rebuild failures: {failures}")
|
||||||
|
|
||||||
|
def has_admin_conf(self):
|
||||||
|
return self.remote(self.primary_ip, "sudo test -f /etc/kubernetes/admin.conf", check=False).returncode == 0
|
||||||
|
|
||||||
|
def cluster_ready(self):
|
||||||
|
cmd = "sudo test -f /etc/kubernetes/admin.conf && sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get --raw=/readyz >/dev/null 2>&1"
|
||||||
|
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
|
||||||
|
|
||||||
|
def stage_init_primary(self):
|
||||||
|
self.log(f"Initializing primary control plane on {self.primary_cp}")
|
||||||
|
self.remote(self.primary_ip, "sudo th-kubeadm-init")
|
||||||
|
|
||||||
|
def stage_install_cni(self):
|
||||||
|
self.log("Installing Flannel")
|
||||||
|
manifest_path = self.script_dir.parent / "manifests" / "kube-flannel.yml"
|
||||||
|
manifest_b64 = base64.b64encode(manifest_path.read_bytes()).decode()
|
||||||
|
|
||||||
|
self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
(
|
||||||
|
"sudo mkdir -p /var/lib/terrahome && "
|
||||||
|
f"echo {shlex.quote(manifest_b64)} | base64 -d | sudo tee /var/lib/terrahome/kube-flannel.yml >/dev/null"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.log("Waiting for API readiness before applying Flannel")
|
||||||
|
ready = False
|
||||||
|
for _ in range(30):
|
||||||
|
if self.cluster_ready():
|
||||||
|
ready = True
|
||||||
|
break
|
||||||
|
time.sleep(10)
|
||||||
|
if not ready:
|
||||||
|
raise RuntimeError("API server did not become ready before Flannel install")
|
||||||
|
|
||||||
|
last_error = None
|
||||||
|
for attempt in range(1, 6):
|
||||||
|
proc = self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f /var/lib/terrahome/kube-flannel.yml",
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
if proc.returncode == 0:
|
||||||
|
return
|
||||||
|
last_error = (proc.stdout or "") + ("\n" if proc.stdout and proc.stderr else "") + (proc.stderr or "")
|
||||||
|
self.log(f"Flannel apply attempt {attempt}/5 failed; retrying in 15s")
|
||||||
|
time.sleep(15)
|
||||||
|
|
||||||
|
raise RuntimeError(f"Flannel apply failed after retries\n{last_error or ''}")
|
||||||
|
|
||||||
|
def cluster_has_node(self, name):
|
||||||
|
cmd = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node {shlex.quote(name)} >/dev/null 2>&1"
|
||||||
|
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
|
||||||
|
|
||||||
|
def build_join_cmds(self):
|
||||||
|
join_cmd = self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm token create --print-join-command",
|
||||||
|
).stdout.strip()
|
||||||
|
cert_key = self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm init phase upload-certs --upload-certs | tail -n 1",
|
||||||
|
).stdout.strip()
|
||||||
|
cp_join = f"{join_cmd} --control-plane --certificate-key {cert_key}"
|
||||||
|
return join_cmd, cp_join
|
||||||
|
|
||||||
|
def stage_join_control_planes(self):
|
||||||
|
_, cp_join = self.build_join_cmds()
|
||||||
|
for node in self.cp_names:
|
||||||
|
if node == self.primary_cp:
|
||||||
|
continue
|
||||||
|
if self.cluster_has_node(node):
|
||||||
|
self.log(f"{node} already joined")
|
||||||
|
continue
|
||||||
|
self.log(f"Joining control plane {node}")
|
||||||
|
ip = self.node_ips[node]
|
||||||
|
node_join = f"{cp_join} --node-name {node} --ignore-preflight-errors=NumCPU,HTTPProxyCIDR"
|
||||||
|
self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")
|
||||||
|
|
||||||
|
def stage_join_workers(self):
|
||||||
|
join_cmd, _ = self.build_join_cmds()
|
||||||
|
for node in self.wk_names:
|
||||||
|
if self.cluster_has_node(node):
|
||||||
|
self.log(f"{node} already joined")
|
||||||
|
continue
|
||||||
|
self.log(f"Joining worker {node}")
|
||||||
|
ip = self.node_ips[node]
|
||||||
|
node_join = f"{join_cmd} --node-name {node} --ignore-preflight-errors=HTTPProxyCIDR"
|
||||||
|
self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}")
|
||||||
|
|
||||||
|
def stage_verify(self):
|
||||||
|
self.log("Final node verification")
|
||||||
|
try:
|
||||||
|
self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel rollout status ds/kube-flannel-ds --timeout=10m",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
self.log("Flannel rollout failed; collecting diagnostics")
|
||||||
|
proc = self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get ds -o wide || true",
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
print(proc.stdout)
|
||||||
|
proc = self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o wide || true",
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
print(proc.stdout)
|
||||||
|
proc = self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do echo \"--- describe $p ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel describe $p || true; done",
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
print(proc.stdout)
|
||||||
|
proc = self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do echo \"--- logs $p kube-flannel ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c kube-flannel --tail=120 || true; echo \"--- logs $p install-cni-plugin ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c install-cni-plugin --tail=120 || true; echo \"--- logs $p install-cni ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c install-cni --tail=120 || true; done",
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
print(proc.stdout)
|
||||||
|
proc = self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs --tail=120 $p || true; done",
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
print(proc.stdout)
|
||||||
|
raise
|
||||||
|
self.remote(
|
||||||
|
self.primary_ip,
|
||||||
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf wait --for=condition=Ready nodes --all --timeout=10m",
|
||||||
|
)
|
||||||
|
proc = self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes -o wide")
|
||||||
|
print(proc.stdout)
|
||||||
|
|
||||||
|
def reconcile(self):
|
||||||
|
self.stage_preflight()
|
||||||
|
self.stage_rebuild()
|
||||||
|
self.stage_init_primary()
|
||||||
|
self.stage_install_cni()
|
||||||
|
self.stage_join_control_planes()
|
||||||
|
self.stage_join_workers()
|
||||||
|
self.stage_verify()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description="TerraHome kubeadm bootstrap controller")
|
||||||
|
parser.add_argument("command", choices=[
|
||||||
|
"reconcile",
|
||||||
|
"preflight",
|
||||||
|
"rebuild",
|
||||||
|
"init-primary",
|
||||||
|
"install-cni",
|
||||||
|
"join-control-planes",
|
||||||
|
"join-workers",
|
||||||
|
"verify",
|
||||||
|
])
|
||||||
|
parser.add_argument("--inventory", default=str(Path(__file__).resolve().parent.parent / "scripts" / "inventory.env"))
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
cfg = load_inventory(args.inventory)
|
||||||
|
ctl = Controller(cfg)
|
||||||
|
|
||||||
|
dispatch = {
|
||||||
|
"reconcile": ctl.reconcile,
|
||||||
|
"preflight": ctl.stage_preflight,
|
||||||
|
"rebuild": ctl.stage_rebuild,
|
||||||
|
"init-primary": ctl.stage_init_primary,
|
||||||
|
"install-cni": ctl.stage_install_cni,
|
||||||
|
"join-control-planes": ctl.stage_join_control_planes,
|
||||||
|
"join-workers": ctl.stage_join_workers,
|
||||||
|
"verify": ctl.stage_verify,
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
dispatch[args.command]()
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"ERROR: {exc}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
27
nixos/kubeadm/flake.lock
generated
Normal file
27
nixos/kubeadm/flake.lock
generated
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1767313136,
|
||||||
|
"narHash": "sha256-16KkgfdYqjaeRGBaYsNrhPRRENs0qzkQVUooNHtoy2w=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "ac62194c3917d5f474c1a844b6fd6da2db95077d",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixos-25.05",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": "nixpkgs"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
77
nixos/kubeadm/flake.nix
Normal file
77
nixos/kubeadm/flake.nix
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
{
|
||||||
|
description = "NixOS kubeadm cluster configs";
|
||||||
|
|
||||||
|
inputs = {
|
||||||
|
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
|
||||||
|
};
|
||||||
|
|
||||||
|
outputs = { nixpkgs, ... }:
|
||||||
|
let
|
||||||
|
system = "x86_64-linux";
|
||||||
|
lib = nixpkgs.lib;
|
||||||
|
pkgs = nixpkgs.legacyPackages.${system};
|
||||||
|
nodeNames = [ "cp-1" "cp-2" "cp-3" "wk-1" "wk-2" "wk-3" ];
|
||||||
|
|
||||||
|
mkNode = {
|
||||||
|
name,
|
||||||
|
role,
|
||||||
|
extraModules ? [ ],
|
||||||
|
}:
|
||||||
|
let
|
||||||
|
roleModule = if role == "control-plane" then ./modules/k8s-control-plane.nix else ./modules/k8s-worker.nix;
|
||||||
|
hardwarePath = ./hosts/hardware + "/${name}.nix";
|
||||||
|
in
|
||||||
|
nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
modules = [
|
||||||
|
./modules/k8s-cluster-settings.nix
|
||||||
|
./modules/k8s-common.nix
|
||||||
|
roleModule
|
||||||
|
({ lib, ... }: {
|
||||||
|
imports = lib.optional (builtins.pathExists hardwarePath) hardwarePath;
|
||||||
|
networking.hostName = name;
|
||||||
|
system.stateVersion = "25.05";
|
||||||
|
boot.loader.grub.devices = lib.mkDefault [ "/dev/sda" ];
|
||||||
|
fileSystems."/" = lib.mkDefault {
|
||||||
|
device = "/dev/disk/by-label/nixos";
|
||||||
|
fsType = "ext4";
|
||||||
|
};
|
||||||
|
})
|
||||||
|
] ++ extraModules;
|
||||||
|
};
|
||||||
|
|
||||||
|
mkNodeByName = name:
|
||||||
|
mkNode {
|
||||||
|
inherit name;
|
||||||
|
role = if lib.hasPrefix "cp-" name then "control-plane" else "worker";
|
||||||
|
};
|
||||||
|
|
||||||
|
mkEvalCheck = name:
|
||||||
|
let
|
||||||
|
cfg = mkNode {
|
||||||
|
inherit name;
|
||||||
|
role = if lib.hasPrefix "cp-" name then "control-plane" else "worker";
|
||||||
|
extraModules = [
|
||||||
|
({ lib, ... }: {
|
||||||
|
boot.loader.grub.devices = lib.mkDefault [ "/dev/sda" ];
|
||||||
|
fileSystems."/" = lib.mkDefault {
|
||||||
|
device = "/dev/disk/by-label/nixos";
|
||||||
|
fsType = "ext4";
|
||||||
|
};
|
||||||
|
})
|
||||||
|
];
|
||||||
|
};
|
||||||
|
in
|
||||||
|
pkgs.runCommand "eval-${name}" { } ''
|
||||||
|
cat > "$out" <<'EOF'
|
||||||
|
host=${cfg.config.networking.hostName}
|
||||||
|
role=${if lib.hasPrefix "cp-" name then "control-plane" else "worker"}
|
||||||
|
stateVersion=${cfg.config.system.stateVersion}
|
||||||
|
EOF
|
||||||
|
'';
|
||||||
|
in {
|
||||||
|
nixosConfigurations = lib.genAttrs nodeNames mkNodeByName;
|
||||||
|
|
||||||
|
checks.${system} = lib.genAttrs nodeNames mkEvalCheck;
|
||||||
|
};
|
||||||
|
}
|
||||||
0
nixos/kubeadm/hosts/hardware/.gitkeep
Normal file
0
nixos/kubeadm/hosts/hardware/.gitkeep
Normal file
212
nixos/kubeadm/manifests/kube-flannel.yml
Normal file
212
nixos/kubeadm/manifests/kube-flannel.yml
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
---
|
||||||
|
kind: Namespace
|
||||||
|
apiVersion: v1
|
||||||
|
metadata:
|
||||||
|
name: kube-flannel
|
||||||
|
labels:
|
||||||
|
k8s-app: flannel
|
||||||
|
pod-security.kubernetes.io/enforce: privileged
|
||||||
|
---
|
||||||
|
kind: ClusterRole
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
k8s-app: flannel
|
||||||
|
name: flannel
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- pods
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- nodes
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- nodes/status
|
||||||
|
verbs:
|
||||||
|
- patch
|
||||||
|
---
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
k8s-app: flannel
|
||||||
|
name: flannel
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: flannel
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: flannel
|
||||||
|
namespace: kube-flannel
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
k8s-app: flannel
|
||||||
|
name: flannel
|
||||||
|
namespace: kube-flannel
|
||||||
|
---
|
||||||
|
kind: ConfigMap
|
||||||
|
apiVersion: v1
|
||||||
|
metadata:
|
||||||
|
name: kube-flannel-cfg
|
||||||
|
namespace: kube-flannel
|
||||||
|
labels:
|
||||||
|
tier: node
|
||||||
|
k8s-app: flannel
|
||||||
|
app: flannel
|
||||||
|
data:
|
||||||
|
cni-conf.json: |
|
||||||
|
{
|
||||||
|
"name": "cbr0",
|
||||||
|
"cniVersion": "0.3.1",
|
||||||
|
"plugins": [
|
||||||
|
{
|
||||||
|
"type": "flannel",
|
||||||
|
"delegate": {
|
||||||
|
"hairpinMode": true,
|
||||||
|
"isDefaultGateway": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "portmap",
|
||||||
|
"capabilities": {
|
||||||
|
"portMappings": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
net-conf.json: |
|
||||||
|
{
|
||||||
|
"Network": "10.244.0.0/16",
|
||||||
|
"EnableNFTables": false,
|
||||||
|
"Backend": {
|
||||||
|
"Type": "vxlan"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: kube-flannel-ds
|
||||||
|
namespace: kube-flannel
|
||||||
|
labels:
|
||||||
|
tier: node
|
||||||
|
app: flannel
|
||||||
|
k8s-app: flannel
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: flannel
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
tier: node
|
||||||
|
app: flannel
|
||||||
|
spec:
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: kubernetes.io/os
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- linux
|
||||||
|
hostNetwork: true
|
||||||
|
priorityClassName: system-node-critical
|
||||||
|
tolerations:
|
||||||
|
- operator: Exists
|
||||||
|
effect: NoSchedule
|
||||||
|
serviceAccountName: flannel
|
||||||
|
initContainers:
|
||||||
|
- name: install-cni-plugin
|
||||||
|
image: docker.io/flannel/flannel-cni-plugin:v1.5.1-flannel1
|
||||||
|
command:
|
||||||
|
- cp
|
||||||
|
args:
|
||||||
|
- -f
|
||||||
|
- /flannel
|
||||||
|
- /opt/cni/bin/flannel
|
||||||
|
volumeMounts:
|
||||||
|
- name: cni-plugin
|
||||||
|
mountPath: /opt/cni/bin
|
||||||
|
- name: install-cni
|
||||||
|
image: docker.io/flannel/flannel:v0.25.5
|
||||||
|
command:
|
||||||
|
- cp
|
||||||
|
args:
|
||||||
|
- -f
|
||||||
|
- /etc/kube-flannel/cni-conf.json
|
||||||
|
- /etc/cni/net.d/10-flannel.conflist
|
||||||
|
volumeMounts:
|
||||||
|
- name: cni
|
||||||
|
mountPath: /etc/cni/net.d
|
||||||
|
- name: flannel-cfg
|
||||||
|
mountPath: /etc/kube-flannel/
|
||||||
|
containers:
|
||||||
|
- name: kube-flannel
|
||||||
|
image: docker.io/flannel/flannel:v0.25.5
|
||||||
|
command:
|
||||||
|
- /opt/bin/flanneld
|
||||||
|
args:
|
||||||
|
- --ip-masq
|
||||||
|
- --kube-subnet-mgr
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: "100m"
|
||||||
|
memory: "50Mi"
|
||||||
|
securityContext:
|
||||||
|
privileged: false
|
||||||
|
capabilities:
|
||||||
|
add: ["NET_ADMIN", "NET_RAW"]
|
||||||
|
env:
|
||||||
|
- name: POD_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.name
|
||||||
|
- name: POD_NAMESPACE
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.namespace
|
||||||
|
- name: EVENT_QUEUE_DEPTH
|
||||||
|
value: "5000"
|
||||||
|
volumeMounts:
|
||||||
|
- name: run
|
||||||
|
mountPath: /run/flannel
|
||||||
|
- name: flannel-cfg
|
||||||
|
mountPath: /etc/kube-flannel/
|
||||||
|
- name: xtables-lock
|
||||||
|
mountPath: /run/xtables.lock
|
||||||
|
volumes:
|
||||||
|
- name: run
|
||||||
|
hostPath:
|
||||||
|
path: /run/flannel
|
||||||
|
type: DirectoryOrCreate
|
||||||
|
- name: cni-plugin
|
||||||
|
hostPath:
|
||||||
|
path: /opt/cni/bin
|
||||||
|
type: DirectoryOrCreate
|
||||||
|
- name: cni
|
||||||
|
hostPath:
|
||||||
|
path: /etc/cni/net.d
|
||||||
|
type: DirectoryOrCreate
|
||||||
|
- name: flannel-cfg
|
||||||
|
configMap:
|
||||||
|
name: kube-flannel-cfg
|
||||||
|
- name: xtables-lock
|
||||||
|
hostPath:
|
||||||
|
path: /run/xtables.lock
|
||||||
|
type: FileOrCreate
|
||||||
12
nixos/kubeadm/modules/k8s-cluster-settings.nix
Normal file
12
nixos/kubeadm/modules/k8s-cluster-settings.nix
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{ ... }:
|
||||||
|
|
||||||
|
{
|
||||||
|
terrahome.kubeadm = {
|
||||||
|
k8sMinor = "1.31";
|
||||||
|
controlPlaneInterface = "eth0";
|
||||||
|
controlPlaneVipSuffix = 250;
|
||||||
|
podSubnet = "10.244.0.0/16";
|
||||||
|
serviceSubnet = "10.96.0.0/12";
|
||||||
|
clusterDomain = "cluster.local";
|
||||||
|
};
|
||||||
|
}
|
||||||
420
nixos/kubeadm/modules/k8s-common.nix
Normal file
420
nixos/kubeadm/modules/k8s-common.nix
Normal file
@@ -0,0 +1,420 @@
|
|||||||
|
{ config, lib, pkgs, ... }:
|
||||||
|
|
||||||
|
let
|
||||||
|
pinnedK8s = lib.attrByPath [ "kubernetes_1_31" ] pkgs.kubernetes pkgs;
|
||||||
|
kubeVipImage = "ghcr.io/kube-vip/kube-vip:v0.8.9";
|
||||||
|
in
|
||||||
|
{
|
||||||
|
options.terrahome.kubeadm = {
|
||||||
|
k8sMinor = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "1.31";
|
||||||
|
};
|
||||||
|
|
||||||
|
controlPlaneInterface = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "eth0";
|
||||||
|
};
|
||||||
|
|
||||||
|
controlPlaneVipSuffix = lib.mkOption {
|
||||||
|
type = lib.types.int;
|
||||||
|
default = 250;
|
||||||
|
};
|
||||||
|
|
||||||
|
podSubnet = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "10.244.0.0/16";
|
||||||
|
};
|
||||||
|
|
||||||
|
serviceSubnet = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "10.96.0.0/12";
|
||||||
|
};
|
||||||
|
|
||||||
|
clusterDomain = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "cluster.local";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
config = {
|
||||||
|
boot.kernelModules = [ "overlay" "br_netfilter" ];
|
||||||
|
|
||||||
|
boot.kernel.sysctl = {
|
||||||
|
"net.ipv4.ip_forward" = 1;
|
||||||
|
"net.bridge.bridge-nf-call-iptables" = 1;
|
||||||
|
"net.bridge.bridge-nf-call-ip6tables" = 1;
|
||||||
|
};
|
||||||
|
|
||||||
|
virtualisation.containerd.enable = true;
|
||||||
|
virtualisation.containerd.settings = {
|
||||||
|
plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options.SystemdCgroup = true;
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = lib.mkForce [ ];
|
||||||
|
|
||||||
|
services.openssh.enable = true;
|
||||||
|
services.openssh.settings = {
|
||||||
|
PasswordAuthentication = false;
|
||||||
|
KbdInteractiveAuthentication = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
users.users.micqdf = {
|
||||||
|
isNormalUser = true;
|
||||||
|
extraGroups = [ "wheel" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
security.sudo.wheelNeedsPassword = false;
|
||||||
|
|
||||||
|
nix.settings.trusted-users = [ "root" "micqdf" ];
|
||||||
|
nix.gc = {
|
||||||
|
automatic = true;
|
||||||
|
dates = "daily";
|
||||||
|
options = "--delete-older-than 3d";
|
||||||
|
};
|
||||||
|
nix.settings.auto-optimise-store = true;
|
||||||
|
|
||||||
|
environment.variables = {
|
||||||
|
KUBECONFIG = "/etc/kubernetes/admin.conf";
|
||||||
|
KUBE_VIP_IMAGE = kubeVipImage;
|
||||||
|
};
|
||||||
|
|
||||||
|
environment.systemPackages = (with pkgs; [
|
||||||
|
containerd
|
||||||
|
cri-tools
|
||||||
|
cni-plugins
|
||||||
|
pinnedK8s
|
||||||
|
kubernetes-helm
|
||||||
|
conntrack-tools
|
||||||
|
socat
|
||||||
|
ethtool
|
||||||
|
ipvsadm
|
||||||
|
iproute2
|
||||||
|
iptables
|
||||||
|
ebtables
|
||||||
|
jq
|
||||||
|
curl
|
||||||
|
vim
|
||||||
|
gawk
|
||||||
|
]) ++ [
|
||||||
|
(pkgs.writeShellScriptBin "th-kubeadm-init" ''
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
|
||||||
|
|
||||||
|
iface="${config.terrahome.kubeadm.controlPlaneInterface}"
|
||||||
|
if ! ip link show "$iface" >/dev/null 2>&1; then
|
||||||
|
iface="$(ip -o -4 route show to default | awk 'NR==1 {print $5}')"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "''${iface:-}" ]; then
|
||||||
|
echo "Could not determine network interface for kube-vip"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
suffix="${toString config.terrahome.kubeadm.controlPlaneVipSuffix}"
|
||||||
|
pod_subnet="${config.terrahome.kubeadm.podSubnet}"
|
||||||
|
service_subnet="${config.terrahome.kubeadm.serviceSubnet}"
|
||||||
|
domain="${config.terrahome.kubeadm.clusterDomain}"
|
||||||
|
node_name="${config.networking.hostName}"
|
||||||
|
|
||||||
|
local_ip_cidr=$(ip -4 -o addr show dev "$iface" | awk 'NR==1 {print $4}')
|
||||||
|
if [ -z "''${local_ip_cidr:-}" ]; then
|
||||||
|
echo "Could not determine IPv4 CIDR on interface $iface"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
subnet_prefix=$(echo "$local_ip_cidr" | cut -d/ -f1 | awk -F. '{print $1"."$2"."$3}')
|
||||||
|
vip="$subnet_prefix.$suffix"
|
||||||
|
|
||||||
|
echo "Using control-plane endpoint: $vip:6443"
|
||||||
|
echo "Using kube-vip interface: $iface"
|
||||||
|
echo "Using kubeadm node name: $node_name"
|
||||||
|
|
||||||
|
hostname "$node_name" || true
|
||||||
|
|
||||||
|
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
||||||
|
|
||||||
|
systemctl unmask kubelet || true
|
||||||
|
systemctl stop kubelet || true
|
||||||
|
systemctl reset-failed kubelet || true
|
||||||
|
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
|
||||||
|
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
|
||||||
|
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
|
||||||
|
rm -rf /var/lib/kubelet/pki
|
||||||
|
|
||||||
|
systemctl daemon-reload
|
||||||
|
systemctl unmask kubelet || true
|
||||||
|
systemctl enable kubelet || true
|
||||||
|
|
||||||
|
echo "==> Ensuring containerd is running"
|
||||||
|
systemctl start containerd || true
|
||||||
|
sleep 2
|
||||||
|
if ! systemctl is-active containerd; then
|
||||||
|
echo "ERROR: containerd not running"
|
||||||
|
journalctl -xeu containerd --no-pager -n 30
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p /etc/kubernetes/manifests
|
||||||
|
mkdir -p /tmp/kubeadm
|
||||||
|
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
|
||||||
|
apiVersion: kubeadm.k8s.io/v1beta4
|
||||||
|
kind: InitConfiguration
|
||||||
|
nodeRegistration:
|
||||||
|
name: "KUBEADM_NODE_NAME"
|
||||||
|
criSocket: unix:///run/containerd/containerd.sock
|
||||||
|
kubeletExtraArgs:
|
||||||
|
- name: hostname-override
|
||||||
|
value: "KUBEADM_NODE_NAME"
|
||||||
|
---
|
||||||
|
apiVersion: kubeadm.k8s.io/v1beta4
|
||||||
|
kind: ClusterConfiguration
|
||||||
|
controlPlaneEndpoint: "KUBEADM_ENDPOINT"
|
||||||
|
networking:
|
||||||
|
podSubnet: "KUBEADM_POD_SUBNET"
|
||||||
|
serviceSubnet: "KUBEADM_SERVICE_SUBNET"
|
||||||
|
dnsDomain: "KUBEADM_DNS_DOMAIN"
|
||||||
|
KUBEADMCONFIG
|
||||||
|
|
||||||
|
sed -i "s|KUBEADM_ENDPOINT|$vip:6443|g" /tmp/kubeadm/init-config.yaml
|
||||||
|
sed -i "s|KUBEADM_POD_SUBNET|$pod_subnet|g" /tmp/kubeadm/init-config.yaml
|
||||||
|
sed -i "s|KUBEADM_SERVICE_SUBNET|$service_subnet|g" /tmp/kubeadm/init-config.yaml
|
||||||
|
sed -i "s|KUBEADM_DNS_DOMAIN|$domain|g" /tmp/kubeadm/init-config.yaml
|
||||||
|
sed -i "s|KUBEADM_NODE_NAME|$node_name|g" /tmp/kubeadm/init-config.yaml
|
||||||
|
|
||||||
|
echo "==> Pre-pulling kubeadm images"
|
||||||
|
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
|
||||||
|
|
||||||
|
echo "==> Creating kube-vip static pod manifest"
|
||||||
|
ctr image pull "${kubeVipImage}"
|
||||||
|
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
|
||||||
|
--log 4 \
|
||||||
|
--interface "$iface" \
|
||||||
|
--address "$vip" \
|
||||||
|
--controlplane \
|
||||||
|
--arp \
|
||||||
|
> /etc/kubernetes/manifests/kube-vip.yaml
|
||||||
|
|
||||||
|
# kube-vip bootstrap workaround for Kubernetes >=1.29.
|
||||||
|
# During early kubeadm phases, super-admin.conf is available before admin.conf is fully usable.
|
||||||
|
sed -i 's#path: /etc/kubernetes/admin.conf#path: /etc/kubernetes/super-admin.conf#' /etc/kubernetes/manifests/kube-vip.yaml || true
|
||||||
|
echo "==> kube-vip manifest kubeconfig mount"
|
||||||
|
grep -E 'mountPath:|path:' /etc/kubernetes/manifests/kube-vip.yaml | grep -E 'kubernetes/(admin|super-admin)\.conf' || true
|
||||||
|
|
||||||
|
KUBEADM_INIT_LOG=/tmp/kubeadm-init.log
|
||||||
|
if ! env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
|
||||||
|
--config /tmp/kubeadm/init-config.yaml \
|
||||||
|
--upload-certs \
|
||||||
|
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 2>&1 | tee "$KUBEADM_INIT_LOG"; then
|
||||||
|
if grep -q "error writing CRISocket for this node: nodes" "$KUBEADM_INIT_LOG" && [ -f /etc/kubernetes/admin.conf ]; then
|
||||||
|
echo "==> kubeadm hit CRISocket race; waiting for node registration"
|
||||||
|
echo "==> forcing kubelet restart to pick bootstrap flags"
|
||||||
|
systemctl daemon-reload || true
|
||||||
|
systemctl restart kubelet || true
|
||||||
|
sleep 3
|
||||||
|
echo "==> kubelet bootstrap flags"
|
||||||
|
cat /var/lib/kubelet/kubeadm-flags.env || true
|
||||||
|
registered=0
|
||||||
|
for i in $(seq 1 60); do
|
||||||
|
if KUBECONFIG=/etc/kubernetes/admin.conf kubectl get node "$node_name" >/dev/null 2>&1; then
|
||||||
|
echo "==> node $node_name registered; uploading kubelet config"
|
||||||
|
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init phase upload-config kubelet --config /tmp/kubeadm/init-config.yaml
|
||||||
|
registered=1
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
if [ "$registered" -ne 1 ]; then
|
||||||
|
echo "==> node $node_name did not register after kubeadm init failure"
|
||||||
|
KUBECONFIG=/etc/kubernetes/admin.conf kubectl get nodes -o wide || true
|
||||||
|
echo "==> kubelet logs (registration hints)"
|
||||||
|
journalctl -u kubelet --no-pager -n 120 | grep -Ei "register|node|bootstrap|certificate|forbidden|unauthorized|refused|x509" || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "==> kubeadm init failed, checking pod status:"
|
||||||
|
crictl pods || true
|
||||||
|
crictl ps -a || true
|
||||||
|
echo "==> kube-vip containers:"
|
||||||
|
crictl ps -a --name kube-vip || true
|
||||||
|
echo "==> kube-vip logs:"
|
||||||
|
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
|
||||||
|
echo "--- kube-vip container $container_id ---"
|
||||||
|
crictl logs "$container_id" 2>/dev/null || true
|
||||||
|
crictl inspect "$container_id" 2>/dev/null | jq -r '.status | "exitCode=\(.exitCode) reason=\(.reason // "") message=\(.message // "")"' || true
|
||||||
|
done
|
||||||
|
echo "==> Checking if VIP is bound:"
|
||||||
|
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
|
||||||
|
echo "==> kubelet logs:"
|
||||||
|
journalctl -xeu kubelet --no-pager -n 50
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "==> Waiting for kube-vip to claim VIP $vip"
|
||||||
|
for i in $(seq 1 90); do
|
||||||
|
if ip -4 addr show | grep -q "$vip"; then
|
||||||
|
echo "==> VIP $vip is bound"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
if [ "$i" -eq 90 ]; then
|
||||||
|
echo "==> ERROR: VIP not bound after 3 minutes"
|
||||||
|
crictl ps -a --name kube-vip || true
|
||||||
|
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
|
||||||
|
echo "--- kube-vip container $container_id ---"
|
||||||
|
crictl logs "$container_id" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "==> Waiting for API server to be ready"
|
||||||
|
for i in $(seq 1 60); do
|
||||||
|
if curl -sk "https://$vip:6443/healthz" 2>/dev/null | grep -q "ok"; then
|
||||||
|
echo "==> API server is healthy"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
if [ "$i" -eq 60 ]; then
|
||||||
|
echo "==> ERROR: API server not healthy after 2 minutes"
|
||||||
|
crictl pods || true
|
||||||
|
crictl ps -a || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
|
||||||
|
# Switch kube-vip to normal admin.conf after bootstrap finishes.
|
||||||
|
sed -i 's#path: /etc/kubernetes/super-admin.conf#path: /etc/kubernetes/admin.conf#' /etc/kubernetes/manifests/kube-vip.yaml || true
|
||||||
|
|
||||||
|
mkdir -p /root/.kube
|
||||||
|
cp /etc/kubernetes/admin.conf /root/.kube/config
|
||||||
|
chmod 600 /root/.kube/config
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Next: install Cilium, then generate join commands:"
|
||||||
|
echo " kubeadm token create --print-join-command"
|
||||||
|
echo " kubeadm token create --print-join-command --certificate-key <key>"
|
||||||
|
'')
|
||||||
|
|
||||||
|
(pkgs.writeShellScriptBin "th-kubeadm-join-control-plane" ''
|
||||||
|
set -euo pipefail
|
||||||
|
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
|
||||||
|
if [ "$#" -lt 1 ]; then
|
||||||
|
echo "Usage: th-kubeadm-join-control-plane '<kubeadm join ... --control-plane --certificate-key ...>'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
iface="${config.terrahome.kubeadm.controlPlaneInterface}"
|
||||||
|
if ! ip link show "$iface" >/dev/null 2>&1; then
|
||||||
|
iface="$(ip -o -4 route show to default | awk 'NR==1 {print $5}')"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "''${iface:-}" ]; then
|
||||||
|
echo "Could not determine network interface for kube-vip"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
suffix="${toString config.terrahome.kubeadm.controlPlaneVipSuffix}"
|
||||||
|
local_ip_cidr=$(ip -4 -o addr show dev "$iface" | awk 'NR==1 {print $4}')
|
||||||
|
if [ -z "''${local_ip_cidr:-}" ]; then
|
||||||
|
echo "Could not determine IPv4 CIDR on interface $iface"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
subnet_prefix=$(echo "$local_ip_cidr" | cut -d/ -f1 | awk -F. '{print $1"."$2"."$3}')
|
||||||
|
vip="$subnet_prefix.$suffix"
|
||||||
|
|
||||||
|
mkdir -p /etc/kubernetes/manifests
|
||||||
|
ctr image pull "${kubeVipImage}"
|
||||||
|
ctr run --rm --net-host "${kubeVipImage}" kube-vip /kube-vip manifest pod \
|
||||||
|
--log 4 \
|
||||||
|
--interface "$iface" \
|
||||||
|
--address "$vip" \
|
||||||
|
--controlplane \
|
||||||
|
--arp \
|
||||||
|
--leaderElection \
|
||||||
|
> /etc/kubernetes/manifests/kube-vip.yaml
|
||||||
|
|
||||||
|
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
||||||
|
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
|
||||||
|
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
|
||||||
|
rm -rf /var/lib/kubelet/pki
|
||||||
|
|
||||||
|
systemctl unmask kubelet || true
|
||||||
|
systemctl stop kubelet || true
|
||||||
|
systemctl enable kubelet || true
|
||||||
|
systemctl reset-failed kubelet || true
|
||||||
|
systemctl daemon-reload
|
||||||
|
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
|
||||||
|
eval "$1"
|
||||||
|
'')
|
||||||
|
|
||||||
|
(pkgs.writeShellScriptBin "th-kubeadm-join-worker" ''
|
||||||
|
set -euo pipefail
|
||||||
|
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
|
||||||
|
if [ "$#" -lt 1 ]; then
|
||||||
|
echo "Usage: th-kubeadm-join-worker '<kubeadm join ...>'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
||||||
|
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
|
||||||
|
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
|
||||||
|
rm -rf /var/lib/kubelet/pki
|
||||||
|
|
||||||
|
systemctl unmask kubelet || true
|
||||||
|
systemctl stop kubelet || true
|
||||||
|
systemctl enable kubelet || true
|
||||||
|
systemctl reset-failed kubelet || true
|
||||||
|
systemctl daemon-reload
|
||||||
|
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
|
||||||
|
eval "$1"
|
||||||
|
'')
|
||||||
|
|
||||||
|
(pkgs.writeShellScriptBin "th-kubeadm-status" ''
|
||||||
|
set -euo pipefail
|
||||||
|
systemctl is-active containerd || true
|
||||||
|
systemctl is-active kubelet || true
|
||||||
|
crictl info >/dev/null && echo "crictl: ok" || echo "crictl: not-ready"
|
||||||
|
'')
|
||||||
|
];
|
||||||
|
|
||||||
|
systemd.services.kubelet = {
|
||||||
|
description = "Kubernetes Kubelet";
|
||||||
|
wantedBy = [ "multi-user.target" ];
|
||||||
|
path = [ pkgs.util-linux ];
|
||||||
|
wants = [ "network-online.target" ];
|
||||||
|
after = [ "containerd.service" "network-online.target" ];
|
||||||
|
serviceConfig = {
|
||||||
|
Environment = [
|
||||||
|
"KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
|
||||||
|
"KUBELET_KUBEADM_ARGS="
|
||||||
|
"KUBELET_EXTRA_ARGS="
|
||||||
|
];
|
||||||
|
EnvironmentFile = [
|
||||||
|
"-/var/lib/kubelet/kubeadm-flags.env"
|
||||||
|
"-/etc/default/kubelet"
|
||||||
|
];
|
||||||
|
ExecStart = "${pinnedK8s}/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf \$KUBELET_CONFIG_ARGS \$KUBELET_KUBEADM_ARGS \$KUBELET_EXTRA_ARGS";
|
||||||
|
Restart = "on-failure";
|
||||||
|
RestartSec = "10";
|
||||||
|
};
|
||||||
|
unitConfig = {
|
||||||
|
ConditionPathExists = "/var/lib/kubelet/config.yaml";
|
||||||
|
ConditionPathExistsGlob = "/etc/kubernetes/*kubelet.conf";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
systemd.tmpfiles.rules = [
|
||||||
|
"d /etc/kubernetes 0755 root root -"
|
||||||
|
"d /etc/kubernetes/manifests 0755 root root -"
|
||||||
|
"d /etc/cni/net.d 0755 root root -"
|
||||||
|
"d /opt/cni/bin 0755 root root -"
|
||||||
|
"d /run/flannel 0755 root root -"
|
||||||
|
"d /var/lib/kubelet 0755 root root -"
|
||||||
|
"d /var/lib/kubelet/pki 0755 root root -"
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
14
nixos/kubeadm/modules/k8s-control-plane.nix
Normal file
14
nixos/kubeadm/modules/k8s-control-plane.nix
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
networking.firewall.allowedTCPPorts = [
|
||||||
|
6443
|
||||||
|
2379
|
||||||
|
2380
|
||||||
|
10250
|
||||||
|
10257
|
||||||
|
10259
|
||||||
|
];
|
||||||
|
|
||||||
|
networking.firewall.allowedUDPPorts = [
|
||||||
|
8472
|
||||||
|
];
|
||||||
|
}
|
||||||
11
nixos/kubeadm/modules/k8s-worker.nix
Normal file
11
nixos/kubeadm/modules/k8s-worker.nix
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
{
|
||||||
|
networking.firewall.allowedTCPPorts = [
|
||||||
|
10250
|
||||||
|
30000
|
||||||
|
32767
|
||||||
|
];
|
||||||
|
|
||||||
|
networking.firewall.allowedUDPPorts = [
|
||||||
|
8472
|
||||||
|
];
|
||||||
|
}
|
||||||
182
nixos/kubeadm/scripts/discover-inventory-from-ssh.py
Executable file
182
nixos/kubeadm/scripts/discover-inventory-from-ssh.py
Executable file
@@ -0,0 +1,182 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import concurrent.futures
|
||||||
|
import ipaddress
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from typing import Dict, Set, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def derive_prefix(payload: dict) -> str:
|
||||||
|
explicit = os.environ.get("KUBEADM_SUBNET_PREFIX", "").strip()
|
||||||
|
if explicit:
|
||||||
|
return explicit
|
||||||
|
|
||||||
|
for key in ("control_plane_vm_ipv4", "worker_vm_ipv4"):
|
||||||
|
values = payload.get(key, {}).get("value", {})
|
||||||
|
for ip in values.values():
|
||||||
|
if ip:
|
||||||
|
parts = ip.split(".")
|
||||||
|
if len(parts) == 4:
|
||||||
|
return ".".join(parts[:3])
|
||||||
|
|
||||||
|
return "10.27.27"
|
||||||
|
|
||||||
|
|
||||||
|
def ssh_probe(ip: str, users: list[str], key_path: str, timeout_sec: int) -> Tuple[str, str, str] | None:
|
||||||
|
cmd_tail = [
|
||||||
|
"-o",
|
||||||
|
"BatchMode=yes",
|
||||||
|
"-o",
|
||||||
|
"IdentitiesOnly=yes",
|
||||||
|
"-o",
|
||||||
|
"StrictHostKeyChecking=accept-new",
|
||||||
|
"-o",
|
||||||
|
f"ConnectTimeout={timeout_sec}",
|
||||||
|
"-i",
|
||||||
|
key_path,
|
||||||
|
]
|
||||||
|
for user in users:
|
||||||
|
cmd = [
|
||||||
|
"ssh",
|
||||||
|
*cmd_tail,
|
||||||
|
f"{user}@{ip}",
|
||||||
|
"hn=$(hostnamectl --static 2>/dev/null || hostname); serial=$(cat /sys/class/dmi/id/product_serial 2>/dev/null || true); printf '%s|%s\n' \"$hn\" \"$serial\"",
|
||||||
|
]
|
||||||
|
try:
|
||||||
|
out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True, timeout=timeout_sec + 2).strip()
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if out:
|
||||||
|
line = out.splitlines()[0].strip()
|
||||||
|
if "|" in line:
|
||||||
|
host, serial = line.split("|", 1)
|
||||||
|
else:
|
||||||
|
host, serial = line, ""
|
||||||
|
return host.strip(), ip, serial.strip()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def build_inventory(names: Set[str], found: Dict[str, str], ssh_user: str) -> str:
|
||||||
|
cp = sorted([n for n in names if n.startswith("cp-")], key=lambda x: int(x.split("-")[1]))
|
||||||
|
wk = sorted([n for n in names if n.startswith("wk-")], key=lambda x: int(x.split("-")[1]))
|
||||||
|
|
||||||
|
cp_pairs = " ".join(f"{n}={found[n]}" for n in cp)
|
||||||
|
wk_pairs = " ".join(f"{n}={found[n]}" for n in wk)
|
||||||
|
primary = cp[0] if cp else "cp-1"
|
||||||
|
|
||||||
|
return "\n".join(
|
||||||
|
[
|
||||||
|
f"SSH_USER={ssh_user}",
|
||||||
|
f"PRIMARY_CONTROL_PLANE={primary}",
|
||||||
|
f'CONTROL_PLANES="{cp_pairs}"',
|
||||||
|
f'WORKERS="{wk_pairs}"',
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Discover node IPs by SSH-scanning a subnet and emit an inventory on stdout.

    Reads `terraform output -json` from stdin to learn the expected node names
    (and their Proxmox VMIDs), probes addresses in the derived subnet range via
    ssh_probe, matches reachable hosts to nodes by VMID serial or hostname, and
    writes the inventory rendered by build_inventory to stdout.

    Returns 0 on success; raises SystemExit with a diagnostic message when node
    names or IPs cannot be determined.
    """
    payload = json.load(sys.stdin)

    # Expected node names are the keys of the Terraform VM-ID output maps.
    cp_names = set(payload.get("control_plane_vm_ids", {}).get("value", {}).keys())
    wk_names = set(payload.get("worker_vm_ids", {}).get("value", {}).keys())
    target_names = cp_names | wk_names
    if not target_names:
        raise SystemExit("Could not determine target node names from Terraform outputs")

    # SSH probing knobs; every one is overridable through the environment.
    ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
    users = [u for u in os.environ.get("SSH_USER_CANDIDATES", f"{ssh_user} root").split() if u]
    key_path = os.environ.get("SSH_KEY_PATH", os.path.expanduser("~/.ssh/id_ed25519"))
    timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "6"))
    max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "32"))

    # Scan range "<prefix>.<start>" .. "<prefix>.<end>", skipping the VIP octet.
    prefix = derive_prefix(payload)
    start = int(os.environ.get("KUBEADM_SUBNET_START", "2"))
    end = int(os.environ.get("KUBEADM_SUBNET_END", "254"))
    vip_suffix = int(os.environ.get("KUBEADM_CONTROL_PLANE_VIP_SUFFIX", "250"))

    def is_vip_ip(ip: str) -> bool:
        # True when the address's last octet equals the control-plane VIP suffix.
        try:
            return int(ip.split(".")[-1]) == vip_suffix
        except Exception:
            return False

    scan_ips = [
        str(ipaddress.IPv4Address(f"{prefix}.{i}"))
        for i in range(start, end + 1)
        if i != vip_suffix
    ]
    # node name -> discovered IP
    found: Dict[str, str] = {}
    # VMID (stringified; matched against the serial reported by ssh_probe) -> node name
    vmid_to_name: Dict[str, str] = {}
    for name, vmid in payload.get("control_plane_vm_ids", {}).get("value", {}).items():
        vmid_to_name[str(vmid)] = name
    for name, vmid in payload.get("worker_vm_ids", {}).get("value", {}).items():
        vmid_to_name[str(vmid)] = name

    # hostname -> first IP it answered on (for diagnostics)
    seen_hostnames: Dict[str, str] = {}
    # IP -> (hostname, serial) for every SSH-reachable address
    seen_ips: Dict[str, Tuple[str, str]] = {}

    def run_pass(pass_timeout: int, pass_workers: int) -> None:
        # One parallel sweep over scan_ips. Mutates found/seen_hostnames/seen_ips
        # as probes complete and returns early once every target has an IP.
        with concurrent.futures.ThreadPoolExecutor(max_workers=pass_workers) as pool:
            futures = [pool.submit(ssh_probe, ip, users, key_path, pass_timeout) for ip in scan_ips]
            for fut in concurrent.futures.as_completed(futures):
                result = fut.result()
                if not result:
                    continue
                host, ip, serial = result
                if host not in seen_hostnames:
                    seen_hostnames[host] = ip
                if ip not in seen_ips:
                    seen_ips[ip] = (host, serial)
                # Prefer the serial/VMID mapping; fall back to a hostname match.
                target = None
                if serial in vmid_to_name:
                    inferred = vmid_to_name[serial]
                    target = inferred
                elif host in target_names:
                    target = host

                if target:
                    existing = found.get(target)
                    # Keep an existing non-VIP IP; only replace a VIP IP with a real one.
                    if existing is None or (is_vip_ip(existing) and not is_vip_ip(ip)):
                        found[target] = ip
                if all(name in found for name in target_names):
                    return

    run_pass(timeout_sec, max_workers)
    if not all(name in found for name in target_names):
        # Slower second pass for busy runners/networks.
        run_pass(max(timeout_sec + 2, 8), max(8, max_workers // 2))

    # Heuristic fallback: if nodes still missing, assign from remaining SSH-reachable
    # IPs not already used, ordered by IP. This helps when cloned nodes temporarily
    # share a generic hostname (e.g. "flex") and DMI serial mapping is unavailable.
    missing = sorted([n for n in target_names if n not in found])
    if missing:
        used_ips = set(found.values())
        candidates = sorted(ip for ip in seen_ips.keys() if ip not in used_ips)
        if len(candidates) >= len(missing):
            for name, ip in zip(missing, candidates):
                found[name] = ip

    missing = sorted([n for n in target_names if n not in found])
    if missing:
        # Before failing, report what the scan did observe to ease debugging.
        discovered = ", ".join(sorted(seen_hostnames.keys())[:20])
        if discovered:
            sys.stderr.write(f"Discovered hostnames during scan: {discovered}\n")
        if seen_ips:
            sample = ", ".join(f"{ip}={meta[0]}" for ip, meta in list(sorted(seen_ips.items()))[:20])
            sys.stderr.write(f"SSH-reachable IPs: {sample}\n")
        raise SystemExit(
            "Failed SSH-based IP discovery for nodes: " + ", ".join(missing) +
            f" (scanned {prefix}.{start}-{prefix}.{end})"
        )

    sys.stdout.write(build_inventory(target_names, found, ssh_user))
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Entry point: SystemExit(main()) exits with main()'s integer return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
7
nixos/kubeadm/scripts/inventory.example.env
Normal file
7
nixos/kubeadm/scripts/inventory.example.env
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
SSH_USER=micqdf
|
||||||
|
PRIMARY_CONTROL_PLANE=cp-1
|
||||||
|
|
||||||
|
# Name=IP pairs (space-separated)
|
||||||
|
CONTROL_PLANES="cp-1=192.168.1.101 cp-2=192.168.1.102 cp-3=192.168.1.103"
|
||||||
|
|
||||||
|
WORKERS="wk-1=192.168.1.111 wk-2=192.168.1.112 wk-3=192.168.1.113"
|
||||||
14
nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
Executable file
14
nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
Executable file
@@ -0,0 +1,14 @@
|
|||||||
|
#!/usr/bin/env bash
# Run the bootstrap controller's "reconcile" step against an inventory file.
# Usage: rebuild-and-bootstrap.sh [inventory-file]
# Defaults to inventory.env next to this script.
set -euo pipefail

# Resolve the directory this script lives in so relative paths work from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
CONTROLLER="$SCRIPT_DIR/../bootstrap/controller.py"

# Fail early with guidance if the inventory has not been created yet.
if [ ! -f "$INVENTORY_FILE" ]; then
  echo "Missing inventory file: $INVENTORY_FILE"
  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings."
  exit 1
fi

python3 "$CONTROLLER" reconcile --inventory "$INVENTORY_FILE"
|
||||||
65
nixos/kubeadm/scripts/render-inventory-from-tf-output.py
Executable file
65
nixos/kubeadm/scripts/render-inventory-from-tf-output.py
Executable file
@@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def natural_key(name: str):
    """Sort key giving "word-N" names numeric ordering on their suffix.

    "cp-2" sorts before "cp-10". Names that do not look like
    "<letters/hyphens>-<digits>" fall back to plain string order
    with a 0 numeric component.
    """
    parts = re.match(r"^([a-zA-Z-]+)-(\d+)$", name)
    if parts is None:
        return (name, 0)
    stem, index = parts.group(1), parts.group(2)
    return (stem, int(index))
|
||||||
|
|
||||||
|
|
||||||
|
def map_to_pairs(items: dict[str, str]) -> str:
    """Render a name->IP mapping as space-separated "name=ip" pairs.

    Entries are ordered by natural_key, so "cp-2" precedes "cp-10".
    """
    pairs = []
    for node, ip in sorted(items.items(), key=lambda kv: natural_key(kv[0])):
        pairs.append(f"{node}={ip}")
    return " ".join(pairs)
|
||||||
|
|
||||||
|
|
||||||
|
def require_non_empty_ips(label: str, items: dict[str, str]) -> dict[str, str]:
    """Return `items` with whitespace-stripped IPs, rejecting empty values.

    Raises SystemExit naming every node whose IP is missing/blank; `label`
    identifies the node group in the error message.
    """
    stripped = {name: (ip or "").strip() for name, ip in items.items()}
    missing = [name for name, ip_value in stripped.items() if not ip_value]

    if missing:
        names = ", ".join(sorted(missing, key=natural_key))
        raise SystemExit(
            f"Missing IPv4 addresses for {label}: {names}. "
            "Terraform outputs are present but empty. "
            "This usually means Proxmox guest IP discovery is unavailable for these VMs yet."
        )

    return {name: ip_value for name, ip_value in stripped.items() if ip_value}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Read `terraform output -json` on stdin and print inventory.env lines.

    Validates that both IPv4 output maps exist and contain non-empty
    addresses, then emits SSH_USER, PRIMARY_CONTROL_PLANE, CONTROL_PLANES
    and WORKERS assignments to stdout. Returns 0 on success.
    """
    payload = json.load(sys.stdin)

    control_planes = payload.get("control_plane_vm_ipv4", {}).get("value", {})
    workers = payload.get("worker_vm_ipv4", {}).get("value", {})
    if not (control_planes and workers):
        raise SystemExit("Missing control_plane_vm_ipv4 or worker_vm_ipv4 in terraform output")

    # Reject blank IPs up front so the rendered inventory is always usable.
    control_planes = require_non_empty_ips("control planes", control_planes)
    workers = require_non_empty_ips("workers", workers)

    ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"

    for line in (
        f"SSH_USER={ssh_user}",
        "PRIMARY_CONTROL_PLANE=cp-1",
        f'CONTROL_PLANES="{map_to_pairs(control_planes)}"',
        f'WORKERS="{map_to_pairs(workers)}"',
    ):
        print(line)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Entry point: SystemExit(main()) exits with main()'s integer return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
106
nixos/kubeadm/scripts/reset-cluster-nodes.sh
Executable file
106
nixos/kubeadm/scripts/reset-cluster-nodes.sh
Executable file
@@ -0,0 +1,106 @@
|
|||||||
|
#!/usr/bin/env bash
# Reset kubeadm/kubelet state on every node listed in an inventory file.
# Usage: reset-cluster-nodes.sh [inventory-file]   (defaults to inventory.env
# next to this script). The inventory provides CONTROL_PLANES/WORKERS as
# space-separated "name=ip" pairs, or CP_<n>/WK_<n> variables as a fallback.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"

if [ ! -f "$INVENTORY_FILE" ]; then
  echo "Missing inventory file: $INVENTORY_FILE"
  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings."
  exit 1
fi

# shellcheck disable=SC1090
source "$INVENTORY_FILE"

# Connection settings; each may be pre-set by the inventory or environment.
# SSH_OPTS is intentionally unquoted at call sites so its options word-split.
SSH_USER="${SSH_USER:-micqdf}"
SSH_KEY_PATH="${SSH_KEY_PATH:-$HOME/.ssh/id_ed25519}"
SSH_OPTS="${SSH_OPTS:--o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
SSH_USER_CANDIDATES="${SSH_USER_CANDIDATES:-root $SSH_USER}"

# name -> IP for every node to reset.
declare -A NODE_IPS=()

# add_pair NAME=IP : validate one inventory entry and record it in NODE_IPS.
# "$name" = "$ip" detects a pair with no '=' (both expansions yield the input).
add_pair() {
  local pair="$1"
  local name="${pair%%=*}"
  local ip="${pair#*=}"

  if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then
    echo "Invalid node pair '$pair' (expected name=ip)."
    exit 1
  fi

  NODE_IPS["$name"]="$ip"
}

# Prefer the CONTROL_PLANES pair list; otherwise collect CP_<n> variables.
if [ -n "${CONTROL_PLANES:-}" ]; then
  for pair in $CONTROL_PLANES; do
    add_pair "$pair"
  done
else
  while IFS= read -r var_name; do
    idx="${var_name#CP_}"
    add_pair "cp-$idx=${!var_name}"
  done < <(compgen -A variable | grep -E '^CP_[0-9]+$' | sort -V)
fi

# Same scheme for workers: WORKERS pair list, else WK_<n> variables.
if [ -n "${WORKERS:-}" ]; then
  for pair in $WORKERS; do
    add_pair "$pair"
  done
else
  while IFS= read -r var_name; do
    idx="${var_name#WK_}"
    add_pair "wk-$idx=${!var_name}"
  done < <(compgen -A variable | grep -E '^WK_[0-9]+$' | sort -V)
fi

if [ "${#NODE_IPS[@]}" -eq 0 ]; then
  echo "No nodes found in inventory."
  exit 1
fi

# detect_ssh_user IP : try each candidate user against IP; on success set
# ACTIVE_SSH_USER and return 0. Returns 1 (aborting under set -e) otherwise.
detect_ssh_user() {
  local probe_ip="$1"
  local candidate

  for candidate in $SSH_USER_CANDIDATES; do
    if ssh $SSH_OPTS "$candidate@$probe_ip" "true" >/dev/null 2>&1; then
      ACTIVE_SSH_USER="$candidate"
      echo "==> Using SSH user '$ACTIVE_SSH_USER'"
      return 0
    fi
  done

  echo "Unable to authenticate to $probe_ip with candidates: $SSH_USER_CANDIDATES"
  return 1
}

# Refresh known_hosts for every node so later ssh calls don't prompt;
# keyscan failures are tolerated (|| true) as a best-effort step.
mkdir -p "$HOME/.ssh"
chmod 700 "$HOME/.ssh"
touch "$HOME/.ssh/known_hosts"
chmod 600 "$HOME/.ssh/known_hosts"
for node_name in "${!NODE_IPS[@]}"; do
  ssh-keygen -R "${NODE_IPS[$node_name]}" >/dev/null 2>&1 || true
  ssh-keyscan -H "${NODE_IPS[$node_name]}" >> "$HOME/.ssh/known_hosts" 2>/dev/null || true
done

# reset_node NAME IP : run kubeadm reset and wipe k8s/etcd/CNI state remotely.
# The command is %q-quoted so it survives the remote "bash -lc" word splitting.
reset_node() {
  local node_name="$1"
  local node_ip="$2"
  echo "==> Resetting $node_name ($node_ip)"
  local cmd="sudo kubeadm reset -f && sudo systemctl stop kubelet && sudo rm -rf /etc/kubernetes /var/lib/etcd /var/lib/cni /etc/cni/net.d"
  local quoted_cmd
  quoted_cmd="$(printf '%q' "$cmd")"
  ssh $SSH_OPTS "$ACTIVE_SSH_USER@$node_ip" "bash -lc $quoted_cmd"
}

# Probe the version-sorted first node to pick a working SSH user, then
# reset every node in version-sorted name order.
FIRST_NODE_IP="${NODE_IPS[$(printf '%s\n' "${!NODE_IPS[@]}" | sort -V | head -n1)]}"
ACTIVE_SSH_USER="$SSH_USER"
detect_ssh_user "$FIRST_NODE_IP"

while IFS= read -r node_name; do
  reset_node "$node_name" "${NODE_IPS[$node_name]}"
done < <(printf '%s\n' "${!NODE_IPS[@]}" | sort -V)

echo "Cluster components reset on all listed nodes."
|
||||||
@@ -1,17 +1,16 @@
|
|||||||
# NixOS Proxmox Template Base
|
# NixOS Proxmox k8s-base Template
|
||||||
|
|
||||||
This folder contains a minimal NixOS base config you can copy into a new
|
This folder contains a Kubernetes-ready NixOS base config for your Proxmox
|
||||||
template VM build.
|
template VM build.
|
||||||
|
|
||||||
## Files
|
## Files
|
||||||
|
|
||||||
- `flake.nix`: pins `nixos-24.11` and exposes one host config.
|
- `flake.nix`: pins `nixos-25.05` and exposes one host config.
|
||||||
- `configuration.nix`: base settings for Proxmox guest use.
|
- `configuration.nix`: k8s-base settings for Proxmox guests.
|
||||||
|
|
||||||
## Before first apply
|
## Before first apply
|
||||||
|
|
||||||
1. Replace `REPLACE_WITH_YOUR_SSH_PUBLIC_KEY` in `configuration.nix`.
|
1. Add `hardware-configuration.nix` from the VM install:
|
||||||
2. Add `hardware-configuration.nix` from the VM install:
|
|
||||||
- `nixos-generate-config --root /`
|
- `nixos-generate-config --root /`
|
||||||
- copy `/etc/nixos/hardware-configuration.nix` next to `configuration.nix`
|
- copy `/etc/nixos/hardware-configuration.nix` next to `configuration.nix`
|
||||||
|
|
||||||
@@ -23,5 +22,6 @@ sudo nixos-rebuild switch --flake .#template
|
|||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- This is intentionally minimal and avoids cloud-init assumptions.
|
- This pre-installs heavy shared Kubernetes dependencies (containerd + kube tools)
|
||||||
- If you want host-specific settings, create additional modules and import them.
|
to reduce per-node bootstrap time.
|
||||||
|
- Cloud-init still injects the runtime SSH key and per-node hostname/IP.
|
||||||
|
|||||||
@@ -1,12 +1,17 @@
|
|||||||
{ lib, pkgs, ... }:
|
{ lib, pkgs, ... }:
|
||||||
|
|
||||||
|
let
|
||||||
|
pinnedK8s = lib.attrByPath [ "kubernetes_1_31" ] pkgs.kubernetes pkgs;
|
||||||
|
in
|
||||||
|
|
||||||
{
|
{
|
||||||
imports =
|
imports =
|
||||||
lib.optional (builtins.pathExists ./hardware-configuration.nix)
|
lib.optional (builtins.pathExists ./hardware-configuration.nix)
|
||||||
./hardware-configuration.nix;
|
./hardware-configuration.nix;
|
||||||
|
|
||||||
networking.hostName = "nixos-template";
|
networking.hostName = "k8s-base-template";
|
||||||
networking.useDHCP = lib.mkDefault true;
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
networking.useNetworkd = true;
|
||||||
networking.nameservers = [ "1.1.1.1" "8.8.8.8" ];
|
networking.nameservers = [ "1.1.1.1" "8.8.8.8" ];
|
||||||
|
|
||||||
boot.loader.systemd-boot.enable = lib.mkForce false;
|
boot.loader.systemd-boot.enable = lib.mkForce false;
|
||||||
@@ -16,39 +21,73 @@
|
|||||||
};
|
};
|
||||||
|
|
||||||
services.qemuGuest.enable = true;
|
services.qemuGuest.enable = true;
|
||||||
|
services.cloud-init.enable = true;
|
||||||
|
services.cloud-init.network.enable = true;
|
||||||
services.openssh.enable = true;
|
services.openssh.enable = true;
|
||||||
services.tailscale.enable = true;
|
|
||||||
services.openssh.settings = {
|
services.openssh.settings = {
|
||||||
PasswordAuthentication = false;
|
PasswordAuthentication = false;
|
||||||
KbdInteractiveAuthentication = false;
|
KbdInteractiveAuthentication = false;
|
||||||
PermitRootLogin = "prohibit-password";
|
PermitRootLogin = "prohibit-password";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
boot.kernelModules = [ "overlay" "br_netfilter" ];
|
||||||
|
boot.kernel.sysctl = {
|
||||||
|
"net.ipv4.ip_forward" = 1;
|
||||||
|
"net.bridge.bridge-nf-call-iptables" = 1;
|
||||||
|
"net.bridge.bridge-nf-call-ip6tables" = 1;
|
||||||
|
};
|
||||||
|
|
||||||
|
virtualisation.containerd.enable = true;
|
||||||
|
virtualisation.containerd.settings = {
|
||||||
|
plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options.SystemdCgroup = true;
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = lib.mkForce [ ];
|
||||||
|
|
||||||
|
nix.settings = {
|
||||||
|
trusted-users = [ "root" "micqdf" ];
|
||||||
|
auto-optimise-store = true;
|
||||||
|
};
|
||||||
|
|
||||||
|
nix.gc = {
|
||||||
|
automatic = true;
|
||||||
|
dates = "daily";
|
||||||
|
options = "--delete-older-than 3d";
|
||||||
|
};
|
||||||
|
|
||||||
programs.fish.enable = true;
|
programs.fish.enable = true;
|
||||||
|
|
||||||
users.users.micqdf = {
|
users.users.micqdf = {
|
||||||
isNormalUser = true;
|
isNormalUser = true;
|
||||||
extraGroups = [ "wheel" ];
|
extraGroups = [ "wheel" ];
|
||||||
shell = pkgs.fish;
|
shell = pkgs.fish;
|
||||||
openssh.authorizedKeys.keys = [
|
|
||||||
"REPLACE_WITH_SINGLE_LINE_PUBLIC_KEY"
|
|
||||||
];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
security.sudo.wheelNeedsPassword = false;
|
security.sudo.wheelNeedsPassword = false;
|
||||||
|
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
btop
|
btop
|
||||||
|
cni-plugins
|
||||||
|
conntrack-tools
|
||||||
|
containerd
|
||||||
|
cri-tools
|
||||||
curl
|
curl
|
||||||
dig
|
dig
|
||||||
|
ebtables
|
||||||
|
ethtool
|
||||||
eza
|
eza
|
||||||
fd
|
fd
|
||||||
fzf
|
fzf
|
||||||
git
|
git
|
||||||
htop
|
htop
|
||||||
|
iproute2
|
||||||
|
iptables
|
||||||
|
ipvsadm
|
||||||
jq
|
jq
|
||||||
|
kubernetes-helm
|
||||||
|
pinnedK8s
|
||||||
ripgrep
|
ripgrep
|
||||||
tailscale
|
socat
|
||||||
tree
|
tree
|
||||||
unzip
|
unzip
|
||||||
vim
|
vim
|
||||||
|
|||||||
27
nixos/template-base/flake.lock
generated
Normal file
27
nixos/template-base/flake.lock
generated
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1767313136,
|
||||||
|
"narHash": "sha256-16KkgfdYqjaeRGBaYsNrhPRRENs0qzkQVUooNHtoy2w=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "ac62194c3917d5f474c1a844b6fd6da2db95077d",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixos-25.05",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": "nixpkgs"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
{
|
{
|
||||||
description = "Base NixOS config for Proxmox template";
|
description = "Kubernetes-ready NixOS base template";
|
||||||
|
|
||||||
inputs = {
|
inputs = {
|
||||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";
|
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
|
||||||
};
|
};
|
||||||
|
|
||||||
outputs = { nixpkgs, ... }: {
|
outputs = { nixpkgs, ... }: {
|
||||||
|
|||||||
36
terraform/.terraform.lock.hcl
generated
36
terraform/.terraform.lock.hcl
generated
@@ -1,42 +1,6 @@
|
|||||||
# This file is maintained automatically by "terraform init".
|
# This file is maintained automatically by "terraform init".
|
||||||
# Manual edits may be lost in future updates.
|
# Manual edits may be lost in future updates.
|
||||||
|
|
||||||
provider "registry.terraform.io/hashicorp/local" {
|
|
||||||
version = "2.7.0"
|
|
||||||
hashes = [
|
|
||||||
"h1:2RYa3j7m/0WmET2fqotY4CHxE1Hpk0fgn47/126l+Og=",
|
|
||||||
"zh:261fec71bca13e0a7812dc0d8ae9af2b4326b24d9b2e9beab3d2400fab5c5f9a",
|
|
||||||
"zh:308da3b5376a9ede815042deec5af1050ec96a5a5410a2206ae847d82070a23e",
|
|
||||||
"zh:3d056924c420464dc8aba10e1915956b2e5c4d55b11ffff79aa8be563fbfe298",
|
|
||||||
"zh:643256547b155459c45e0a3e8aab0570db59923c68daf2086be63c444c8c445b",
|
|
||||||
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
|
|
||||||
"zh:7aa4d0b853f84205e8cf79f30c9b2c562afbfa63592f7231b6637e5d7a6b5b27",
|
|
||||||
"zh:7dc251bbc487d58a6ab7f5b07ec9edc630edb45d89b761dba28e0e2ba6b1c11f",
|
|
||||||
"zh:7ee0ca546cd065030039168d780a15cbbf1765a4c70cd56d394734ab112c93da",
|
|
||||||
"zh:b1d5d80abb1906e6c6b3685a52a0192b4ca6525fe090881c64ec6f67794b1300",
|
|
||||||
"zh:d81ea9856d61db3148a4fc6c375bf387a721d78fc1fea7a8823a027272a47a78",
|
|
||||||
"zh:df0a1f0afc947b8bfc88617c1ad07a689ce3bd1a29fd97318392e6bdd32b230b",
|
|
||||||
"zh:dfbcad800240e0c68c43e0866f2a751cff09777375ec701918881acf67a268da",
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
provider "registry.terraform.io/hashicorp/template" {
|
|
||||||
version = "2.2.0"
|
|
||||||
hashes = [
|
|
||||||
"h1:94qn780bi1qjrbC3uQtjJh3Wkfwd5+tTtJHOb7KTg9w=",
|
|
||||||
"zh:01702196f0a0492ec07917db7aaa595843d8f171dc195f4c988d2ffca2a06386",
|
|
||||||
"zh:09aae3da826ba3d7df69efeb25d146a1de0d03e951d35019a0f80e4f58c89b53",
|
|
||||||
"zh:09ba83c0625b6fe0a954da6fbd0c355ac0b7f07f86c91a2a97849140fea49603",
|
|
||||||
"zh:0e3a6c8e16f17f19010accd0844187d524580d9fdb0731f675ffcf4afba03d16",
|
|
||||||
"zh:45f2c594b6f2f34ea663704cc72048b212fe7d16fb4cfd959365fa997228a776",
|
|
||||||
"zh:77ea3e5a0446784d77114b5e851c970a3dde1e08fa6de38210b8385d7605d451",
|
|
||||||
"zh:8a154388f3708e3df5a69122a23bdfaf760a523788a5081976b3d5616f7d30ae",
|
|
||||||
"zh:992843002f2db5a11e626b3fc23dc0c87ad3729b3b3cff08e32ffb3df97edbde",
|
|
||||||
"zh:ad906f4cebd3ec5e43d5cd6dc8f4c5c9cc3b33d2243c89c5fc18f97f7277b51d",
|
|
||||||
"zh:c979425ddb256511137ecd093e23283234da0154b7fa8b21c2687182d9aea8b2",
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
provider "registry.terraform.io/telmate/proxmox" {
|
provider "registry.terraform.io/telmate/proxmox" {
|
||||||
version = "3.0.2-rc07"
|
version = "3.0.2-rc07"
|
||||||
constraints = "3.0.2-rc07"
|
constraints = "3.0.2-rc07"
|
||||||
|
|||||||
@@ -1,14 +0,0 @@
|
|||||||
data "template_file" "cloud_init_global" {
|
|
||||||
template = file("${path.module}/files/cloud_init_global.tpl")
|
|
||||||
|
|
||||||
vars = {
|
|
||||||
hostname = "generic"
|
|
||||||
domain = "home.arpa"
|
|
||||||
SSH_KEY_PUBLIC = var.SSH_KEY_PUBLIC
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "local_file" "cloud_init_global" {
|
|
||||||
content = data.template_file.cloud_init_global.rendered
|
|
||||||
filename = "${path.module}/files/rendered/cloud_init_global.yaml"
|
|
||||||
}
|
|
||||||
@@ -9,6 +9,15 @@ terraform {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
locals {
|
||||||
|
control_plane_ipconfig = [
|
||||||
|
for ip in var.control_plane_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}"
|
||||||
|
]
|
||||||
|
worker_ipconfig = [
|
||||||
|
for ip in var.worker_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
provider "proxmox" {
|
provider "proxmox" {
|
||||||
pm_api_url = var.pm_api_url
|
pm_api_url = var.pm_api_url
|
||||||
pm_api_token_id = var.pm_api_token_id
|
pm_api_token_id = var.pm_api_token_id
|
||||||
@@ -16,33 +25,35 @@ provider "proxmox" {
|
|||||||
pm_tls_insecure = true
|
pm_tls_insecure = true
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "proxmox_vm_qemu" "alpacas" {
|
resource "proxmox_vm_qemu" "control_planes" {
|
||||||
count = var.alpaca_vm_count
|
count = var.control_plane_count
|
||||||
name = "alpaca-${count.index + 1}"
|
name = "cp-${count.index + 1}"
|
||||||
vmid = 500 + count.index + 1
|
vmid = var.control_plane_vmid_start + count.index
|
||||||
target_node = var.target_node
|
target_node = var.target_node
|
||||||
clone = var.clone_template
|
clone = var.clone_template
|
||||||
full_clone = true
|
full_clone = true
|
||||||
os_type = "cloud-init"
|
os_type = "cloud-init"
|
||||||
agent = 1
|
agent = var.qemu_agent_enabled ? 1 : 0
|
||||||
|
automatic_reboot = true
|
||||||
|
|
||||||
cpu {
|
cpu {
|
||||||
sockets = var.sockets
|
sockets = 1
|
||||||
cores = var.cores
|
cores = var.control_plane_cores
|
||||||
}
|
}
|
||||||
memory = var.memory
|
memory = var.control_plane_memory_mb
|
||||||
scsihw = "virtio-scsi-pci"
|
scsihw = "virtio-scsi-pci"
|
||||||
boot = "order=scsi0"
|
boot = "order=scsi0"
|
||||||
bootdisk = "scsi0"
|
bootdisk = "scsi0"
|
||||||
ipconfig0 = "ip=dhcp"
|
ipconfig0 = local.control_plane_ipconfig[count.index]
|
||||||
cicustom = "user=local:snippets/cloud_init_global.yaml"
|
ciuser = "micqdf"
|
||||||
|
sshkeys = var.SSH_KEY_PUBLIC
|
||||||
|
|
||||||
|
|
||||||
disks {
|
disks {
|
||||||
scsi {
|
scsi {
|
||||||
scsi0 {
|
scsi0 {
|
||||||
disk {
|
disk {
|
||||||
size = var.disk_size
|
size = var.control_plane_disk_size
|
||||||
storage = var.storage
|
storage = var.storage
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -62,35 +73,41 @@ resource "proxmox_vm_qemu" "alpacas" {
|
|||||||
model = "virtio"
|
model = "virtio"
|
||||||
bridge = var.bridge
|
bridge = var.bridge
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lifecycle {
|
||||||
|
ignore_changes = all
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
resource "proxmox_vm_qemu" "llamas" {
|
resource "proxmox_vm_qemu" "workers" {
|
||||||
count = var.llama_vm_count
|
count = var.worker_count
|
||||||
name = "llama-${count.index + 1}"
|
name = "wk-${count.index + 1}"
|
||||||
vmid = 600 + count.index + 1
|
vmid = var.worker_vmid_start + count.index
|
||||||
target_node = var.target_node
|
target_node = var.target_node
|
||||||
clone = var.clone_template
|
clone = var.clone_template
|
||||||
full_clone = true
|
full_clone = true
|
||||||
os_type = "cloud-init"
|
os_type = "cloud-init"
|
||||||
agent = 1
|
agent = var.qemu_agent_enabled ? 1 : 0
|
||||||
|
automatic_reboot = true
|
||||||
|
|
||||||
cpu {
|
cpu {
|
||||||
sockets = var.sockets
|
sockets = 1
|
||||||
cores = var.cores
|
cores = var.worker_cores[count.index]
|
||||||
}
|
}
|
||||||
memory = var.memory
|
memory = var.worker_memory_mb[count.index]
|
||||||
scsihw = "virtio-scsi-pci"
|
scsihw = "virtio-scsi-pci"
|
||||||
boot = "order=scsi0"
|
boot = "order=scsi0"
|
||||||
bootdisk = "scsi0"
|
bootdisk = "scsi0"
|
||||||
ipconfig0 = "ip=dhcp"
|
ipconfig0 = local.worker_ipconfig[count.index]
|
||||||
cicustom = "user=local:snippets/cloud_init_global.yaml"
|
ciuser = "micqdf"
|
||||||
|
sshkeys = var.SSH_KEY_PUBLIC
|
||||||
|
|
||||||
disks {
|
disks {
|
||||||
scsi {
|
scsi {
|
||||||
scsi0 {
|
scsi0 {
|
||||||
disk {
|
disk {
|
||||||
size = var.disk_size
|
size = var.worker_disk_size
|
||||||
storage = var.storage
|
storage = var.storage
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -111,4 +128,8 @@ resource "proxmox_vm_qemu" "llamas" {
|
|||||||
model = "virtio"
|
model = "virtio"
|
||||||
bridge = var.bridge
|
bridge = var.bridge
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lifecycle {
|
||||||
|
ignore_changes = all
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,21 +1,35 @@
|
|||||||
output "alpaca_vm_ids" {
|
output "control_plane_vm_ids" {
|
||||||
value = {
|
value = {
|
||||||
for i in range(var.alpaca_vm_count) :
|
for i in range(var.control_plane_count) :
|
||||||
"alpaca-${i + 1}" => proxmox_vm_qemu.alpacas[i].vmid
|
"cp-${i + 1}" => proxmox_vm_qemu.control_planes[i].vmid
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
output "alpaca_vm_names" {
|
output "control_plane_vm_names" {
|
||||||
value = [for vm in proxmox_vm_qemu.alpacas : vm.name]
|
value = [for vm in proxmox_vm_qemu.control_planes : vm.name]
|
||||||
}
|
}
|
||||||
|
|
||||||
output "llama_vm_ids" {
|
output "control_plane_vm_ipv4" {
|
||||||
value = {
|
value = {
|
||||||
for i in range(var.llama_vm_count) :
|
for i in range(var.control_plane_count) :
|
||||||
"llama-${i + 1}" => proxmox_vm_qemu.llamas[i].vmid
|
proxmox_vm_qemu.control_planes[i].name => var.control_plane_ips[i]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
output "llama_vm_names" {
|
output "worker_vm_ids" {
|
||||||
value = [for vm in proxmox_vm_qemu.llamas : vm.name]
|
value = {
|
||||||
|
for i in range(var.worker_count) :
|
||||||
|
"wk-${i + 1}" => proxmox_vm_qemu.workers[i].vmid
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
output "worker_vm_names" {
|
||||||
|
value = [for vm in proxmox_vm_qemu.workers : vm.name]
|
||||||
|
}
|
||||||
|
|
||||||
|
output "worker_vm_ipv4" {
|
||||||
|
value = {
|
||||||
|
for i in range(var.worker_count) :
|
||||||
|
proxmox_vm_qemu.workers[i].name => var.worker_ips[i]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,25 @@
|
|||||||
target_node = "flex"
|
target_node = "flex"
|
||||||
clone_template = "nixos-template"
|
clone_template = "k8s-base-template"
|
||||||
cores = 1
|
|
||||||
memory = 1024
|
|
||||||
disk_size = "15G"
|
|
||||||
sockets = 1
|
|
||||||
bridge = "vmbr0"
|
bridge = "vmbr0"
|
||||||
storage = "Flash"
|
storage = "Flash"
|
||||||
pm_api_url = "https://100.105.0.115:8006/api2/json"
|
pm_api_url = "https://100.105.0.115:8006/api2/json"
|
||||||
pm_api_token_id = "terraform-prov@pve!mytoken"
|
pm_api_token_id = "terraform-prov@pve!mytoken"
|
||||||
|
|
||||||
|
control_plane_count = 3
|
||||||
|
worker_count = 3
|
||||||
|
control_plane_vmid_start = 701
|
||||||
|
worker_vmid_start = 711
|
||||||
|
|
||||||
|
control_plane_cores = 1
|
||||||
|
control_plane_memory_mb = 4096
|
||||||
|
control_plane_disk_size = "80G"
|
||||||
|
|
||||||
|
worker_cores = [4, 4, 4]
|
||||||
|
worker_memory_mb = [12288, 12288, 12288]
|
||||||
|
worker_disk_size = "120G"
|
||||||
|
|
||||||
|
network_prefix_length = 10
|
||||||
|
network_gateway = "10.27.27.1"
|
||||||
|
|
||||||
|
control_plane_ips = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
|
||||||
|
worker_ips = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
|
||||||
|
|||||||
@@ -27,20 +27,98 @@ variable "clone_template" {
|
|||||||
type = string
|
type = string
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "cores" {
|
variable "control_plane_count" {
|
||||||
type = number
|
type = number
|
||||||
|
default = 3
|
||||||
|
description = "Number of control plane VMs"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "memory" {
|
variable "worker_count" {
|
||||||
type = number
|
type = number
|
||||||
|
default = 3
|
||||||
|
description = "Number of worker VMs"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "disk_size" {
|
variable "control_plane_vmid_start" {
|
||||||
type = string
|
type = number
|
||||||
|
default = 701
|
||||||
|
description = "Starting VMID for control plane VMs"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "sockets" {
|
variable "worker_vmid_start" {
|
||||||
type = number
|
type = number
|
||||||
|
default = 711
|
||||||
|
description = "Starting VMID for worker VMs"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "control_plane_cores" {
|
||||||
|
type = number
|
||||||
|
default = 1
|
||||||
|
description = "vCPU cores per control plane VM"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "control_plane_memory_mb" {
|
||||||
|
type = number
|
||||||
|
default = 4096
|
||||||
|
description = "Memory in MB per control plane VM"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_cores" {
|
||||||
|
type = list(number)
|
||||||
|
default = [4, 4, 4]
|
||||||
|
description = "vCPU cores for each worker VM"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_memory_mb" {
|
||||||
|
type = list(number)
|
||||||
|
default = [12288, 12288, 12288]
|
||||||
|
description = "Memory in MB for each worker VM"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "control_plane_disk_size" {
|
||||||
|
type = string
|
||||||
|
default = "80G"
|
||||||
|
description = "Disk size for control plane VMs"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_disk_size" {
|
||||||
|
type = string
|
||||||
|
default = "120G"
|
||||||
|
description = "Disk size for worker VMs"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "network_prefix_length" {
|
||||||
|
type = number
|
||||||
|
default = 10
|
||||||
|
description = "CIDR prefix length for static VM addresses"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "network_gateway" {
|
||||||
|
type = string
|
||||||
|
default = "10.27.27.1"
|
||||||
|
description = "Gateway for static VM addresses"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "control_plane_ips" {
|
||||||
|
type = list(string)
|
||||||
|
default = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
|
||||||
|
description = "Static IPv4 addresses for control plane VMs"
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = length(var.control_plane_ips) == 3
|
||||||
|
error_message = "control_plane_ips must contain exactly 3 IPs."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_ips" {
|
||||||
|
type = list(string)
|
||||||
|
default = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
|
||||||
|
description = "Static IPv4 addresses for worker VMs"
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = length(var.worker_ips) == 3
|
||||||
|
error_message = "worker_ips must contain exactly 3 IPs."
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "bridge" {
|
variable "bridge" {
|
||||||
@@ -55,16 +133,10 @@ variable "pm_api_url" {
|
|||||||
type = string
|
type = string
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "alpaca_vm_count" {
|
variable "qemu_agent_enabled" {
|
||||||
type = number
|
type = bool
|
||||||
default = 1
|
default = false
|
||||||
description = "How many Alpaca VMs to create"
|
description = "Enable QEMU guest agent integration in Proxmox resources"
|
||||||
}
|
|
||||||
|
|
||||||
variable "llama_vm_count" {
|
|
||||||
type = number
|
|
||||||
default = 1
|
|
||||||
description = "How many Llama VMs to create"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "SSH_KEY_PUBLIC" {
|
variable "SSH_KEY_PUBLIC" {
|
||||||
|
|||||||
Reference in New Issue
Block a user