Compare commits: destroy...15e6471e7e (331 commits)
.gitea/workflows/kubeadm-bootstrap.yml (new file, 181 lines)
@@ -0,0 +1,181 @@
name: Kubeadm Bootstrap
run-name: ${{ gitea.actor }} requested kubeadm bootstrap

on:
  workflow_dispatch:
    inputs:
      confirm:
        description: "Type BOOTSTRAP to run rebuild + kubeadm bootstrap"
        required: true
        type: string

concurrency:
  group: kubeadm-bootstrap
  cancel-in-progress: false

jobs:
  bootstrap:
    name: "Rebuild and Bootstrap Cluster"
    runs-on: ubuntu-latest

    steps:
      - name: Validate confirmation phrase
        run: |
          if [ "${{ inputs.confirm }}" != "BOOTSTRAP" ]; then
            echo "Confirmation failed. You must type BOOTSTRAP."
            exit 1
          fi

      - name: Checkout repository
        uses: https://gitea.com/actions/checkout@v4

      - name: Create SSH key
        run: |
          install -m 0700 -d ~/.ssh
          KEY_SOURCE=""
          KEY_CONTENT=""
          KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
          if [ -n "$KEY_B64" ]; then
            KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
            KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
          else
            KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
            if [ -n "$KEY_CONTENT" ]; then
              KEY_SOURCE="SSH_KEY_PRIVATE"
            else
              KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
              KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
            fi
          fi

          if [ -z "$KEY_CONTENT" ]; then
            echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
            exit 1
          fi

          KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
          if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
            printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          else
            printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          fi
          chmod 0600 ~/.ssh/id_ed25519

          if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
            echo "Invalid private key content from $KEY_SOURCE"
            exit 1
          fi

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.6.6
          terraform_wrapper: false

      - name: Build Terraform backend files
        working-directory: terraform
        run: |
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
          EOF

          cat > backend.hcl << EOF
          bucket = "${{ secrets.B2_TF_BUCKET }}"
          key = "terraform.tfstate"
          region = "us-east-005"
          endpoints = {
            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
          }
          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
          skip_credentials_validation = true
          skip_metadata_api_check = true
          skip_region_validation = true
          skip_requesting_account_id = true
          use_path_style = true
          EOF

      - name: Terraform init for state read
        working-directory: terraform
        run: terraform init -reconfigure -backend-config=backend.hcl

      - name: Create kubeadm inventory
        env:
          KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
        run: |
          set -euo pipefail
          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env

      - name: Validate nix installation
        run: |
          if [ -x /nix/var/nix/profiles/default/bin/nix ]; then
            /nix/var/nix/profiles/default/bin/nix --version
            exit 0
          fi

          if command -v nix >/dev/null 2>&1; then
            nix --version
            exit 0
          fi

          echo "Nix missing; installing no-daemon Nix for this runner job"
          if [ "$(id -u)" -eq 0 ]; then
            mkdir -p /nix
            chown root:root /nix
            chmod 0755 /nix

            if ! getent group nixbld >/dev/null 2>&1; then
              groupadd --system nixbld
            fi

            for i in $(seq 1 10); do
              if ! id "nixbld$i" >/dev/null 2>&1; then
                useradd --system --create-home --home-dir /var/empty --shell /usr/sbin/nologin "nixbld$i"
              fi
              usermod -a -G nixbld "nixbld$i"
            done
          fi
          sh <(curl -L https://nixos.org/nix/install) --no-daemon

          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
            . /root/.nix-profile/etc/profile.d/nix.sh
          fi

          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"

          nix --version

      - name: Install nixos-rebuild tool
        env:
          NIX_CONFIG: experimental-features = nix-command flakes
        run: |
          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
            . /root/.nix-profile/etc/profile.d/nix.sh
          fi

          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"

          nix profile install nixpkgs#nixos-rebuild

      - name: Run cluster rebuild and bootstrap
        env:
          NIX_CONFIG: experimental-features = nix-command flakes
          FAST_MODE: "1"
          WORKER_PARALLELISM: "3"
          REBUILD_TIMEOUT: "45m"
          REBUILD_RETRIES: "2"
        run: |
          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
            . /root/.nix-profile/etc/profile.d/nix.sh
          fi

          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"

          ./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
.gitea/workflows/kubeadm-reset.yml (new file, 112 lines)
@@ -0,0 +1,112 @@
name: Kubeadm Reset
run-name: ${{ gitea.actor }} requested kubeadm reset

on:
  workflow_dispatch:
    inputs:
      confirm:
        description: "Type RESET to run kubeadm reset on all nodes"
        required: true
        type: string

concurrency:
  group: kubeadm-bootstrap
  cancel-in-progress: false

jobs:
  reset:
    name: "Reset Cluster Nodes"
    runs-on: ubuntu-latest

    steps:
      - name: Validate confirmation phrase
        run: |
          if [ "${{ inputs.confirm }}" != "RESET" ]; then
            echo "Confirmation failed. You must type RESET."
            exit 1
          fi

      - name: Checkout repository
        uses: https://gitea.com/actions/checkout@v4

      - name: Create SSH key
        run: |
          install -m 0700 -d ~/.ssh
          KEY_SOURCE=""
          KEY_CONTENT=""
          KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
          if [ -n "$KEY_B64" ]; then
            KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
            KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
          else
            KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
            if [ -n "$KEY_CONTENT" ]; then
              KEY_SOURCE="SSH_KEY_PRIVATE"
            else
              KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
              KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
            fi
          fi

          if [ -z "$KEY_CONTENT" ]; then
            echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
            exit 1
          fi

          KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
          if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
            printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          else
            printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          fi
          chmod 0600 ~/.ssh/id_ed25519

          if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
            echo "Invalid private key content from $KEY_SOURCE"
            exit 1
          fi

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.6.6
          terraform_wrapper: false

      - name: Build Terraform backend files
        working-directory: terraform
        run: |
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
          EOF

          cat > backend.hcl << EOF
          bucket = "${{ secrets.B2_TF_BUCKET }}"
          key = "terraform.tfstate"
          region = "us-east-005"
          endpoints = {
            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
          }
          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
          skip_credentials_validation = true
          skip_metadata_api_check = true
          skip_region_validation = true
          skip_requesting_account_id = true
          use_path_style = true
          EOF

      - name: Terraform init for state read
        working-directory: terraform
        run: terraform init -reconfigure -backend-config=backend.hcl

      - name: Create kubeadm inventory
        env:
          KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
        run: |
          set -euo pipefail
          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env

      - name: Run cluster reset
        run: |
          ./nixos/kubeadm/scripts/reset-cluster-nodes.sh
@@ -1,47 +1,209 @@
-name: Gitea Actions Demo
-run-name: ${{ gitea.actor }} is deploying with Terraform 🚀
name: Terraform Apply

on:
  push:
    branches:
      - master

concurrency:
  group: terraform-global
  cancel-in-progress: false

jobs:
  terraform:
    name: "Terraform Apply"
    runs-on: ubuntu-latest

    permissions:
      contents: read
      pull-requests: write

    env:
      TF_VAR_TS_AUTHKEY: ${{ secrets.TAILSCALE_KEY }}
      TF_VAR_ssh_key: ${{ secrets.SSH_PUBLIC_KEY }}

    steps:
      - name: Checkout repository
-       uses: actions/checkout@v4
        uses: https://gitea.com/actions/checkout@v4

      - name: Create secrets.tfvars
        working-directory: terraform
        run: |
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
          EOF
          cat > backend.hcl << EOF
          bucket = "${{ secrets.B2_TF_BUCKET }}"
          key = "terraform.tfstate"
          region = "us-east-005"
          endpoints = {
            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
          }
          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
          skip_credentials_validation = true
          skip_metadata_api_check = true
          skip_region_validation = true
          skip_requesting_account_id = true
          use_path_style = true
          EOF

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.6.6

-     - name: Inject sensitive secrets
-       working-directory: terraform
-       run: |
-         echo 'proxmox_password = "${{ secrets.PROXMOX_PASSWORD }}"' >> terraform.tfvars
          terraform_wrapper: false

      - name: Terraform Init
        working-directory: terraform
-       run: terraform init
        run: terraform init -reconfigure -backend-config=backend.hcl

      - name: Terraform Plan
        working-directory: terraform
-       run: terraform plan
        run: |
          set -euo pipefail
          for attempt in 1 2; do
            echo "Terraform plan attempt $attempt/2"
            if timeout 20m terraform plan -refresh=false -parallelism=1 -out=tfplan; then
              exit 0
            fi
            if [ "$attempt" -eq 1 ]; then
              echo "Plan attempt failed or timed out; retrying in 20s"
              sleep 20
            fi
          done
          echo "Terraform plan failed after retries"
          exit 1

      - name: Block accidental destroy
        env:
          ALLOW_TF_DESTROY: ${{ secrets.ALLOW_TF_DESTROY }}
        working-directory: terraform
        run: |
          terraform show -json -no-color tfplan > tfplan.json
          DESTROY_COUNT=$(python3 -c 'import json; raw=open("tfplan.json","rb").read().decode("utf-8","ignore"); start=raw.find("{"); data=json.JSONDecoder().raw_decode(raw[start:])[0]; print(sum(1 for rc in data.get("resource_changes", []) if "delete" in rc.get("change", {}).get("actions", [])))')
          echo "Planned deletes: $DESTROY_COUNT"
          if [ "$DESTROY_COUNT" -gt 0 ] && [ "${ALLOW_TF_DESTROY}" != "true" ]; then
            echo "Destroy actions detected. Set ALLOW_TF_DESTROY=true to allow."
            exit 1
          fi

      - name: Terraform Apply
        working-directory: terraform
-       run: terraform apply -auto-approve
        run: terraform apply -parallelism=1 -auto-approve tfplan

      - name: Create SSH key
        run: |
          install -m 0700 -d ~/.ssh
          KEY_SOURCE=""
          KEY_CONTENT=""
          KEY_B64="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE_BASE64 }}")"
          if [ -n "$KEY_B64" ]; then
            KEY_SOURCE="SSH_KEY_PRIVATE_BASE64"
            KEY_CONTENT="$(printf '%s' "$KEY_B64" | base64 -d)"
          else
            KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")"
            if [ -n "$KEY_CONTENT" ]; then
              KEY_SOURCE="SSH_KEY_PRIVATE"
            else
              KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")"
              KEY_SOURCE="KUBEADM_SSH_PRIVATE_KEY"
            fi
          fi

          if [ -z "$KEY_CONTENT" ]; then
            echo "Missing SSH private key secret. Set SSH_KEY_PRIVATE_BASE64, SSH_KEY_PRIVATE, or KUBEADM_SSH_PRIVATE_KEY."
            exit 1
          fi

          KEY_CONTENT="$(printf '%s' "$KEY_CONTENT" | tr -d '\r')"
          if printf '%s' "$KEY_CONTENT" | grep -q '\\n'; then
            printf '%b' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          else
            printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519
          fi
          chmod 0600 ~/.ssh/id_ed25519

          if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/dev/null 2>&1; then
            echo "Invalid private key content from $KEY_SOURCE"
            exit 1
          fi

      - name: Verify SSH keypair match
        run: |
          if ! ssh-keygen -y -f ~/.ssh/id_ed25519 >/tmp/key.pub 2>/tmp/key.err; then
            echo "Invalid private key content in SSH_KEY_PRIVATE/KUBEADM_SSH_PRIVATE_KEY"
            cat /tmp/key.err
            exit 1
          fi

          printf '%s\n' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r' > /tmp/secret.pub
          if ! ssh-keygen -lf /tmp/secret.pub >/tmp/secret.fp 2>/tmp/secret.err; then
            echo "Invalid SSH_KEY_PUBLIC format"
            cat /tmp/secret.err
            exit 1
          fi

          PRIV_FP="$(ssh-keygen -lf /tmp/key.pub | awk '{print $2}')"
          PUB_FP="$(awk '{print $2}' /tmp/secret.fp)"

          echo "private fingerprint: $PRIV_FP"
          echo "public fingerprint: $PUB_FP"

          if [ "$PRIV_FP" != "$PUB_FP" ]; then
            echo "SSH_KEY_PRIVATE does not match SSH_KEY_PUBLIC. Update secrets with the same keypair."
            exit 1
          fi

      - name: Create kubeadm inventory from Terraform outputs
        env:
          KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
        run: |
          set -euo pipefail
          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env

      - name: Ensure nix and nixos-rebuild
        env:
          NIX_CONFIG: experimental-features = nix-command flakes
        run: |
          if [ ! -x /nix/var/nix/profiles/default/bin/nix ] && ! command -v nix >/dev/null 2>&1; then
            if [ "$(id -u)" -eq 0 ]; then
              mkdir -p /nix
              chown root:root /nix
              chmod 0755 /nix

              if ! getent group nixbld >/dev/null 2>&1; then
                groupadd --system nixbld
              fi

              for i in $(seq 1 10); do
                if ! id "nixbld$i" >/dev/null 2>&1; then
                  useradd --system --create-home --home-dir /var/empty --shell /usr/sbin/nologin "nixbld$i"
                fi
                usermod -a -G nixbld "nixbld$i"
              done
            fi
            sh <(curl -L https://nixos.org/nix/install) --no-daemon
          fi

          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
            . /root/.nix-profile/etc/profile.d/nix.sh
          fi

          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:$PATH"

          nix --version
          nix profile install nixpkgs#nixos-rebuild

      - name: Rebuild and bootstrap/reconcile kubeadm cluster
        env:
          NIX_CONFIG: experimental-features = nix-command flakes
          FAST_MODE: "1"
          WORKER_PARALLELISM: "3"
          REBUILD_TIMEOUT: "45m"
          REBUILD_RETRIES: "2"
        run: |
          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
          elif [ -f "/root/.nix-profile/etc/profile.d/nix.sh" ]; then
            . /root/.nix-profile/etc/profile.d/nix.sh
          fi

          export PATH="$HOME/.nix-profile/bin:/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"

          ./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
@@ -1,43 +1,126 @@
-name: Gitea Destroy Terraform
-run-name: ${{ gitea.actor }} triggered a Terraform Destroy 🧨
name: Terraform Destroy
run-name: ${{ gitea.actor }} requested Terraform destroy

on:
  push:
    branches:
      - destroy
  workflow_dispatch:
    inputs:
      confirm:
        description: "Type NUKE to confirm destroy"
        required: true
        type: string
      target:
        description: "Destroy scope"
        required: true
        default: all
        type: choice
        options:
          - all
          - control-planes
          - workers

concurrency:
  group: terraform-global
  cancel-in-progress: false

jobs:
  destroy:
    name: "Terraform Destroy"
    runs-on: ubuntu-latest

    permissions:
      contents: read
      pull-requests: write

    env:
      TF_VAR_TS_AUTHKEY: ${{ secrets.TAILSCALE_KEY }}
      TF_VAR_ssh_key: ${{ secrets.SSH_PUBLIC_KEY }}

    steps:
      - name: Validate confirmation phrase
        run: |
          if [ "${{ inputs.confirm }}" != "NUKE" ]; then
            echo "Confirmation failed. You must type NUKE."
            exit 1
          fi

      - name: Checkout repository
-       uses: actions/checkout@v4
        uses: https://gitea.com/actions/checkout@v4

      - name: Create Terraform secret files
        working-directory: terraform
        run: |
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
          EOF
          cat > backend.hcl << EOF
          bucket = "${{ secrets.B2_TF_BUCKET }}"
          key = "terraform.tfstate"
          region = "us-east-005"
          endpoints = {
            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
          }
          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
          skip_credentials_validation = true
          skip_metadata_api_check = true
          skip_region_validation = true
          skip_requesting_account_id = true
          use_path_style = true
          EOF

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.6.6

-     - name: Inject sensitive secrets
-       working-directory: terraform
-       run: |
-         echo 'proxmox_password = "${{ secrets.PROXMOX_PASSWORD }}"' >> terraform.tfvars
          terraform_wrapper: false

      - name: Terraform Init
        working-directory: terraform
-       run: terraform init
        run: terraform init -reconfigure -backend-config=backend.hcl

-     - name: Terraform Destroy
      - name: Terraform Destroy Plan
        working-directory: terraform
-       run: terraform destroy -auto-approve
        run: |
          set -euo pipefail
          case "${{ inputs.target }}" in
            all)
              TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -out=tfdestroy"
              ;;
            control-planes)
              TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -target=proxmox_vm_qemu.control_planes -out=tfdestroy"
              ;;
            workers)
              TF_PLAN_CMD="terraform plan -refresh=false -parallelism=1 -destroy -target=proxmox_vm_qemu.workers -out=tfdestroy"
              ;;
            *)
              echo "Invalid destroy target: ${{ inputs.target }}"
              exit 1
              ;;
          esac

          for attempt in 1 2; do
            echo "Terraform destroy plan attempt $attempt/2"
            if timeout 20m sh -c "$TF_PLAN_CMD"; then
              exit 0
            fi
            if [ "$attempt" -eq 1 ]; then
              echo "Destroy plan attempt failed or timed out; retrying in 20s"
              sleep 20
            fi
          done

          echo "Terraform destroy plan failed after retries"
          exit 1

      - name: Terraform Destroy Apply
        working-directory: terraform
        run: |
          set +e
          terraform apply -auto-approve tfdestroy 2>&1 | tee destroy-apply.log
          APPLY_EXIT=${PIPESTATUS[0]}

          if [ "$APPLY_EXIT" -ne 0 ] && [ -f errored.tfstate ] && grep -q "Failed to persist state to backend" destroy-apply.log; then
            echo "Detected backend state write failure after destroy; attempting recovery push..."
            terraform state push errored.tfstate
            PUSH_EXIT=$?

            if [ "$PUSH_EXIT" -eq 0 ]; then
              echo "Recovered by pushing errored.tfstate to backend."
              exit 0
            fi
          fi

          exit "$APPLY_EXIT"
@@ -1,5 +1,4 @@
-name: Gitea Actions Demo
-run-name: ${{ gitea.actor }} is testing out Gitea Actions 🚀
name: Terraform Plan

on:
  push:
@@ -7,38 +6,56 @@ on:
      - stage
      - test

concurrency:
  group: terraform-global
  cancel-in-progress: false

jobs:
  terraform:
    name: "Terraform Plan"
    runs-on: ubuntu-latest

    permissions:
      contents: read
      pull-requests: write

    env:
-     TF_VAR_TAILSCALE_KEY: ${{ secrets.TAILSCALE_KEY }}
      TF_VAR_TS_AUTHKEY: ${{ secrets.TAILSCALE_KEY }}
      TF_VAR_ssh_key: ${{ secrets.SSH_PUBLIC_KEY }}

    steps:
      - name: Checkout repository
-       uses: actions/checkout@v4
        uses: https://gitea.com/actions/checkout@v4

      - name: Create secrets.tfvars
        working-directory: terraform
        run: |
          echo "PM_API_TOKEN_SECRET length: $(echo -n '${{ secrets.PM_API_TOKEN_SECRET }}' | wc -c)"
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
          EOF
          cat > backend.hcl << EOF
          bucket = "${{ secrets.B2_TF_BUCKET }}"
          key = "terraform.tfstate"
          region = "us-east-005"
          endpoints = {
            s3 = "${{ secrets.B2_TF_ENDPOINT }}"
          }
          access_key = "$(printf '%s' "${{ secrets.B2_KEY_ID }}" | tr -d '\r\n')"
          secret_key = "$(printf '%s' "${{ secrets.B2_APPLICATION_KEY }}" | tr -d '\r\n')"
          skip_credentials_validation = true
          skip_metadata_api_check = true
          skip_region_validation = true
          skip_requesting_account_id = true
          use_path_style = true
          EOF
          echo "Created secrets.auto.tfvars:"
          cat secrets.auto.tfvars | sed 's/=.*/=***/'
          echo "Using token ID from terraform.tfvars:"
          grep '^pm_api_token_id' terraform.tfvars

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.6.6

-     - name: Inject sensitive secrets
-       working-directory: terraform
-       run: |
-         echo 'proxmox_password = "${{ secrets.PROXMOX_PASSWORD }}"' >> terraform.tfvars
          terraform_wrapper: false

      - name: Terraform Init
        working-directory: terraform
-       run: terraform init
        run: terraform init -reconfigure -backend-config=backend.hcl

      - name: Terraform Format Check
        working-directory: terraform
@@ -50,11 +67,35 @@ jobs:
      - name: Terraform Plan
        working-directory: terraform
-       run: terraform plan -out=tfplan
        run: |
          set -euo pipefail
          for attempt in 1 2; do
            echo "Terraform plan attempt $attempt/2"
            if timeout 20m terraform plan -refresh=false -parallelism=1 -out=tfplan; then
              exit 0
            fi
            if [ "$attempt" -eq 1 ]; then
              echo "Plan attempt failed or timed out; retrying in 20s"
              sleep 20
            fi
          done
          echo "Terraform plan failed after retries"
          exit 1

-     - name: Upload Terraform Plan
-       uses: actions/upload-artifact@v3
-       with:
-         name: terraform-plan
-         path: terraform/tfplan
      - name: Block accidental destroy
        env:
          ALLOW_TF_DESTROY: ${{ secrets.ALLOW_TF_DESTROY }}
        working-directory: terraform
        run: |
          terraform show -json -no-color tfplan > tfplan.json
          DESTROY_COUNT=$(python3 -c 'import json; raw=open("tfplan.json","rb").read().decode("utf-8","ignore"); start=raw.find("{"); data=json.JSONDecoder().raw_decode(raw[start:])[0]; print(sum(1 for rc in data.get("resource_changes", []) if "delete" in rc.get("change", {}).get("actions", [])))')
          echo "Planned deletes: $DESTROY_COUNT"
          if [ "$DESTROY_COUNT" -gt 0 ] && [ "${ALLOW_TF_DESTROY}" != "true" ]; then
            echo "Destroy actions detected. Set ALLOW_TF_DESTROY=true to allow."
            exit 1
          fi

      # NOTE: Disabled artifact upload for now.
      # On this Gitea/act runner, post-job hooks from artifact actions can
      # fail during "Complete job" even when all Terraform steps succeeded.
      # Re-enable once runner/action compatibility is confirmed.
.gitignore (vendored, 4 lines)
@@ -1,2 +1,6 @@
-./terraform/.terraform
terraform/.terraform/
terraform/test-apply.sh
terraform/test-plan.sh
terraform/test-destroy.sh
terraform/tfplan
nixos/kubeadm/README.md (new file, 169 lines)
@@ -0,0 +1,169 @@
# Kubeadm Cluster Layout (NixOS)

This folder defines role-based NixOS configs for a kubeadm cluster.

## Topology

- Control planes: `cp-1`, `cp-2`, `cp-3`
- Workers: `wk-1`, `wk-2`, `wk-3`

## What this provides

- Shared Kubernetes/node prerequisites in `modules/k8s-common.nix`
- Shared cluster defaults in `modules/k8s-cluster-settings.nix`
- Role-specific settings for control planes and workers
- Generated per-node host configs from `flake.nix` (no duplicated host files)
- Bootstrap helper commands on each node:
  - `th-kubeadm-init`
  - `th-kubeadm-join-control-plane`
  - `th-kubeadm-join-worker`
  - `th-kubeadm-status`
- A Python bootstrap controller for orchestration:
  - `bootstrap/controller.py`

## Layered architecture

- `terraform/`: VM lifecycle only
- `nixos/kubeadm/modules/`: declarative node OS config only
- `nixos/kubeadm/bootstrap/controller.py`: imperative cluster reconciliation state machine

## Hardware config files

The flake automatically imports `hosts/hardware/<host>.nix` if present.
Copy each node's generated hardware config into this folder:

```bash
sudo nixos-generate-config
sudo cp /etc/nixos/hardware-configuration.nix ./hosts/hardware/cp-1.nix
```

Repeat for each node (`cp-2`, `cp-3`, `wk-1`, `wk-2`, `wk-3`).

## Deploy approach

Start from one node at a time while experimenting:

```bash
sudo nixos-rebuild switch --flake .#cp-1
```

For remote target-host workflows, use your preferred deploy wrapper later
(`nixos-rebuild --target-host ...` or deploy-rs/colmena).

## Bootstrap runbook (kubeadm + kube-vip + Flannel)

1. Apply Nix config on all nodes (`cp-*`, then `wk-*`).
2. On `cp-1`, run:

```bash
sudo th-kubeadm-init
```

This infers the control-plane VIP as `<node-subnet>.250` on `eth0`, creates the
kube-vip static pod manifest, and runs `kubeadm init`.

3. Install Flannel from `cp-1`:

```bash
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/v0.25.5/Documentation/kube-flannel.yml
```

4. Generate join commands on `cp-1`:

```bash
sudo kubeadm token create --print-join-command
sudo kubeadm init phase upload-certs --upload-certs
```

5. Join `cp-2` and `cp-3`:

```bash
sudo th-kubeadm-join-control-plane '<kubeadm join ... --control-plane --certificate-key ...>'
```

6. Join workers:

```bash
sudo th-kubeadm-join-worker '<kubeadm join ...>'
```

7. Validate from a control plane:

```bash
kubectl get nodes -o wide
kubectl -n kube-system get pods -o wide
```

## Fresh bootstrap flow (recommended)

1. Copy and edit inventory:

```bash
cp ./scripts/inventory.example.env ./scripts/inventory.env
$EDITOR ./scripts/inventory.env
```
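The exact contents of `inventory.example.env` are not shown in this change; as a rough sketch, `bootstrap/controller.py` reads the file as shell-style environment variables with space-separated `name=ip` pairs plus optional SSH settings, so an edited `inventory.env` could look like this (hypothetical addresses):

```bash
# Hypothetical inventory.env — replace the addresses with your own nodes.
# controller.py accepts either CONTROL_PLANES/WORKERS pairs or CP_1/WK_1-style variables.
CONTROL_PLANES="cp-1=192.168.1.41 cp-2=192.168.1.42 cp-3=192.168.1.43"
WORKERS="wk-1=192.168.1.51 wk-2=192.168.1.52 wk-3=192.168.1.53"
PRIMARY_CONTROL_PLANE="cp-1"
SSH_USER="micqdf"
SSH_KEY_PATH="$HOME/.ssh/id_ed25519"
```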

2. Rebuild all nodes and bootstrap a fresh cluster:

```bash
./scripts/rebuild-and-bootstrap.sh
```

Optional tuning env vars:

```bash
FAST_MODE=1 WORKER_PARALLELISM=3 REBUILD_TIMEOUT=45m REBUILD_RETRIES=2 ./scripts/rebuild-and-bootstrap.sh
```

- `FAST_MODE=1` skips pre-rebuild remote GC cleanup to reduce wall-clock time.
- Set `FAST_MODE=0` for a slower but more aggressive space cleanup pass.

### Bootstrap controller state

The controller stores checkpoints in two places:

- Remote (source of truth): `/var/lib/terrahome/bootstrap-state.json` on `cp-1`
- Local copy (workflow/debug artifact): `nixos/kubeadm/bootstrap/bootstrap-state-last.json`

This makes retries resumable and keeps failure context visible from CI.
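Individual stages can also be re-run by hand when a bootstrap stops partway; a sketch using the stage names exposed by `bootstrap/controller.py` and assuming the inventory path above:

```bash
# Re-run only the CNI install and final verification against an existing inventory.
./bootstrap/controller.py install-cni --inventory ./scripts/inventory.env
./bootstrap/controller.py verify --inventory ./scripts/inventory.env

# Or run the full idempotent reconcile from the top.
./bootstrap/controller.py reconcile
```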

3. If you only want to reset Kubernetes state on existing VMs:

```bash
./scripts/reset-cluster-nodes.sh
```

For a full nuke/recreate lifecycle:
- run Terraform destroy/apply for VMs first,
- then run `./scripts/rebuild-and-bootstrap.sh` again.

Node lists now come directly from static Terraform outputs, so bootstrap no longer
depends on Proxmox guest-agent IP discovery or SSH subnet scanning.

## Optional Gitea workflow automation

Primary flow:

- Push to `master` triggers `.gitea/workflows/terraform-apply.yml`
- That workflow now does Terraform apply and then runs a fresh kubeadm bootstrap automatically

Manual dispatch workflows are available:

- `.gitea/workflows/kubeadm-bootstrap.yml`
- `.gitea/workflows/kubeadm-reset.yml`

Required repository secrets:

- Existing Terraform/backend secrets used by current workflows (`B2_*`, `PM_API_TOKEN_SECRET`, `SSH_KEY_PUBLIC`)
- SSH private key: prefer `KUBEADM_SSH_PRIVATE_KEY`, fallback to existing `SSH_KEY_PRIVATE`

Optional secrets:

- `KUBEADM_SSH_USER` (defaults to `micqdf`)

Node IPs are rendered directly from static Terraform outputs (`control_plane_vm_ipv4`, `worker_vm_ipv4`), so you do not need per-node IP secrets or SSH discovery fallbacks.

## Notes

- Scripts are intentionally manual-triggered (predictable for homelab bring-up).
- If `.250` on the node subnet is already in use, change `controlPlaneVipSuffix`
  in `modules/k8s-cluster-settings.nix` before bootstrap.
nixos/kubeadm/bootstrap/controller.py (new executable file, 446 lines)
@@ -0,0 +1,446 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def run_local(cmd, check=True, capture=False):
|
||||
if isinstance(cmd, str):
|
||||
shell = True
|
||||
else:
|
||||
shell = False
|
||||
return subprocess.run(
|
||||
cmd,
|
||||
shell=shell,
|
||||
check=check,
|
||||
text=True,
|
||||
capture_output=capture,
|
||||
)
|
||||
|
||||
|
||||
def load_inventory(inventory_file):
|
||||
inventory_file = Path(inventory_file).resolve()
|
||||
if not inventory_file.exists():
|
||||
raise RuntimeError(f"Missing inventory file: {inventory_file}")
|
||||
cmd = (
|
||||
"set -a; "
|
||||
f"source {shlex.quote(str(inventory_file))}; "
|
||||
"python3 - <<'PY'\n"
|
||||
"import json, os\n"
|
||||
"print(json.dumps(dict(os.environ)))\n"
|
||||
"PY"
|
||||
)
|
||||
proc = run_local(["bash", "-lc", cmd], capture=True)
|
||||
env = json.loads(proc.stdout)
|
||||
|
||||
node_ips = {}
|
||||
cp_names = []
|
||||
wk_names = []
|
||||
|
||||
control_planes = env.get("CONTROL_PLANES", "").strip()
|
||||
workers = env.get("WORKERS", "").strip()
|
||||
|
||||
if control_planes:
|
||||
for pair in control_planes.split():
|
||||
name, ip = pair.split("=", 1)
|
||||
node_ips[name] = ip
|
||||
cp_names.append(name)
|
||||
else:
|
||||
for key in sorted(k for k in env if k.startswith("CP_") and k[3:].isdigit()):
|
||||
idx = key.split("_", 1)[1]
|
||||
name = f"cp-{idx}"
|
||||
node_ips[name] = env[key]
|
||||
cp_names.append(name)
|
||||
|
||||
if workers:
|
||||
for pair in workers.split():
|
||||
name, ip = pair.split("=", 1)
|
||||
node_ips[name] = ip
|
||||
wk_names.append(name)
|
||||
else:
|
||||
for key in sorted(k for k in env if k.startswith("WK_") and k[3:].isdigit()):
|
||||
idx = key.split("_", 1)[1]
|
||||
name = f"wk-{idx}"
|
||||
node_ips[name] = env[key]
|
||||
wk_names.append(name)
|
||||
|
||||
if not cp_names or not wk_names:
|
||||
raise RuntimeError("Inventory must include control planes and workers")
|
||||
|
||||
primary_cp = env.get("PRIMARY_CONTROL_PLANE", "cp-1")
|
||||
if primary_cp not in node_ips:
|
||||
primary_cp = cp_names[0]
|
||||
|
||||
return {
|
||||
"env": env,
|
||||
"node_ips": node_ips,
|
||||
"cp_names": cp_names,
|
||||
"wk_names": wk_names,
|
||||
"primary_cp": primary_cp,
|
||||
"inventory_file": str(inventory_file),
|
||||
}
|
||||
|
||||
|
||||
class Controller:
|
||||
def __init__(self, cfg):
|
||||
self.env = cfg["env"]
|
||||
self.node_ips = cfg["node_ips"]
|
||||
self.cp_names = cfg["cp_names"]
|
||||
self.wk_names = cfg["wk_names"]
|
||||
self.primary_cp = cfg["primary_cp"]
|
||||
self.primary_ip = self.node_ips[self.primary_cp]
|
||||
|
||||
self.script_dir = Path(__file__).resolve().parent
|
||||
self.flake_dir = Path(self.env.get("FLAKE_DIR") or (self.script_dir.parent)).resolve()
|
||||
|
||||
self.ssh_user = self.env.get("SSH_USER", "micqdf")
|
||||
self.ssh_candidates = self.env.get("SSH_USER_CANDIDATES", f"root {self.ssh_user}").split()
|
||||
self.active_ssh_user = self.ssh_user
|
||||
self.ssh_key = self.env.get("SSH_KEY_PATH", str(Path.home() / ".ssh" / "id_ed25519"))
|
||||
self.ssh_opts = [
|
||||
"-o",
|
||||
"BatchMode=yes",
|
||||
"-o",
|
||||
"IdentitiesOnly=yes",
|
||||
"-o",
|
||||
"StrictHostKeyChecking=accept-new",
|
||||
"-i",
|
||||
self.ssh_key,
|
||||
]
|
||||
|
||||
self.rebuild_timeout = self.env.get("REBUILD_TIMEOUT", "45m")
|
||||
self.rebuild_retries = int(self.env.get("REBUILD_RETRIES", "2"))
|
||||
self.worker_parallelism = int(self.env.get("WORKER_PARALLELISM", "3"))
|
||||
self.fast_mode = self.env.get("FAST_MODE", "1")
|
||||
self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
|
||||
self.force_reinit = True
|
||||
self.ssh_ready_retries = int(self.env.get("SSH_READY_RETRIES", "20"))
|
||||
self.ssh_ready_delay = int(self.env.get("SSH_READY_DELAY_SEC", "15"))
|
||||
|
||||
def log(self, msg):
|
||||
print(f"==> {msg}")
|
||||
|
||||
def _ssh(self, user, ip, cmd, check=True):
|
||||
full = ["ssh", *self.ssh_opts, f"{user}@{ip}", f"bash -lc {shlex.quote(cmd)}"]
|
||||
return run_local(full, check=check, capture=True)
|
||||
|
||||
def detect_user(self, ip):
|
||||
for attempt in range(1, self.ssh_ready_retries + 1):
|
||||
for user in self.ssh_candidates:
|
||||
proc = self._ssh(user, ip, "true", check=False)
|
||||
if proc.returncode == 0:
|
||||
self.active_ssh_user = user
|
||||
self.log(f"Using SSH user '{user}' for {ip}")
|
||||
return
|
||||
if attempt < self.ssh_ready_retries:
|
||||
self.log(
|
||||
f"SSH not ready on {ip} yet; retrying in {self.ssh_ready_delay}s "
|
||||
f"({attempt}/{self.ssh_ready_retries})"
|
||||
)
|
||||
time.sleep(self.ssh_ready_delay)
|
||||
raise RuntimeError(f"Unable to authenticate to {ip} with users: {', '.join(self.ssh_candidates)}")
|
||||
|
||||
def remote(self, ip, cmd, check=True):
|
||||
ordered = [self.active_ssh_user] + [u for u in self.ssh_candidates if u != self.active_ssh_user]
|
||||
last = None
|
||||
for user in ordered:
|
||||
proc = self._ssh(user, ip, cmd, check=False)
|
||||
if proc.returncode == 0:
|
||||
self.active_ssh_user = user
|
||||
return proc
|
||||
if proc.returncode != 255:
|
||||
last = proc
|
||||
break
|
||||
last = proc
|
||||
if check:
|
||||
stdout = (last.stdout or "").strip()
|
||||
stderr = (last.stderr or "").strip()
|
||||
raise RuntimeError(f"Remote command failed on {ip}: {cmd}\n{stdout}\n{stderr}")
|
||||
return last
|
||||
|
||||
def prepare_known_hosts(self):
|
||||
ssh_dir = Path.home() / ".ssh"
|
||||
ssh_dir.mkdir(parents=True, exist_ok=True)
|
||||
(ssh_dir / "known_hosts").touch()
|
||||
run_local(["chmod", "700", str(ssh_dir)])
|
||||
run_local(["chmod", "600", str(ssh_dir / "known_hosts")])
|
||||
for ip in self.node_ips.values():
|
||||
run_local(["ssh-keygen", "-R", ip], check=False)
|
||||
run_local(f"ssh-keyscan -H {shlex.quote(ip)} >> {shlex.quote(str(ssh_dir / 'known_hosts'))}", check=False)
|
||||
|
||||
def prepare_remote_nix(self, ip):
|
||||
self.remote(ip, "sudo mkdir -p /etc/nix")
|
||||
self.remote(ip, "if [ -f /etc/nix/nix.conf ]; then sudo sed -i '/^trusted-users[[:space:]]*=/d' /etc/nix/nix.conf; fi")
|
||||
self.remote(ip, "echo 'trusted-users = root micqdf' | sudo tee -a /etc/nix/nix.conf >/dev/null")
|
||||
self.remote(ip, "sudo systemctl restart nix-daemon 2>/dev/null || true")
|
||||
|
||||
def prepare_remote_kubelet(self, ip):
|
||||
self.remote(ip, "sudo systemctl stop kubelet >/dev/null 2>&1 || true")
|
||||
self.remote(ip, "sudo systemctl disable kubelet >/dev/null 2>&1 || true")
|
||||
self.remote(ip, "sudo systemctl mask kubelet >/dev/null 2>&1 || true")
|
||||
self.remote(ip, "sudo systemctl reset-failed kubelet >/dev/null 2>&1 || true")
|
||||
self.remote(ip, "sudo rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env || true")
|
||||
|
||||
def prepare_remote_space(self, ip):
|
||||
self.remote(ip, "sudo nix-collect-garbage -d || true")
|
||||
self.remote(ip, "sudo nix --extra-experimental-features nix-command store gc || true")
|
||||
self.remote(ip, "sudo rm -rf /tmp/nix* /tmp/nixos-rebuild* || true")
|
||||
|
||||
def rebuild_node_once(self, name, ip):
|
||||
self.detect_user(ip)
|
||||
cmd = [
|
||||
"timeout",
|
||||
self.rebuild_timeout,
|
||||
"nixos-rebuild",
|
||||
"switch",
|
||||
"--flake",
|
||||
f"{self.flake_dir}#{name}",
|
||||
"--target-host",
|
||||
f"{self.active_ssh_user}@{ip}",
|
||||
"--use-remote-sudo",
|
||||
]
|
||||
env = os.environ.copy()
|
||||
env["NIX_SSHOPTS"] = " ".join(self.ssh_opts)
|
||||
proc = subprocess.run(cmd, text=True, env=env)
|
||||
return proc.returncode == 0
|
||||
|
||||
def rebuild_with_retry(self, name, ip):
|
||||
max_attempts = self.rebuild_retries + 1
|
||||
for attempt in range(1, max_attempts + 1):
|
||||
self.log(f"Rebuild attempt {attempt}/{max_attempts} for {name}")
|
||||
if self.rebuild_node_once(name, ip):
|
||||
return
|
||||
if attempt < max_attempts:
|
||||
self.log(f"Rebuild failed for {name}, retrying in 20s")
|
||||
time.sleep(20)
|
||||
raise RuntimeError(f"Rebuild failed permanently for {name}")
|
||||
|
||||
def stage_preflight(self):
|
||||
self.prepare_known_hosts()
|
||||
self.detect_user(self.primary_ip)
|
||||
|
||||
def stage_rebuild(self):
|
||||
if self.skip_rebuild:
|
||||
self.log("Node rebuild already complete")
|
||||
return
|
||||
|
||||
self.detect_user(self.primary_ip)
|
||||
for name in self.cp_names:
|
||||
ip = self.node_ips[name]
|
||||
self.log(f"Preparing and rebuilding {name} ({ip})")
|
||||
self.prepare_remote_nix(ip)
|
||||
self.prepare_remote_kubelet(ip)
|
||||
if self.fast_mode != "1":
|
||||
self.prepare_remote_space(ip)
|
||||
self.rebuild_with_retry(name, ip)
|
||||
|
||||
for name in self.wk_names:
|
||||
ip = self.node_ips[name]
|
||||
self.log(f"Preparing {name} ({ip})")
|
||||
self.prepare_remote_nix(ip)
|
||||
self.prepare_remote_kubelet(ip)
|
||||
if self.fast_mode != "1":
|
||||
self.prepare_remote_space(ip)
|
||||
|
||||
failures = []
|
||||
with ThreadPoolExecutor(max_workers=self.worker_parallelism) as pool:
|
||||
futures = {pool.submit(self.rebuild_with_retry, name, self.node_ips[name]): name for name in self.wk_names}
|
||||
for fut in as_completed(futures):
|
||||
name = futures[fut]
|
||||
try:
|
||||
fut.result()
|
||||
except Exception as exc:
|
||||
failures.append((name, str(exc)))
|
||||
if failures:
|
||||
raise RuntimeError(f"Worker rebuild failures: {failures}")
|
||||
|
||||
def has_admin_conf(self):
|
||||
return self.remote(self.primary_ip, "sudo test -f /etc/kubernetes/admin.conf", check=False).returncode == 0
|
||||
|
||||
def cluster_ready(self):
|
||||
cmd = "sudo test -f /etc/kubernetes/admin.conf && sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get --raw=/readyz >/dev/null 2>&1"
|
||||
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
|
||||
|
||||
def stage_init_primary(self):
|
||||
self.log(f"Initializing primary control plane on {self.primary_cp}")
|
||||
self.remote(self.primary_ip, "sudo th-kubeadm-init")
|
||||
|
||||
def stage_install_cni(self):
|
||||
self.log("Installing Flannel")
|
||||
manifest_path = self.script_dir.parent / "manifests" / "kube-flannel.yml"
|
||||
manifest_b64 = base64.b64encode(manifest_path.read_bytes()).decode()
|
||||
|
||||
self.remote(
|
||||
self.primary_ip,
|
||||
(
|
||||
"sudo mkdir -p /var/lib/terrahome && "
|
||||
f"echo {shlex.quote(manifest_b64)} | base64 -d | sudo tee /var/lib/terrahome/kube-flannel.yml >/dev/null"
|
||||
),
|
||||
)
|
||||
|
||||
self.log("Waiting for API readiness before applying Flannel")
|
||||
ready = False
|
||||
for _ in range(30):
|
||||
if self.cluster_ready():
|
||||
ready = True
|
||||
break
|
||||
time.sleep(10)
|
||||
if not ready:
|
||||
raise RuntimeError("API server did not become ready before Flannel install")
|
||||
|
||||
last_error = None
|
||||
for attempt in range(1, 6):
|
||||
proc = self.remote(
|
||||
self.primary_ip,
|
||||
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f /var/lib/terrahome/kube-flannel.yml",
|
||||
check=False,
|
||||
)
|
||||
if proc.returncode == 0:
|
||||
return
|
||||
last_error = (proc.stdout or "") + ("\n" if proc.stdout and proc.stderr else "") + (proc.stderr or "")
|
||||
self.log(f"Flannel apply attempt {attempt}/5 failed; retrying in 15s")
|
||||
time.sleep(15)
|
||||
|
||||
raise RuntimeError(f"Flannel apply failed after retries\n{last_error or ''}")
|
||||
|
||||
def cluster_has_node(self, name):
|
||||
cmd = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node {shlex.quote(name)} >/dev/null 2>&1"
|
||||
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
|
||||
|
||||
def build_join_cmds(self):
|
||||
join_cmd = self.remote(
|
||||
self.primary_ip,
|
||||
"sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm token create --print-join-command",
|
||||
).stdout.strip()
|
||||
cert_key = self.remote(
|
||||
self.primary_ip,
|
||||
"sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm init phase upload-certs --upload-certs | tail -n 1",
|
||||
).stdout.strip()
|
||||
cp_join = f"{join_cmd} --control-plane --certificate-key {cert_key}"
|
||||
return join_cmd, cp_join
|
||||
|
||||
def stage_join_control_planes(self):
|
||||
_, cp_join = self.build_join_cmds()
|
||||
for node in self.cp_names:
|
||||
if node == self.primary_cp:
|
||||
continue
|
||||
if self.cluster_has_node(node):
|
||||
self.log(f"{node} already joined")
|
||||
continue
|
||||
self.log(f"Joining control plane {node}")
|
||||
ip = self.node_ips[node]
|
||||
node_join = f"{cp_join} --node-name {node} --ignore-preflight-errors=NumCPU,HTTPProxyCIDR"
|
||||
self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")
|
||||
|
||||
def stage_join_workers(self):
|
||||
join_cmd, _ = self.build_join_cmds()
|
||||
for node in self.wk_names:
|
||||
if self.cluster_has_node(node):
|
||||
self.log(f"{node} already joined")
|
||||
continue
|
||||
self.log(f"Joining worker {node}")
|
||||
ip = self.node_ips[node]
|
||||
node_join = f"{join_cmd} --node-name {node} --ignore-preflight-errors=HTTPProxyCIDR"
|
||||
self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}")
|
||||
|
||||
def stage_verify(self):
|
||||
self.log("Final node verification")
|
||||
try:
|
||||
self.remote(
|
||||
self.primary_ip,
|
||||
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel rollout status ds/kube-flannel-ds --timeout=10m",
|
||||
)
|
||||
except Exception:
|
||||
self.log("Flannel rollout failed; collecting diagnostics")
|
||||
proc = self.remote(
|
||||
self.primary_ip,
|
||||
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get ds -o wide || true",
|
||||
check=False,
|
||||
)
|
||||
print(proc.stdout)
|
||||
proc = self.remote(
|
||||
self.primary_ip,
|
||||
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o wide || true",
|
||||
check=False,
|
||||
)
|
||||
print(proc.stdout)
|
||||
proc = self.remote(
|
||||
self.primary_ip,
|
||||
"for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do echo \"--- describe $p ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel describe $p || true; done",
|
||||
check=False,
|
||||
)
|
||||
print(proc.stdout)
|
||||
proc = self.remote(
|
||||
self.primary_ip,
|
||||
"for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do echo \"--- logs $p kube-flannel ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c kube-flannel --tail=120 || true; echo \"--- logs $p install-cni-plugin ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c install-cni-plugin --tail=120 || true; echo \"--- logs $p install-cni ---\"; sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs $p -c install-cni --tail=120 || true; done",
|
||||
check=False,
|
||||
)
|
||||
print(proc.stdout)
|
||||
proc = self.remote(
|
||||
self.primary_ip,
|
||||
"for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs --tail=120 $p || true; done",
|
||||
check=False,
|
||||
)
|
||||
print(proc.stdout)
|
||||
raise
|
||||
self.remote(
|
||||
self.primary_ip,
|
||||
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf wait --for=condition=Ready nodes --all --timeout=10m",
|
||||
)
|
||||
proc = self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes -o wide")
|
||||
print(proc.stdout)
|
||||
|
||||
def reconcile(self):
|
||||
self.stage_preflight()
|
||||
self.stage_rebuild()
|
||||
self.stage_init_primary()
|
||||
self.stage_install_cni()
|
||||
self.stage_join_control_planes()
|
||||
self.stage_join_workers()
|
||||
self.stage_verify()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="TerraHome kubeadm bootstrap controller")
|
||||
parser.add_argument("command", choices=[
|
||||
"reconcile",
|
||||
"preflight",
|
||||
"rebuild",
|
||||
"init-primary",
|
||||
"install-cni",
|
||||
"join-control-planes",
|
||||
"join-workers",
|
||||
"verify",
|
||||
])
|
||||
parser.add_argument("--inventory", default=str(Path(__file__).resolve().parent.parent / "scripts" / "inventory.env"))
|
||||
args = parser.parse_args()
|
||||
|
||||
cfg = load_inventory(args.inventory)
|
||||
ctl = Controller(cfg)
|
||||
|
||||
dispatch = {
|
||||
"reconcile": ctl.reconcile,
|
||||
"preflight": ctl.stage_preflight,
|
||||
"rebuild": ctl.stage_rebuild,
|
||||
"init-primary": ctl.stage_init_primary,
|
||||
"install-cni": ctl.stage_install_cni,
|
||||
"join-control-planes": ctl.stage_join_control_planes,
|
||||
"join-workers": ctl.stage_join_workers,
|
||||
"verify": ctl.stage_verify,
|
||||
}
|
||||
try:
|
||||
dispatch[args.command]()
|
||||
except Exception as exc:
|
||||
print(f"ERROR: {exc}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
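For reference, a minimal sketch of driving this controller by hand (paths assumed from the repo layout in this change; `scripts/rebuild-and-bootstrap.sh` further below wraps the same `reconcile` call):

```bash
# Full bring-up: preflight -> rebuild -> init-primary -> install-cni -> joins -> verify
python3 nixos/kubeadm/bootstrap/controller.py reconcile \
  --inventory nixos/kubeadm/scripts/inventory.env

# Re-run a single stage, e.g. only the final verification
python3 nixos/kubeadm/bootstrap/controller.py verify \
  --inventory nixos/kubeadm/scripts/inventory.env
```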
27
nixos/kubeadm/flake.lock
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"nodes": {
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1767313136,
|
||||
"narHash": "sha256-16KkgfdYqjaeRGBaYsNrhPRRENs0qzkQVUooNHtoy2w=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "ac62194c3917d5f474c1a844b6fd6da2db95077d",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.05",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
77
nixos/kubeadm/flake.nix
Normal file
@@ -0,0 +1,77 @@
|
||||
{
|
||||
description = "NixOS kubeadm cluster configs";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
|
||||
};
|
||||
|
||||
outputs = { nixpkgs, ... }:
|
||||
let
|
||||
system = "x86_64-linux";
|
||||
lib = nixpkgs.lib;
|
||||
pkgs = nixpkgs.legacyPackages.${system};
|
||||
nodeNames = [ "cp-1" "cp-2" "cp-3" "wk-1" "wk-2" "wk-3" ];
|
||||
|
||||
mkNode = {
|
||||
name,
|
||||
role,
|
||||
extraModules ? [ ],
|
||||
}:
|
||||
let
|
||||
roleModule = if role == "control-plane" then ./modules/k8s-control-plane.nix else ./modules/k8s-worker.nix;
|
||||
hardwarePath = ./hosts/hardware + "/${name}.nix";
|
||||
in
|
||||
nixpkgs.lib.nixosSystem {
|
||||
inherit system;
|
||||
modules = [
|
||||
./modules/k8s-cluster-settings.nix
|
||||
./modules/k8s-common.nix
|
||||
roleModule
|
||||
({ lib, ... }: {
|
||||
imports = lib.optional (builtins.pathExists hardwarePath) hardwarePath;
|
||||
networking.hostName = name;
|
||||
system.stateVersion = "25.05";
|
||||
boot.loader.grub.devices = lib.mkDefault [ "/dev/sda" ];
|
||||
fileSystems."/" = lib.mkDefault {
|
||||
device = "/dev/disk/by-label/nixos";
|
||||
fsType = "ext4";
|
||||
};
|
||||
})
|
||||
] ++ extraModules;
|
||||
};
|
||||
|
||||
mkNodeByName = name:
|
||||
mkNode {
|
||||
inherit name;
|
||||
role = if lib.hasPrefix "cp-" name then "control-plane" else "worker";
|
||||
};
|
||||
|
||||
mkEvalCheck = name:
|
||||
let
|
||||
cfg = mkNode {
|
||||
inherit name;
|
||||
role = if lib.hasPrefix "cp-" name then "control-plane" else "worker";
|
||||
extraModules = [
|
||||
({ lib, ... }: {
|
||||
boot.loader.grub.devices = lib.mkDefault [ "/dev/sda" ];
|
||||
fileSystems."/" = lib.mkDefault {
|
||||
device = "/dev/disk/by-label/nixos";
|
||||
fsType = "ext4";
|
||||
};
|
||||
})
|
||||
];
|
||||
};
|
||||
in
|
||||
pkgs.runCommand "eval-${name}" { } ''
|
||||
cat > "$out" <<'EOF'
|
||||
host=${cfg.config.networking.hostName}
|
||||
role=${if lib.hasPrefix "cp-" name then "control-plane" else "worker"}
|
||||
stateVersion=${cfg.config.system.stateVersion}
|
||||
EOF
|
||||
'';
|
||||
in {
|
||||
nixosConfigurations = lib.genAttrs nodeNames mkNodeByName;
|
||||
|
||||
checks.${system} = lib.genAttrs nodeNames mkEvalCheck;
|
||||
};
|
||||
}
|
||||
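A quick, hedged sketch of exercising this flake locally with standard Nix commands (the bootstrap controller's `rebuild` stage performs the actual node rebuilds):

```bash
# Evaluate the per-node checks defined above (checks.x86_64-linux.cp-1 ... wk-3)
nix flake check ./nixos/kubeadm

# Build one node's system closure without switching anything
nix build ./nixos/kubeadm#nixosConfigurations.cp-1.config.system.build.toplevel
```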
0
nixos/kubeadm/hosts/hardware/.gitkeep
Normal file
212
nixos/kubeadm/manifests/kube-flannel.yml
Normal file
@@ -0,0 +1,212 @@
|
||||
---
|
||||
kind: Namespace
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: kube-flannel
|
||||
labels:
|
||||
k8s-app: flannel
|
||||
pod-security.kubernetes.io/enforce: privileged
|
||||
---
|
||||
kind: ClusterRole
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: flannel
|
||||
name: flannel
|
||||
rules:
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- pods
|
||||
verbs:
|
||||
- get
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- nodes
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- nodes/status
|
||||
verbs:
|
||||
- patch
|
||||
---
|
||||
kind: ClusterRoleBinding
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: flannel
|
||||
name: flannel
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: flannel
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: flannel
|
||||
namespace: kube-flannel
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: flannel
|
||||
name: flannel
|
||||
namespace: kube-flannel
|
||||
---
|
||||
kind: ConfigMap
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: kube-flannel-cfg
|
||||
namespace: kube-flannel
|
||||
labels:
|
||||
tier: node
|
||||
k8s-app: flannel
|
||||
app: flannel
|
||||
data:
|
||||
cni-conf.json: |
|
||||
{
|
||||
"name": "cbr0",
|
||||
"cniVersion": "0.3.1",
|
||||
"plugins": [
|
||||
{
|
||||
"type": "flannel",
|
||||
"delegate": {
|
||||
"hairpinMode": true,
|
||||
"isDefaultGateway": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "portmap",
|
||||
"capabilities": {
|
||||
"portMappings": true
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
net-conf.json: |
|
||||
{
|
||||
"Network": "10.244.0.0/16",
|
||||
"EnableNFTables": false,
|
||||
"Backend": {
|
||||
"Type": "vxlan"
|
||||
}
|
||||
}
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: kube-flannel-ds
|
||||
namespace: kube-flannel
|
||||
labels:
|
||||
tier: node
|
||||
app: flannel
|
||||
k8s-app: flannel
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: flannel
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
tier: node
|
||||
app: flannel
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/os
|
||||
operator: In
|
||||
values:
|
||||
- linux
|
||||
hostNetwork: true
|
||||
priorityClassName: system-node-critical
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
effect: NoSchedule
|
||||
serviceAccountName: flannel
|
||||
initContainers:
|
||||
- name: install-cni-plugin
|
||||
image: docker.io/flannel/flannel-cni-plugin:v1.5.1-flannel1
|
||||
command:
|
||||
- cp
|
||||
args:
|
||||
- -f
|
||||
- /flannel
|
||||
- /opt/cni/bin/flannel
|
||||
volumeMounts:
|
||||
- name: cni-plugin
|
||||
mountPath: /opt/cni/bin
|
||||
- name: install-cni
|
||||
image: docker.io/flannel/flannel:v0.25.5
|
||||
command:
|
||||
- cp
|
||||
args:
|
||||
- -f
|
||||
- /etc/kube-flannel/cni-conf.json
|
||||
- /etc/cni/net.d/10-flannel.conflist
|
||||
volumeMounts:
|
||||
- name: cni
|
||||
mountPath: /etc/cni/net.d
|
||||
- name: flannel-cfg
|
||||
mountPath: /etc/kube-flannel/
|
||||
containers:
|
||||
- name: kube-flannel
|
||||
image: docker.io/flannel/flannel:v0.25.5
|
||||
command:
|
||||
- /opt/bin/flanneld
|
||||
args:
|
||||
- --ip-masq
|
||||
- --kube-subnet-mgr
|
||||
resources:
|
||||
requests:
|
||||
cpu: "100m"
|
||||
memory: "50Mi"
|
||||
securityContext:
|
||||
privileged: false
|
||||
capabilities:
|
||||
add: ["NET_ADMIN", "NET_RAW"]
|
||||
env:
|
||||
- name: POD_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: POD_NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.namespace
|
||||
- name: EVENT_QUEUE_DEPTH
|
||||
value: "5000"
|
||||
volumeMounts:
|
||||
- name: run
|
||||
mountPath: /run/flannel
|
||||
- name: flannel-cfg
|
||||
mountPath: /etc/kube-flannel/
|
||||
- name: xtables-lock
|
||||
mountPath: /run/xtables.lock
|
||||
volumes:
|
||||
- name: run
|
||||
hostPath:
|
||||
path: /run/flannel
|
||||
type: DirectoryOrCreate
|
||||
- name: cni-plugin
|
||||
hostPath:
|
||||
path: /opt/cni/bin
|
||||
type: DirectoryOrCreate
|
||||
- name: cni
|
||||
hostPath:
|
||||
path: /etc/cni/net.d
|
||||
type: DirectoryOrCreate
|
||||
- name: flannel-cfg
|
||||
configMap:
|
||||
name: kube-flannel-cfg
|
||||
- name: xtables-lock
|
||||
hostPath:
|
||||
path: /run/xtables.lock
|
||||
type: FileOrCreate
|
||||
12
nixos/kubeadm/modules/k8s-cluster-settings.nix
Normal file
@@ -0,0 +1,12 @@
|
||||
{ ... }:
|
||||
|
||||
{
|
||||
terrahome.kubeadm = {
|
||||
k8sMinor = "1.31";
|
||||
controlPlaneInterface = "eth0";
|
||||
controlPlaneVipSuffix = 250;
|
||||
podSubnet = "10.244.0.0/16";
|
||||
serviceSubnet = "10.96.0.0/12";
|
||||
clusterDomain = "cluster.local";
|
||||
};
|
||||
}
|
||||
420
nixos/kubeadm/modules/k8s-common.nix
Normal file
@@ -0,0 +1,420 @@
|
||||
{ config, lib, pkgs, ... }:
|
||||
|
||||
let
|
||||
pinnedK8s = lib.attrByPath [ "kubernetes_1_31" ] pkgs.kubernetes pkgs;
|
||||
kubeVipImage = "ghcr.io/kube-vip/kube-vip:v0.8.9";
|
||||
in
|
||||
{
|
||||
options.terrahome.kubeadm = {
|
||||
k8sMinor = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "1.31";
|
||||
};
|
||||
|
||||
controlPlaneInterface = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "eth0";
|
||||
};
|
||||
|
||||
controlPlaneVipSuffix = lib.mkOption {
|
||||
type = lib.types.int;
|
||||
default = 250;
|
||||
};
|
||||
|
||||
podSubnet = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "10.244.0.0/16";
|
||||
};
|
||||
|
||||
serviceSubnet = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "10.96.0.0/12";
|
||||
};
|
||||
|
||||
clusterDomain = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "cluster.local";
|
||||
};
|
||||
};
|
||||
|
||||
config = {
|
||||
boot.kernelModules = [ "overlay" "br_netfilter" ];
|
||||
|
||||
boot.kernel.sysctl = {
|
||||
"net.ipv4.ip_forward" = 1;
|
||||
"net.bridge.bridge-nf-call-iptables" = 1;
|
||||
"net.bridge.bridge-nf-call-ip6tables" = 1;
|
||||
};
|
||||
|
||||
virtualisation.containerd.enable = true;
|
||||
virtualisation.containerd.settings = {
|
||||
plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options.SystemdCgroup = true;
|
||||
};
|
||||
|
||||
swapDevices = lib.mkForce [ ];
|
||||
|
||||
services.openssh.enable = true;
|
||||
services.openssh.settings = {
|
||||
PasswordAuthentication = false;
|
||||
KbdInteractiveAuthentication = false;
|
||||
};
|
||||
|
||||
users.users.micqdf = {
|
||||
isNormalUser = true;
|
||||
extraGroups = [ "wheel" ];
|
||||
};
|
||||
|
||||
security.sudo.wheelNeedsPassword = false;
|
||||
|
||||
nix.settings.trusted-users = [ "root" "micqdf" ];
|
||||
nix.gc = {
|
||||
automatic = true;
|
||||
dates = "daily";
|
||||
options = "--delete-older-than 3d";
|
||||
};
|
||||
nix.settings.auto-optimise-store = true;
|
||||
|
||||
environment.variables = {
|
||||
KUBECONFIG = "/etc/kubernetes/admin.conf";
|
||||
KUBE_VIP_IMAGE = kubeVipImage;
|
||||
};
|
||||
|
||||
environment.systemPackages = (with pkgs; [
|
||||
containerd
|
||||
cri-tools
|
||||
cni-plugins
|
||||
pinnedK8s
|
||||
kubernetes-helm
|
||||
conntrack-tools
|
||||
socat
|
||||
ethtool
|
||||
ipvsadm
|
||||
iproute2
|
||||
iptables
|
||||
ebtables
|
||||
jq
|
||||
curl
|
||||
vim
|
||||
gawk
|
||||
]) ++ [
|
||||
(pkgs.writeShellScriptBin "th-kubeadm-init" ''
|
||||
set -euo pipefail
|
||||
|
||||
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
|
||||
|
||||
iface="${config.terrahome.kubeadm.controlPlaneInterface}"
|
||||
if ! ip link show "$iface" >/dev/null 2>&1; then
|
||||
iface="$(ip -o -4 route show to default | awk 'NR==1 {print $5}')"
|
||||
fi
|
||||
|
||||
if [ -z "''${iface:-}" ]; then
|
||||
echo "Could not determine network interface for kube-vip"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
suffix="${toString config.terrahome.kubeadm.controlPlaneVipSuffix}"
|
||||
pod_subnet="${config.terrahome.kubeadm.podSubnet}"
|
||||
service_subnet="${config.terrahome.kubeadm.serviceSubnet}"
|
||||
domain="${config.terrahome.kubeadm.clusterDomain}"
|
||||
node_name="${config.networking.hostName}"
|
||||
|
||||
local_ip_cidr=$(ip -4 -o addr show dev "$iface" | awk 'NR==1 {print $4}')
|
||||
if [ -z "''${local_ip_cidr:-}" ]; then
|
||||
echo "Could not determine IPv4 CIDR on interface $iface"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
subnet_prefix=$(echo "$local_ip_cidr" | cut -d/ -f1 | awk -F. '{print $1"."$2"."$3}')
|
||||
vip="$subnet_prefix.$suffix"
|
||||
|
||||
echo "Using control-plane endpoint: $vip:6443"
|
||||
echo "Using kube-vip interface: $iface"
|
||||
echo "Using kubeadm node name: $node_name"
|
||||
|
||||
hostname "$node_name" || true
|
||||
|
||||
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
||||
|
||||
systemctl unmask kubelet || true
|
||||
systemctl stop kubelet || true
|
||||
systemctl reset-failed kubelet || true
|
||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
|
||||
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
|
||||
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
|
||||
rm -rf /var/lib/kubelet/pki
|
||||
|
||||
systemctl daemon-reload
|
||||
systemctl unmask kubelet || true
|
||||
systemctl enable kubelet || true
|
||||
|
||||
echo "==> Ensuring containerd is running"
|
||||
systemctl start containerd || true
|
||||
sleep 2
|
||||
if ! systemctl is-active containerd; then
|
||||
echo "ERROR: containerd not running"
|
||||
journalctl -xeu containerd --no-pager -n 30
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p /etc/kubernetes/manifests
|
||||
mkdir -p /tmp/kubeadm
|
||||
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
|
||||
apiVersion: kubeadm.k8s.io/v1beta4
|
||||
kind: InitConfiguration
|
||||
nodeRegistration:
|
||||
name: "KUBEADM_NODE_NAME"
|
||||
criSocket: unix:///run/containerd/containerd.sock
|
||||
kubeletExtraArgs:
|
||||
- name: hostname-override
|
||||
value: "KUBEADM_NODE_NAME"
|
||||
---
|
||||
apiVersion: kubeadm.k8s.io/v1beta4
|
||||
kind: ClusterConfiguration
|
||||
controlPlaneEndpoint: "KUBEADM_ENDPOINT"
|
||||
networking:
|
||||
podSubnet: "KUBEADM_POD_SUBNET"
|
||||
serviceSubnet: "KUBEADM_SERVICE_SUBNET"
|
||||
dnsDomain: "KUBEADM_DNS_DOMAIN"
|
||||
KUBEADMCONFIG
|
||||
|
||||
sed -i "s|KUBEADM_ENDPOINT|$vip:6443|g" /tmp/kubeadm/init-config.yaml
|
||||
sed -i "s|KUBEADM_POD_SUBNET|$pod_subnet|g" /tmp/kubeadm/init-config.yaml
|
||||
sed -i "s|KUBEADM_SERVICE_SUBNET|$service_subnet|g" /tmp/kubeadm/init-config.yaml
|
||||
sed -i "s|KUBEADM_DNS_DOMAIN|$domain|g" /tmp/kubeadm/init-config.yaml
|
||||
sed -i "s|KUBEADM_NODE_NAME|$node_name|g" /tmp/kubeadm/init-config.yaml
|
||||
|
||||
echo "==> Pre-pulling kubeadm images"
|
||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
|
||||
|
||||
echo "==> Creating kube-vip static pod manifest"
|
||||
ctr image pull "${kubeVipImage}"
|
||||
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
|
||||
--log 4 \
|
||||
--interface "$iface" \
|
||||
--address "$vip" \
|
||||
--controlplane \
|
||||
--arp \
|
||||
> /etc/kubernetes/manifests/kube-vip.yaml
|
||||
|
||||
# kube-vip bootstrap workaround for Kubernetes >=1.29.
|
||||
# During early kubeadm phases, super-admin.conf is available before admin.conf is fully usable.
|
||||
sed -i 's#path: /etc/kubernetes/admin.conf#path: /etc/kubernetes/super-admin.conf#' /etc/kubernetes/manifests/kube-vip.yaml || true
|
||||
echo "==> kube-vip manifest kubeconfig mount"
|
||||
grep -E 'mountPath:|path:' /etc/kubernetes/manifests/kube-vip.yaml | grep -E 'kubernetes/(admin|super-admin)\.conf' || true
|
||||
|
||||
KUBEADM_INIT_LOG=/tmp/kubeadm-init.log
|
||||
if ! env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
|
||||
--config /tmp/kubeadm/init-config.yaml \
|
||||
--upload-certs \
|
||||
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 2>&1 | tee "$KUBEADM_INIT_LOG"; then
|
||||
if grep -q "error writing CRISocket for this node: nodes" "$KUBEADM_INIT_LOG" && [ -f /etc/kubernetes/admin.conf ]; then
|
||||
echo "==> kubeadm hit CRISocket race; waiting for node registration"
|
||||
echo "==> forcing kubelet restart to pick bootstrap flags"
|
||||
systemctl daemon-reload || true
|
||||
systemctl restart kubelet || true
|
||||
sleep 3
|
||||
echo "==> kubelet bootstrap flags"
|
||||
cat /var/lib/kubelet/kubeadm-flags.env || true
|
||||
registered=0
|
||||
for i in $(seq 1 60); do
|
||||
if KUBECONFIG=/etc/kubernetes/admin.conf kubectl get node "$node_name" >/dev/null 2>&1; then
|
||||
echo "==> node $node_name registered; uploading kubelet config"
|
||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init phase upload-config kubelet --config /tmp/kubeadm/init-config.yaml
|
||||
registered=1
|
||||
break
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
if [ "$registered" -ne 1 ]; then
|
||||
echo "==> node $node_name did not register after kubeadm init failure"
|
||||
KUBECONFIG=/etc/kubernetes/admin.conf kubectl get nodes -o wide || true
|
||||
echo "==> kubelet logs (registration hints)"
|
||||
journalctl -u kubelet --no-pager -n 120 | grep -Ei "register|node|bootstrap|certificate|forbidden|unauthorized|refused|x509" || true
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "==> kubeadm init failed, checking pod status:"
|
||||
crictl pods || true
|
||||
crictl ps -a || true
|
||||
echo "==> kube-vip containers:"
|
||||
crictl ps -a --name kube-vip || true
|
||||
echo "==> kube-vip logs:"
|
||||
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
|
||||
echo "--- kube-vip container $container_id ---"
|
||||
crictl logs "$container_id" 2>/dev/null || true
|
||||
crictl inspect "$container_id" 2>/dev/null | jq -r '.status | "exitCode=\(.exitCode) reason=\(.reason // "") message=\(.message // "")"' || true
|
||||
done
|
||||
echo "==> Checking if VIP is bound:"
|
||||
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
|
||||
echo "==> kubelet logs:"
|
||||
journalctl -xeu kubelet --no-pager -n 50
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "==> Waiting for kube-vip to claim VIP $vip"
|
||||
for i in $(seq 1 90); do
|
||||
if ip -4 addr show | grep -q "$vip"; then
|
||||
echo "==> VIP $vip is bound"
|
||||
break
|
||||
fi
|
||||
if [ "$i" -eq 90 ]; then
|
||||
echo "==> ERROR: VIP not bound after 3 minutes"
|
||||
crictl ps -a --name kube-vip || true
|
||||
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
|
||||
echo "--- kube-vip container $container_id ---"
|
||||
crictl logs "$container_id" 2>/dev/null || true
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
|
||||
echo "==> Waiting for API server to be ready"
|
||||
for i in $(seq 1 60); do
|
||||
if curl -sk "https://$vip:6443/healthz" 2>/dev/null | grep -q "ok"; then
|
||||
echo "==> API server is healthy"
|
||||
break
|
||||
fi
|
||||
if [ "$i" -eq 60 ]; then
|
||||
echo "==> ERROR: API server not healthy after 2 minutes"
|
||||
crictl pods || true
|
||||
crictl ps -a || true
|
||||
exit 1
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
|
||||
# Switch kube-vip to normal admin.conf after bootstrap finishes.
|
||||
sed -i 's#path: /etc/kubernetes/super-admin.conf#path: /etc/kubernetes/admin.conf#' /etc/kubernetes/manifests/kube-vip.yaml || true
|
||||
|
||||
mkdir -p /root/.kube
|
||||
cp /etc/kubernetes/admin.conf /root/.kube/config
|
||||
chmod 600 /root/.kube/config
|
||||
|
||||
echo
|
||||
echo "Next: install Cilium, then generate join commands:"
|
||||
echo " kubeadm token create --print-join-command"
|
||||
echo " kubeadm token create --print-join-command --certificate-key <key>"
|
||||
'')
|
||||
|
||||
(pkgs.writeShellScriptBin "th-kubeadm-join-control-plane" ''
|
||||
set -euo pipefail
|
||||
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
|
||||
if [ "$#" -lt 1 ]; then
|
||||
echo "Usage: th-kubeadm-join-control-plane '<kubeadm join ... --control-plane --certificate-key ...>'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
iface="${config.terrahome.kubeadm.controlPlaneInterface}"
|
||||
if ! ip link show "$iface" >/dev/null 2>&1; then
|
||||
iface="$(ip -o -4 route show to default | awk 'NR==1 {print $5}')"
|
||||
fi
|
||||
|
||||
if [ -z "''${iface:-}" ]; then
|
||||
echo "Could not determine network interface for kube-vip"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
suffix="${toString config.terrahome.kubeadm.controlPlaneVipSuffix}"
|
||||
local_ip_cidr=$(ip -4 -o addr show dev "$iface" | awk 'NR==1 {print $4}')
|
||||
if [ -z "''${local_ip_cidr:-}" ]; then
|
||||
echo "Could not determine IPv4 CIDR on interface $iface"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
subnet_prefix=$(echo "$local_ip_cidr" | cut -d/ -f1 | awk -F. '{print $1"."$2"."$3}')
|
||||
vip="$subnet_prefix.$suffix"
|
||||
|
||||
mkdir -p /etc/kubernetes/manifests
|
||||
ctr image pull "${kubeVipImage}"
|
||||
ctr run --rm --net-host "${kubeVipImage}" kube-vip /kube-vip manifest pod \
|
||||
--log 4 \
|
||||
--interface "$iface" \
|
||||
--address "$vip" \
|
||||
--controlplane \
|
||||
--arp \
|
||||
--leaderElection \
|
||||
> /etc/kubernetes/manifests/kube-vip.yaml
|
||||
|
||||
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
||||
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
|
||||
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
|
||||
rm -rf /var/lib/kubelet/pki
|
||||
|
||||
systemctl unmask kubelet || true
|
||||
systemctl stop kubelet || true
|
||||
systemctl enable kubelet || true
|
||||
systemctl reset-failed kubelet || true
|
||||
systemctl daemon-reload
|
||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
|
||||
eval "$1"
|
||||
'')
|
||||
|
||||
(pkgs.writeShellScriptBin "th-kubeadm-join-worker" ''
|
||||
set -euo pipefail
|
||||
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY no_proxy NO_PROXY
|
||||
if [ "$#" -lt 1 ]; then
|
||||
echo "Usage: th-kubeadm-join-worker '<kubeadm join ...>'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
||||
rm -f /etc/kubernetes/kubelet.conf /etc/kubernetes/bootstrap-kubelet.conf
|
||||
rm -f /var/lib/kubelet/kubeconfig /var/lib/kubelet/instance-config.yaml
|
||||
rm -rf /var/lib/kubelet/pki
|
||||
|
||||
systemctl unmask kubelet || true
|
||||
systemctl stop kubelet || true
|
||||
systemctl enable kubelet || true
|
||||
systemctl reset-failed kubelet || true
|
||||
systemctl daemon-reload
|
||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm reset -f || true
|
||||
eval "$1"
|
||||
'')
|
||||
|
||||
(pkgs.writeShellScriptBin "th-kubeadm-status" ''
|
||||
set -euo pipefail
|
||||
systemctl is-active containerd || true
|
||||
systemctl is-active kubelet || true
|
||||
crictl info >/dev/null && echo "crictl: ok" || echo "crictl: not-ready"
|
||||
'')
|
||||
];
|
||||
|
||||
systemd.services.kubelet = {
|
||||
description = "Kubernetes Kubelet";
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
path = [ pkgs.util-linux ];
|
||||
wants = [ "network-online.target" ];
|
||||
after = [ "containerd.service" "network-online.target" ];
|
||||
serviceConfig = {
|
||||
Environment = [
|
||||
"KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
|
||||
"KUBELET_KUBEADM_ARGS="
|
||||
"KUBELET_EXTRA_ARGS="
|
||||
];
|
||||
EnvironmentFile = [
|
||||
"-/var/lib/kubelet/kubeadm-flags.env"
|
||||
"-/etc/default/kubelet"
|
||||
];
|
||||
ExecStart = "${pinnedK8s}/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf \$KUBELET_CONFIG_ARGS \$KUBELET_KUBEADM_ARGS \$KUBELET_EXTRA_ARGS";
|
||||
Restart = "on-failure";
|
||||
RestartSec = "10";
|
||||
};
|
||||
unitConfig = {
|
||||
ConditionPathExists = "/var/lib/kubelet/config.yaml";
|
||||
ConditionPathExistsGlob = "/etc/kubernetes/*kubelet.conf";
|
||||
};
|
||||
};
|
||||
|
||||
systemd.tmpfiles.rules = [
|
||||
"d /etc/kubernetes 0755 root root -"
|
||||
"d /etc/kubernetes/manifests 0755 root root -"
|
||||
"d /etc/cni/net.d 0755 root root -"
|
||||
"d /opt/cni/bin 0755 root root -"
|
||||
"d /run/flannel 0755 root root -"
|
||||
"d /var/lib/kubelet 0755 root root -"
|
||||
"d /var/lib/kubelet/pki 0755 root root -"
|
||||
];
|
||||
};
|
||||
}
|
||||
14
nixos/kubeadm/modules/k8s-control-plane.nix
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
networking.firewall.allowedTCPPorts = [
|
||||
6443
|
||||
2379
|
||||
2380
|
||||
10250
|
||||
10257
|
||||
10259
|
||||
];
|
||||
|
||||
networking.firewall.allowedUDPPorts = [
|
||||
8472
|
||||
];
|
||||
}
|
||||
11
nixos/kubeadm/modules/k8s-worker.nix
Normal file
@@ -0,0 +1,11 @@
|
||||
{
|
||||
networking.firewall.allowedTCPPorts = [
10250
];

# NodePort services use the 30000-32767 range; open it as a range rather than two individual ports.
networking.firewall.allowedTCPPortRanges = [
{ from = 30000; to = 32767; }
];
|
||||
|
||||
networking.firewall.allowedUDPPorts = [
|
||||
8472
|
||||
];
|
||||
}
|
||||
182
nixos/kubeadm/scripts/discover-inventory-from-ssh.py
Executable file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import concurrent.futures
|
||||
import ipaddress
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import Dict, Set, Tuple
|
||||
|
||||
|
||||
def derive_prefix(payload: dict) -> str:
|
||||
explicit = os.environ.get("KUBEADM_SUBNET_PREFIX", "").strip()
|
||||
if explicit:
|
||||
return explicit
|
||||
|
||||
for key in ("control_plane_vm_ipv4", "worker_vm_ipv4"):
|
||||
values = payload.get(key, {}).get("value", {})
|
||||
for ip in values.values():
|
||||
if ip:
|
||||
parts = ip.split(".")
|
||||
if len(parts) == 4:
|
||||
return ".".join(parts[:3])
|
||||
|
||||
return "10.27.27"
|
||||
|
||||
|
||||
def ssh_probe(ip: str, users: list[str], key_path: str, timeout_sec: int) -> Tuple[str, str, str] | None:
|
||||
cmd_tail = [
|
||||
"-o",
|
||||
"BatchMode=yes",
|
||||
"-o",
|
||||
"IdentitiesOnly=yes",
|
||||
"-o",
|
||||
"StrictHostKeyChecking=accept-new",
|
||||
"-o",
|
||||
f"ConnectTimeout={timeout_sec}",
|
||||
"-i",
|
||||
key_path,
|
||||
]
|
||||
for user in users:
|
||||
cmd = [
|
||||
"ssh",
|
||||
*cmd_tail,
|
||||
f"{user}@{ip}",
|
||||
"hn=$(hostnamectl --static 2>/dev/null || hostname); serial=$(cat /sys/class/dmi/id/product_serial 2>/dev/null || true); printf '%s|%s\n' \"$hn\" \"$serial\"",
|
||||
]
|
||||
try:
|
||||
out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True, timeout=timeout_sec + 2).strip()
|
||||
except Exception:
|
||||
continue
|
||||
if out:
|
||||
line = out.splitlines()[0].strip()
|
||||
if "|" in line:
|
||||
host, serial = line.split("|", 1)
|
||||
else:
|
||||
host, serial = line, ""
|
||||
return host.strip(), ip, serial.strip()
|
||||
return None
|
||||
|
||||
|
||||
def build_inventory(names: Set[str], found: Dict[str, str], ssh_user: str) -> str:
|
||||
cp = sorted([n for n in names if n.startswith("cp-")], key=lambda x: int(x.split("-")[1]))
|
||||
wk = sorted([n for n in names if n.startswith("wk-")], key=lambda x: int(x.split("-")[1]))
|
||||
|
||||
cp_pairs = " ".join(f"{n}={found[n]}" for n in cp)
|
||||
wk_pairs = " ".join(f"{n}={found[n]}" for n in wk)
|
||||
primary = cp[0] if cp else "cp-1"
|
||||
|
||||
return "\n".join(
|
||||
[
|
||||
f"SSH_USER={ssh_user}",
|
||||
f"PRIMARY_CONTROL_PLANE={primary}",
|
||||
f'CONTROL_PLANES="{cp_pairs}"',
|
||||
f'WORKERS="{wk_pairs}"',
|
||||
"",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
payload = json.load(sys.stdin)
|
||||
|
||||
cp_names = set(payload.get("control_plane_vm_ids", {}).get("value", {}).keys())
|
||||
wk_names = set(payload.get("worker_vm_ids", {}).get("value", {}).keys())
|
||||
target_names = cp_names | wk_names
|
||||
if not target_names:
|
||||
raise SystemExit("Could not determine target node names from Terraform outputs")
|
||||
|
||||
ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
|
||||
users = [u for u in os.environ.get("SSH_USER_CANDIDATES", f"{ssh_user} root").split() if u]
|
||||
key_path = os.environ.get("SSH_KEY_PATH", os.path.expanduser("~/.ssh/id_ed25519"))
|
||||
timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "6"))
|
||||
max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "32"))
|
||||
|
||||
prefix = derive_prefix(payload)
|
||||
start = int(os.environ.get("KUBEADM_SUBNET_START", "2"))
|
||||
end = int(os.environ.get("KUBEADM_SUBNET_END", "254"))
|
||||
vip_suffix = int(os.environ.get("KUBEADM_CONTROL_PLANE_VIP_SUFFIX", "250"))
|
||||
|
||||
def is_vip_ip(ip: str) -> bool:
|
||||
try:
|
||||
return int(ip.split(".")[-1]) == vip_suffix
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
scan_ips = [
|
||||
str(ipaddress.IPv4Address(f"{prefix}.{i}"))
|
||||
for i in range(start, end + 1)
|
||||
if i != vip_suffix
|
||||
]
|
||||
found: Dict[str, str] = {}
|
||||
vmid_to_name: Dict[str, str] = {}
|
||||
for name, vmid in payload.get("control_plane_vm_ids", {}).get("value", {}).items():
|
||||
vmid_to_name[str(vmid)] = name
|
||||
for name, vmid in payload.get("worker_vm_ids", {}).get("value", {}).items():
|
||||
vmid_to_name[str(vmid)] = name
|
||||
|
||||
seen_hostnames: Dict[str, str] = {}
|
||||
seen_ips: Dict[str, Tuple[str, str]] = {}
|
||||
|
||||
def run_pass(pass_timeout: int, pass_workers: int) -> None:
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=pass_workers) as pool:
|
||||
futures = [pool.submit(ssh_probe, ip, users, key_path, pass_timeout) for ip in scan_ips]
|
||||
for fut in concurrent.futures.as_completed(futures):
|
||||
result = fut.result()
|
||||
if not result:
|
||||
continue
|
||||
host, ip, serial = result
|
||||
if host not in seen_hostnames:
|
||||
seen_hostnames[host] = ip
|
||||
if ip not in seen_ips:
|
||||
seen_ips[ip] = (host, serial)
|
||||
target = None
|
||||
if serial in vmid_to_name:
|
||||
inferred = vmid_to_name[serial]
|
||||
target = inferred
|
||||
elif host in target_names:
|
||||
target = host
|
||||
|
||||
if target:
|
||||
existing = found.get(target)
|
||||
if existing is None or (is_vip_ip(existing) and not is_vip_ip(ip)):
|
||||
found[target] = ip
|
||||
if all(name in found for name in target_names):
|
||||
return
|
||||
|
||||
run_pass(timeout_sec, max_workers)
|
||||
if not all(name in found for name in target_names):
|
||||
# Slower second pass for busy runners/networks.
|
||||
run_pass(max(timeout_sec + 2, 8), max(8, max_workers // 2))
|
||||
|
||||
# Heuristic fallback: if nodes still missing, assign from remaining SSH-reachable
|
||||
# IPs not already used, ordered by IP. This helps when cloned nodes temporarily
|
||||
# share a generic hostname (e.g. "flex") and DMI serial mapping is unavailable.
|
||||
missing = sorted([n for n in target_names if n not in found])
|
||||
if missing:
|
||||
used_ips = set(found.values())
|
||||
candidates = sorted(ip for ip in seen_ips.keys() if ip not in used_ips)
|
||||
if len(candidates) >= len(missing):
|
||||
for name, ip in zip(missing, candidates):
|
||||
found[name] = ip
|
||||
|
||||
missing = sorted([n for n in target_names if n not in found])
|
||||
if missing:
|
||||
discovered = ", ".join(sorted(seen_hostnames.keys())[:20])
|
||||
if discovered:
|
||||
sys.stderr.write(f"Discovered hostnames during scan: {discovered}\n")
|
||||
if seen_ips:
|
||||
sample = ", ".join(f"{ip}={meta[0]}" for ip, meta in list(sorted(seen_ips.items()))[:20])
|
||||
sys.stderr.write(f"SSH-reachable IPs: {sample}\n")
|
||||
raise SystemExit(
|
||||
"Failed SSH-based IP discovery for nodes: " + ", ".join(missing) +
|
||||
f" (scanned {prefix}.{start}-{prefix}.{end})"
|
||||
)
|
||||
|
||||
sys.stdout.write(build_inventory(target_names, found, ssh_user))
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
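A hedged usage sketch for this discovery helper: it only needs Terraform's JSON output on stdin and writes an `inventory.env` to stdout (the relative paths are assumptions based on the repo layout in this change):

```bash
cd terraform
terraform output -json \
  | ../nixos/kubeadm/scripts/discover-inventory-from-ssh.py \
  > ../nixos/kubeadm/scripts/inventory.env
```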
7
nixos/kubeadm/scripts/inventory.example.env
Normal file
@@ -0,0 +1,7 @@
|
||||
SSH_USER=micqdf
|
||||
PRIMARY_CONTROL_PLANE=cp-1
|
||||
|
||||
# Name=IP pairs (space-separated)
|
||||
CONTROL_PLANES="cp-1=192.168.1.101 cp-2=192.168.1.102 cp-3=192.168.1.103"
|
||||
|
||||
WORKERS="wk-1=192.168.1.111 wk-2=192.168.1.112 wk-3=192.168.1.113"
|
||||
14
nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
|
||||
CONTROLLER="$SCRIPT_DIR/../bootstrap/controller.py"
|
||||
|
||||
if [ ! -f "$INVENTORY_FILE" ]; then
|
||||
echo "Missing inventory file: $INVENTORY_FILE"
|
||||
echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 "$CONTROLLER" reconcile --inventory "$INVENTORY_FILE"
|
||||
65
nixos/kubeadm/scripts/render-inventory-from-tf-output.py
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
def natural_key(name: str):
|
||||
m = re.match(r"^([a-zA-Z-]+)-(\d+)$", name)
|
||||
if m:
|
||||
return (m.group(1), int(m.group(2)))
|
||||
return (name, 0)
|
||||
|
||||
|
||||
def map_to_pairs(items: dict[str, str]) -> str:
|
||||
ordered = sorted(items.items(), key=lambda kv: natural_key(kv[0]))
|
||||
return " ".join(f"{k}={v}" for k, v in ordered)
|
||||
|
||||
|
||||
def require_non_empty_ips(label: str, items: dict[str, str]) -> dict[str, str]:
|
||||
cleaned: dict[str, str] = {}
|
||||
missing: list[str] = []
|
||||
|
||||
for name, ip in items.items():
|
||||
ip_value = (ip or "").strip()
|
||||
if not ip_value:
|
||||
missing.append(name)
|
||||
continue
|
||||
cleaned[name] = ip_value
|
||||
|
||||
if missing:
|
||||
names = ", ".join(sorted(missing, key=natural_key))
|
||||
raise SystemExit(
|
||||
f"Missing IPv4 addresses for {label}: {names}. "
|
||||
"Terraform outputs are present but empty. "
|
||||
"This usually means Proxmox guest IP discovery is unavailable for these VMs yet."
|
||||
)
|
||||
|
||||
return cleaned
|
||||
|
||||
|
||||
def main() -> int:
|
||||
payload = json.load(sys.stdin)
|
||||
|
||||
cp_map = payload.get("control_plane_vm_ipv4", {}).get("value", {})
|
||||
wk_map = payload.get("worker_vm_ipv4", {}).get("value", {})
|
||||
|
||||
if not cp_map or not wk_map:
|
||||
raise SystemExit("Missing control_plane_vm_ipv4 or worker_vm_ipv4 in terraform output")
|
||||
|
||||
cp_map = require_non_empty_ips("control planes", cp_map)
|
||||
wk_map = require_non_empty_ips("workers", wk_map)
|
||||
|
||||
ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
|
||||
|
||||
print(f"SSH_USER={ssh_user}")
|
||||
print("PRIMARY_CONTROL_PLANE=cp-1")
|
||||
print(f"CONTROL_PLANES=\"{map_to_pairs(cp_map)}\"")
|
||||
print(f"WORKERS=\"{map_to_pairs(wk_map)}\"")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
106
nixos/kubeadm/scripts/reset-cluster-nodes.sh
Executable file
@@ -0,0 +1,106 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
|
||||
|
||||
if [ ! -f "$INVENTORY_FILE" ]; then
|
||||
echo "Missing inventory file: $INVENTORY_FILE"
|
||||
echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# shellcheck disable=SC1090
|
||||
source "$INVENTORY_FILE"
|
||||
|
||||
SSH_USER="${SSH_USER:-micqdf}"
|
||||
SSH_KEY_PATH="${SSH_KEY_PATH:-$HOME/.ssh/id_ed25519}"
|
||||
SSH_OPTS="${SSH_OPTS:--o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
|
||||
SSH_USER_CANDIDATES="${SSH_USER_CANDIDATES:-root $SSH_USER}"
|
||||
|
||||
declare -A NODE_IPS=()
|
||||
|
||||
add_pair() {
|
||||
local pair="$1"
|
||||
local name="${pair%%=*}"
|
||||
local ip="${pair#*=}"
|
||||
|
||||
if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then
|
||||
echo "Invalid node pair '$pair' (expected name=ip)."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
NODE_IPS["$name"]="$ip"
|
||||
}
|
||||
|
||||
if [ -n "${CONTROL_PLANES:-}" ]; then
|
||||
for pair in $CONTROL_PLANES; do
|
||||
add_pair "$pair"
|
||||
done
|
||||
else
|
||||
while IFS= read -r var_name; do
|
||||
idx="${var_name#CP_}"
|
||||
add_pair "cp-$idx=${!var_name}"
|
||||
done < <(compgen -A variable | grep -E '^CP_[0-9]+$' | sort -V)
|
||||
fi
|
||||
|
||||
if [ -n "${WORKERS:-}" ]; then
|
||||
for pair in $WORKERS; do
|
||||
add_pair "$pair"
|
||||
done
|
||||
else
|
||||
while IFS= read -r var_name; do
|
||||
idx="${var_name#WK_}"
|
||||
add_pair "wk-$idx=${!var_name}"
|
||||
done < <(compgen -A variable | grep -E '^WK_[0-9]+$' | sort -V)
|
||||
fi
|
||||
|
||||
if [ "${#NODE_IPS[@]}" -eq 0 ]; then
|
||||
echo "No nodes found in inventory."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
detect_ssh_user() {
|
||||
local probe_ip="$1"
|
||||
local candidate
|
||||
|
||||
for candidate in $SSH_USER_CANDIDATES; do
|
||||
if ssh $SSH_OPTS "$candidate@$probe_ip" "true" >/dev/null 2>&1; then
|
||||
ACTIVE_SSH_USER="$candidate"
|
||||
echo "==> Using SSH user '$ACTIVE_SSH_USER'"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "Unable to authenticate to $probe_ip with candidates: $SSH_USER_CANDIDATES"
|
||||
return 1
|
||||
}
|
||||
|
||||
mkdir -p "$HOME/.ssh"
|
||||
chmod 700 "$HOME/.ssh"
|
||||
touch "$HOME/.ssh/known_hosts"
|
||||
chmod 600 "$HOME/.ssh/known_hosts"
|
||||
for node_name in "${!NODE_IPS[@]}"; do
|
||||
ssh-keygen -R "${NODE_IPS[$node_name]}" >/dev/null 2>&1 || true
|
||||
ssh-keyscan -H "${NODE_IPS[$node_name]}" >> "$HOME/.ssh/known_hosts" 2>/dev/null || true
|
||||
done
|
||||
|
||||
reset_node() {
|
||||
local node_name="$1"
|
||||
local node_ip="$2"
|
||||
echo "==> Resetting $node_name ($node_ip)"
|
||||
local cmd="sudo kubeadm reset -f && sudo systemctl stop kubelet && sudo rm -rf /etc/kubernetes /var/lib/etcd /var/lib/cni /etc/cni/net.d"
|
||||
local quoted_cmd
|
||||
quoted_cmd="$(printf '%q' "$cmd")"
|
||||
ssh $SSH_OPTS "$ACTIVE_SSH_USER@$node_ip" "bash -lc $quoted_cmd"
|
||||
}
|
||||
|
||||
FIRST_NODE_IP="${NODE_IPS[$(printf '%s\n' "${!NODE_IPS[@]}" | sort -V | head -n1)]}"
|
||||
ACTIVE_SSH_USER="$SSH_USER"
|
||||
detect_ssh_user "$FIRST_NODE_IP"
|
||||
|
||||
while IFS= read -r node_name; do
|
||||
reset_node "$node_name" "${NODE_IPS[$node_name]}"
|
||||
done < <(printf '%s\n' "${!NODE_IPS[@]}" | sort -V)
|
||||
|
||||
echo "Cluster components reset on all listed nodes."
|
||||
27
nixos/template-base/README.md
Normal file
@@ -0,0 +1,27 @@
|
||||
# NixOS Proxmox k8s-base Template
|
||||
|
||||
This folder contains a Kubernetes-ready NixOS base config for your Proxmox
|
||||
template VM build.
|
||||
|
||||
## Files
|
||||
|
||||
- `flake.nix`: pins `nixos-25.05` and exposes one host config.
|
||||
- `configuration.nix`: k8s-base settings for Proxmox guests.
|
||||
|
||||
## Before first apply
|
||||
|
||||
1. Add `hardware-configuration.nix` from the VM install:
|
||||
- run `nixos-generate-config --root /`
- copy the generated `/etc/nixos/hardware-configuration.nix` next to `configuration.nix`
|
||||
|
||||
## Build/apply example inside the VM
|
||||
|
||||
```bash
|
||||
sudo nixos-rebuild switch --flake .#template
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- This pre-installs heavy shared Kubernetes dependencies (containerd + kube tools)
|
||||
to reduce per-node bootstrap time.
|
||||
- Cloud-init still injects the runtime SSH key and per-node hostname/IP.
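A quick, hedged way to confirm that injection on a freshly cloned VM (node IP and user are assumptions; `hostnamectl --static` is the same check the SSH discovery script uses):

```bash
ssh micqdf@<node-ip> \
  'cloud-init status --wait && hostnamectl --static && tail -n1 ~/.ssh/authorized_keys'
```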
|
||||
99
nixos/template-base/configuration.nix
Normal file
@@ -0,0 +1,99 @@
|
||||
{ lib, pkgs, ... }:
|
||||
|
||||
let
|
||||
pinnedK8s = lib.attrByPath [ "kubernetes_1_31" ] pkgs.kubernetes pkgs;
|
||||
in
|
||||
|
||||
{
|
||||
imports =
|
||||
lib.optional (builtins.pathExists ./hardware-configuration.nix)
|
||||
./hardware-configuration.nix;
|
||||
|
||||
networking.hostName = "k8s-base-template";
|
||||
networking.useDHCP = false;
|
||||
networking.useNetworkd = true;
|
||||
networking.nameservers = [ "1.1.1.1" "8.8.8.8" ];
|
||||
|
||||
boot.loader.systemd-boot.enable = lib.mkForce false;
|
||||
boot.loader.grub = {
|
||||
enable = true;
|
||||
device = "/dev/sda";
|
||||
};
|
||||
|
||||
services.qemuGuest.enable = true;
|
||||
services.cloud-init.enable = true;
|
||||
services.cloud-init.network.enable = true;
|
||||
services.openssh.enable = true;
|
||||
services.openssh.settings = {
|
||||
PasswordAuthentication = false;
|
||||
KbdInteractiveAuthentication = false;
|
||||
PermitRootLogin = "prohibit-password";
|
||||
};
|
||||
|
||||
boot.kernelModules = [ "overlay" "br_netfilter" ];
|
||||
boot.kernel.sysctl = {
|
||||
"net.ipv4.ip_forward" = 1;
|
||||
"net.bridge.bridge-nf-call-iptables" = 1;
|
||||
"net.bridge.bridge-nf-call-ip6tables" = 1;
|
||||
};
|
||||
|
||||
virtualisation.containerd.enable = true;
|
||||
virtualisation.containerd.settings = {
|
||||
plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options.SystemdCgroup = true;
|
||||
};
|
||||
|
||||
swapDevices = lib.mkForce [ ];
|
||||
|
||||
nix.settings = {
|
||||
trusted-users = [ "root" "micqdf" ];
|
||||
auto-optimise-store = true;
|
||||
};
|
||||
|
||||
nix.gc = {
|
||||
automatic = true;
|
||||
dates = "daily";
|
||||
options = "--delete-older-than 3d";
|
||||
};
|
||||
|
||||
programs.fish.enable = true;
|
||||
|
||||
users.users.micqdf = {
|
||||
isNormalUser = true;
|
||||
extraGroups = [ "wheel" ];
|
||||
shell = pkgs.fish;
|
||||
};
|
||||
|
||||
security.sudo.wheelNeedsPassword = false;
|
||||
|
||||
environment.systemPackages = with pkgs; [
|
||||
btop
|
||||
cni-plugins
|
||||
conntrack-tools
|
||||
containerd
|
||||
cri-tools
|
||||
curl
|
||||
dig
|
||||
ebtables
|
||||
ethtool
|
||||
eza
|
||||
fd
|
||||
fzf
|
||||
git
|
||||
htop
|
||||
iproute2
|
||||
iptables
|
||||
ipvsadm
|
||||
jq
|
||||
kubernetes-helm
|
||||
pinnedK8s
|
||||
ripgrep
|
||||
socat
|
||||
tree
|
||||
unzip
|
||||
vim
|
||||
neovim
|
||||
wget
|
||||
];
|
||||
|
||||
system.stateVersion = "25.05";
|
||||
}
|
||||
27
nixos/template-base/flake.lock
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"nodes": {
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1767313136,
|
||||
"narHash": "sha256-16KkgfdYqjaeRGBaYsNrhPRRENs0qzkQVUooNHtoy2w=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "ac62194c3917d5f474c1a844b6fd6da2db95077d",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.05",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
14
nixos/template-base/flake.nix
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
description = "Kubernetes-ready NixOS base template";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
|
||||
};
|
||||
|
||||
outputs = { nixpkgs, ... }: {
|
||||
nixosConfigurations.template = nixpkgs.lib.nixosSystem {
|
||||
system = "x86_64-linux";
|
||||
modules = [ ./configuration.nix ];
|
||||
};
|
||||
};
|
||||
}
|
||||
89
terraform/.terraform.lock.hcl
generated
@@ -1,79 +1,24 @@
|
||||
# This file is maintained automatically by "terraform init".
|
||||
# Manual edits may be lost in future updates.
|
||||
|
||||
provider "registry.terraform.io/hashicorp/local" {
|
||||
version = "2.5.2"
|
||||
hashes = [
|
||||
"h1:JlMZD6nYqJ8sSrFfEAH0Vk/SL8WLZRmFaMUF9PJK5wM=",
|
||||
"zh:136299545178ce281c56f36965bf91c35407c11897f7082b3b983d86cb79b511",
|
||||
"zh:3b4486858aa9cb8163378722b642c57c529b6c64bfbfc9461d940a84cd66ebea",
|
||||
"zh:4855ee628ead847741aa4f4fc9bed50cfdbf197f2912775dd9fe7bc43fa077c0",
|
||||
"zh:4b8cd2583d1edcac4011caafe8afb7a95e8110a607a1d5fb87d921178074a69b",
|
||||
"zh:52084ddaff8c8cd3f9e7bcb7ce4dc1eab00602912c96da43c29b4762dc376038",
|
||||
"zh:71562d330d3f92d79b2952ffdda0dad167e952e46200c767dd30c6af8d7c0ed3",
|
||||
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
|
||||
"zh:805f81ade06ff68fa8b908d31892eaed5c180ae031c77ad35f82cb7a74b97cf4",
|
||||
"zh:8b6b3ebeaaa8e38dd04e56996abe80db9be6f4c1df75ac3cccc77642899bd464",
|
||||
"zh:ad07750576b99248037b897de71113cc19b1a8d0bc235eb99173cc83d0de3b1b",
|
||||
"zh:b9f1c3bfadb74068f5c205292badb0661e17ac05eb23bfe8bd809691e4583d0e",
|
||||
"zh:cc4cbcd67414fefb111c1bf7ab0bc4beb8c0b553d01719ad17de9a047adff4d1",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/hashicorp/null" {
|
||||
version = "3.2.3"
|
||||
hashes = [
|
||||
"h1:+AnORRgFbRO6qqcfaQyeX80W0eX3VmjadjnUFUJTiXo=",
|
||||
"zh:22d062e5278d872fe7aed834f5577ba0a5afe34a3bdac2b81f828d8d3e6706d2",
|
||||
"zh:23dead00493ad863729495dc212fd6c29b8293e707b055ce5ba21ee453ce552d",
|
||||
"zh:28299accf21763ca1ca144d8f660688d7c2ad0b105b7202554ca60b02a3856d3",
|
||||
"zh:55c9e8a9ac25a7652df8c51a8a9a422bd67d784061b1de2dc9fe6c3cb4e77f2f",
|
||||
"zh:756586535d11698a216291c06b9ed8a5cc6a4ec43eee1ee09ecd5c6a9e297ac1",
|
||||
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
|
||||
"zh:9d5eea62fdb587eeb96a8c4d782459f4e6b73baeece4d04b4a40e44faaee9301",
|
||||
"zh:a6355f596a3fb8fc85c2fb054ab14e722991533f87f928e7169a486462c74670",
|
||||
"zh:b5a65a789cff4ada58a5baffc76cb9767dc26ec6b45c00d2ec8b1b027f6db4ed",
|
||||
"zh:db5ab669cf11d0e9f81dc380a6fdfcac437aea3d69109c7aef1a5426639d2d65",
|
||||
"zh:de655d251c470197bcbb5ac45d289595295acb8f829f6c781d4a75c8c8b7c7dd",
|
||||
"zh:f5c68199f2e6076bce92a12230434782bf768103a427e9bb9abee99b116af7b5",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/hashicorp/template" {
|
||||
version = "2.2.0"
|
||||
hashes = [
|
||||
"h1:94qn780bi1qjrbC3uQtjJh3Wkfwd5+tTtJHOb7KTg9w=",
|
||||
"zh:01702196f0a0492ec07917db7aaa595843d8f171dc195f4c988d2ffca2a06386",
|
||||
"zh:09aae3da826ba3d7df69efeb25d146a1de0d03e951d35019a0f80e4f58c89b53",
|
||||
"zh:09ba83c0625b6fe0a954da6fbd0c355ac0b7f07f86c91a2a97849140fea49603",
|
||||
"zh:0e3a6c8e16f17f19010accd0844187d524580d9fdb0731f675ffcf4afba03d16",
|
||||
"zh:45f2c594b6f2f34ea663704cc72048b212fe7d16fb4cfd959365fa997228a776",
|
||||
"zh:77ea3e5a0446784d77114b5e851c970a3dde1e08fa6de38210b8385d7605d451",
|
||||
"zh:8a154388f3708e3df5a69122a23bdfaf760a523788a5081976b3d5616f7d30ae",
|
||||
"zh:992843002f2db5a11e626b3fc23dc0c87ad3729b3b3cff08e32ffb3df97edbde",
|
||||
"zh:ad906f4cebd3ec5e43d5cd6dc8f4c5c9cc3b33d2243c89c5fc18f97f7277b51d",
|
||||
"zh:c979425ddb256511137ecd093e23283234da0154b7fa8b21c2687182d9aea8b2",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/telmate/proxmox" {
|
||||
version = "3.0.1-rc8"
|
||||
constraints = "3.0.1-rc8"
|
||||
version = "3.0.2-rc07"
|
||||
constraints = "3.0.2-rc07"
|
||||
hashes = [
|
||||
"h1:W5X4T5AZUaqO++aAequNECUKJaXLC5upcws6Vp7mkBk=",
|
||||
"zh:0272f1600251abf9b139c2683f83cde0a907ac762f5ead058b84de18ddc1d78e",
|
||||
"zh:328e708a8063a133516612b17c8983a9372fa42766530925d1d37aeb1daa30ec",
|
||||
"zh:3449150e4d57f79af6f9583e93e3a5ab84fb475bc594de75b968534f57af2871",
|
||||
"zh:58d803a0203241214f673c80350d43ce1a5ce57b21b83ba08d0d08e8c389dcc4",
|
||||
"zh:59e3e99afc1ea404e530100725403c1610d682cfd27eeeaf35190c119b76a4db",
|
||||
"zh:666cb7d299824152714202e8fda000c2e37346f2ae6d0a0e3c6f6bd68ef5d9ca",
|
||||
"zh:6a1290b85e7bf953664b21b2a1ea554923a060f2a8347d8d5bb3d2b5157f85d2",
|
||||
"zh:72230960c49fe7050a5e80ee10fa24cdac94dbab82744bccb6aa251741eb5aa9",
|
||||
"zh:91f655c41f5af9a9fdcf6104c3d0a553eaa0fb3390af81051e744f30accd5b52",
|
||||
"zh:aa08a22bf737d5840573bb6030617ab6bba2a292f4b9c88b20477cdcfb9676a9",
|
||||
"zh:b72012cc284cad488207532b6668c58999c972d837b5f486db1d7466d686d5fd",
|
||||
"zh:e24f934249a6ab4d3705c1398226d4d9df1e81ef8a36592389be02bc35cc661f",
|
||||
"zh:e9e6bcef8b6a6b5ff2317168c2c23e4c55ae23f883ba158d2c4fd6324a0413e5",
|
||||
"zh:ffa1e742a8c50babd8dbfcd6884740f9bea8453ec4d832717ff006a4fbfffa91",
|
||||
"h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=",
|
||||
"zh:2ee860cd0a368b3eaa53f4a9ea46f16dab8a97929e813ea6ef55183f8112c2ca",
|
||||
"zh:415965fd915bae2040d7f79e45f64d6e3ae61149c10114efeac1b34687d7296c",
|
||||
"zh:6584b2055df0e32062561c615e3b6b2c291ca8c959440adda09ef3ec1e1436bd",
|
||||
"zh:65dcfad71928e0a8dd9befc22524ed686be5020b0024dc5cca5184c7420eeb6b",
|
||||
"zh:7253dc29bd265d33f2791ac4f779c5413f16720bb717de8e6c5fcb2c858648ea",
|
||||
"zh:7ec8993da10a47606670f9f67cfd10719a7580641d11c7aa761121c4a2bd66fb",
|
||||
"zh:999a3f7a9dcf517967fc537e6ec930a8172203642fb01b8e1f78f908373db210",
|
||||
"zh:a50e6df7280eb6584a5fd2456e3f5b6df13b2ec8a7fa4605511e438e1863be42",
|
||||
"zh:b25b329a1e42681c509d027fee0365414f0cc5062b65690cfc3386aab16132ae",
|
||||
"zh:c028877fdb438ece48f7bc02b65bbae9ca7b7befbd260e519ccab6c0cbb39f26",
|
||||
"zh:cf0eaa3ea9fcc6d62793637947f1b8d7c885b6ad74695ab47e134e4ff132190f",
|
||||
"zh:d5ade3fae031cc629b7c512a7b60e46570f4c41665e88a595d7efd943dde5ab2",
|
||||
"zh:f388c15ad1ecfc09e7361e3b98bae9b627a3a85f7b908c9f40650969c949901c",
|
||||
"zh:f415cc6f735a3971faae6ac24034afdb9ee83373ef8de19a9631c187d5adc7db",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,70 +0,0 @@
|
||||
### Alpaca cloud-init template
|
||||
data "template_file" "cloud_init_alpaca" {
|
||||
count = var.alpaca_vm_count
|
||||
template = file("${path.module}/files/cloud_init.yaml")
|
||||
|
||||
vars = {
|
||||
ssh_key = var.ssh_key
|
||||
hostname = "alpaca-${count.index + 1}"
|
||||
domain = "home.arpa"
|
||||
TS_AUTHKEY = var.TS_AUTHKEY
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
resource "local_file" "cloud_init_alpaca" {
|
||||
count = var.alpaca_vm_count
|
||||
content = data.template_file.cloud_init_alpaca[count.index].rendered
|
||||
filename = "${path.module}/files/cloud_init_alpaca_${count.index + 1}.yaml"
|
||||
}
|
||||
|
||||
resource "null_resource" "upload_cloud_init_alpaca" {
|
||||
count = var.alpaca_vm_count
|
||||
|
||||
connection {
|
||||
type = "ssh"
|
||||
user = "root"
|
||||
host = var.target_node
|
||||
}
|
||||
|
||||
provisioner "file" {
|
||||
source = local_file.cloud_init_alpaca[count.index].filename
|
||||
destination = "/var/lib/vz/snippets/cloud_init_alpaca_${count.index + 1}.yaml"
|
||||
}
|
||||
}
|
||||
|
||||
### Llama cloud-init template
|
||||
data "template_file" "cloud_init_llama" {
|
||||
count = var.llama_vm_count
|
||||
template = file("${path.module}/files/cloud_init.yaml")
|
||||
|
||||
vars = {
|
||||
ssh_key = var.ssh_key
|
||||
hostname = "llama-${count.index + 1}"
|
||||
domain = "home.arpa"
|
||||
TS_AUTHKEY = var.TS_AUTHKEY
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
resource "local_file" "cloud_init_llama" {
|
||||
count = var.llama_vm_count
|
||||
content = data.template_file.cloud_init_llama[count.index].rendered
|
||||
filename = "${path.module}/files/cloud_init_llama_${count.index + 1}.yaml"
|
||||
}
|
||||
|
||||
resource "null_resource" "upload_cloud_init_llama" {
|
||||
count = var.llama_vm_count
|
||||
|
||||
connection {
|
||||
type = "ssh"
|
||||
user = "root"
|
||||
host = var.target_node
|
||||
}
|
||||
|
||||
provisioner "file" {
|
||||
source = local_file.cloud_init_llama[count.index].filename
|
||||
destination = "/var/lib/vz/snippets/cloud_init_llama_${count.index + 1}.yaml"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
#cloud-config
|
||||
hostname: ${hostname}
|
||||
fqdn: ${hostname}.${domain}
|
||||
ssh_authorized_keys:
|
||||
- ${ssh_key}
|
||||
|
||||
runcmd:
|
||||
- curl -fsSL https://tailscale.com/install.sh | sh
|
||||
- tailscale up --auth-key=${TS_AUTHKEY}
|
||||
- tailscale set --ssh
|
||||
|
||||
|
||||
6
terraform/files/cloud_init_base.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
#cloud-config
|
||||
runcmd:
|
||||
- curl -fsSL https://tailscale.com/install.sh | sh
|
||||
- tailscale up --auth-key=${TS_AUTHKEY}
|
||||
- tailscale set --ssh
|
||||
|
||||
15
terraform/files/cloud_init_global.tpl
Normal file
@@ -0,0 +1,15 @@
|
||||
#cloud-config
|
||||
hostname: ${hostname}
|
||||
manage_etc_hosts: true
|
||||
resolv_conf:
|
||||
nameservers:
|
||||
- 8.8.8.8
|
||||
- 1.1.1.1
|
||||
|
||||
preserve_hostname: false
|
||||
fqdn: ${hostname}.${domain}
|
||||
|
||||
users:
|
||||
- name: micqdf
|
||||
ssh_authorized_keys:
|
||||
- ${SSH_KEY_PUBLIC}
|
||||
@@ -1,42 +1,71 @@
terraform {
backend "s3" {}

required_providers {
proxmox = {
source = "Telmate/proxmox"
version = "3.0.1-rc8"
version = "3.0.2-rc07"
}
}
}

locals {
control_plane_ipconfig = [
for ip in var.control_plane_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}"
]
worker_ipconfig = [
for ip in var.worker_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}"
]
}

provider "proxmox" {
pm_api_url = var.pm_api_url
pm_user = var.pm_user
pm_password = var.proxmox_password
pm_tls_insecure = true
pm_api_url = var.pm_api_url
pm_api_token_id = var.pm_api_token_id
pm_api_token_secret = var.pm_api_token_secret
pm_tls_insecure = true
}

resource "proxmox_vm_qemu" "alpacas" {
count = var.alpaca_vm_count
name = "alpaca-${count.index + 1}"
vmid = 500 + count.index + 1
target_node = var.target_node
clone = var.clone_template
full_clone = false
agent = 1
resource "proxmox_vm_qemu" "control_planes" {
count = var.control_plane_count
name = "cp-${count.index + 1}"
vmid = var.control_plane_vmid_start + count.index
target_node = var.target_node
clone = var.clone_template
full_clone = true
os_type = "cloud-init"
agent = var.qemu_agent_enabled ? 1 : 0
automatic_reboot = true

sockets = var.sockets
cores = var.cores
memory = var.memory
scsihw = "virtio-scsi-pci"
boot = "order=scsi0"
ipconfig0 = "ip=dhcp"
cicustom = "user=local:snippets/cloud_init_alpaca_${count.index + 1}.yaml"
depends_on = [null_resource.upload_cloud_init_alpaca]
cpu {
sockets = 1
cores = var.control_plane_cores
}
memory = var.control_plane_memory_mb
scsihw = "virtio-scsi-pci"
boot = "order=scsi0"
bootdisk = "scsi0"
ipconfig0 = local.control_plane_ipconfig[count.index]
ciuser = "micqdf"
sshkeys = var.SSH_KEY_PUBLIC

disk {
slot = "scsi0"
type = "disk"
storage = var.storage
size = var.disk_size

disks {
scsi {
scsi0 {
disk {
size = var.control_plane_disk_size
storage = var.storage
}
}
}

ide {
ide2 {
cloudinit {
storage = var.storage
}
}
}
}

network {
@@ -44,38 +73,63 @@ resource "proxmox_vm_qemu" "alpacas" {
model = "virtio"
bridge = var.bridge
}

lifecycle {
ignore_changes = all
}
}


resource "proxmox_vm_qemu" "llamas" {
count = var.llama_vm_count
name = "llama-${count.index + 1}"
vmid = 600 + count.index + 1
target_node = var.target_node
clone = var.clone_template
full_clone = false
agent = 1
resource "proxmox_vm_qemu" "workers" {
count = var.worker_count
name = "wk-${count.index + 1}"
vmid = var.worker_vmid_start + count.index
target_node = var.target_node
clone = var.clone_template
full_clone = true
os_type = "cloud-init"
agent = var.qemu_agent_enabled ? 1 : 0
automatic_reboot = true

sockets = var.sockets
cores = var.cores
memory = var.memory
scsihw = "virtio-scsi-pci"
boot = "order=scsi0"
ipconfig0 = "ip=dhcp"
cicustom = "user=local:snippets/cloud_init_llama_${count.index + 1}.yaml"
depends_on = [null_resource.upload_cloud_init_llama]

disk {
slot = "scsi0"
type = "disk"
storage = var.storage
size = var.disk_size
cpu {
sockets = 1
cores = var.worker_cores[count.index]
}
memory = var.worker_memory_mb[count.index]
scsihw = "virtio-scsi-pci"
boot = "order=scsi0"
bootdisk = "scsi0"
ipconfig0 = local.worker_ipconfig[count.index]
ciuser = "micqdf"
sshkeys = var.SSH_KEY_PUBLIC

disks {
scsi {
scsi0 {
disk {
size = var.worker_disk_size
storage = var.storage
}
}
}

ide {
ide2 {
cloudinit {
storage = var.storage
}
}
}
}


network {
id = 0
model = "virtio"
bridge = var.bridge
}
}

lifecycle {
ignore_changes = all
}
}
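For orientation (not part of the diff): the new locals render Proxmox cloud-init ipconfig strings from the static address variables. A minimal standalone sketch, using the example values set in the terraform.tfvars hunk further down, shows what those strings look like:

```hcl
# Illustrative sketch only — mirrors the for-expression in the locals block
# above, with the literal values taken from the terraform.tfvars hunk below.
locals {
  example_prefix_length = 10
  example_gateway       = "10.27.27.1"
  example_ips           = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]

  # Evaluates to:
  #   "ip=10.27.27.50/10,gw=10.27.27.1"
  #   "ip=10.27.27.51/10,gw=10.27.27.1"
  #   "ip=10.27.27.49/10,gw=10.27.27.1"
  example_ipconfig = [
    for ip in local.example_ips : "ip=${ip}/${local.example_prefix_length},gw=${local.example_gateway}"
  ]
}
```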
@@ -1,22 +1,35 @@
output "alpaca_vm_ids" {
output "control_plane_vm_ids" {
value = {
for i in range(var.alpaca_count) :
"alpaca-${i + 1}" => proxmox_vm_qemu.alpacas[i].vmid
for i in range(var.control_plane_count) :
"cp-${i + 1}" => proxmox_vm_qemu.control_planes[i].vmid
}
}

output "alpaca_vm_names" {
value = [for vm in proxmox_vm_qemu.alpacas : vm.name]
output "control_plane_vm_names" {
value = [for vm in proxmox_vm_qemu.control_planes : vm.name]
}

output "llama_vm_ids" {
output "control_plane_vm_ipv4" {
value = {
for i in range(var.llama_count) :
"llama-${i + 1}" => proxmox_vm_qemu.llamas[i].vmid
for i in range(var.control_plane_count) :
proxmox_vm_qemu.control_planes[i].name => var.control_plane_ips[i]
}
}

output "llama_vm_names" {
value = [for vm in proxmox_vm_qemu.llamas : vm.name]
output "worker_vm_ids" {
value = {
for i in range(var.worker_count) :
"wk-${i + 1}" => proxmox_vm_qemu.workers[i].vmid
}
}

output "worker_vm_names" {
value = [for vm in proxmox_vm_qemu.workers : vm.name]
}

output "worker_vm_ipv4" {
value = {
for i in range(var.worker_count) :
proxmox_vm_qemu.workers[i].name => var.worker_ips[i]
}
}
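The renamed outputs can be consumed by other configurations through the S3 state backend declared in the first hunk. A hedged sketch of a downstream consumer follows; the bucket, key, and region are placeholders, not values from this repository:

```hcl
# Hypothetical downstream configuration reading this project's state.
data "terraform_remote_state" "proxmox_k8s" {
  backend = "s3"
  config = {
    bucket = "terraform-state"     # assumption: real values come from `terraform init -backend-config`
    key    = "proxmox-k8s.tfstate" # assumption
    region = "us-east-1"           # assumption
  }
}

output "first_control_plane_ip" {
  # control_plane_vm_ipv4 maps VM names ("cp-1", ...) to the static IPs defined above.
  value = data.terraform_remote_state.proxmox_k8s.outputs.control_plane_vm_ipv4["cp-1"]
}
```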
@@ -1,13 +1,25 @@
target_node = "flex"
clone_template = "Alpine-TemplateV2"
vm_name = "alpine-vm"
cores = 2
memory = 2048
disk_size = "15G"
sockets = 1
bridge = "vmbr0"
disk_type = "scsi"
storage = "Flash"
pm_api_url = "https://100.105.0.115:8006/api2/json"
pm_user = "terraform-prov@pve"
target_node = "flex"
clone_template = "k8s-base-template"
bridge = "vmbr0"
storage = "Flash"
pm_api_url = "https://100.105.0.115:8006/api2/json"
pm_api_token_id = "terraform-prov@pve!mytoken"

control_plane_count = 3
worker_count = 3
control_plane_vmid_start = 701
worker_vmid_start = 711

control_plane_cores = 1
control_plane_memory_mb = 4096
control_plane_disk_size = "80G"

worker_cores = [4, 4, 4]
worker_memory_mb = [12288, 12288, 12288]
worker_disk_size = "120G"

network_prefix_length = 10
network_gateway = "10.27.27.1"

control_plane_ips = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
worker_ips = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
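For orientation: with these counts and vmid_start values, the resources above produce names cp-1 through cp-3 and wk-1 through wk-3 with consecutive VMIDs, since the resources use name = "cp-${count.index + 1}" and vmid = var.control_plane_vmid_start + count.index (and likewise for workers). An illustrative sketch of the expected mapping:

```hcl
# Expected name-to-VMID mapping implied by the tfvars above (illustrative only).
locals {
  expected_vm_ids = {
    "cp-1" = 701
    "cp-2" = 702
    "cp-3" = 703
    "wk-1" = 711
    "wk-2" = 712
    "wk-3" = 713
  }
}
```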
@@ -1,5 +1,22 @@
variable "proxmox_password" {
type = string
variable "pm_api_token_id" {
type = string
description = "Proxmox API token ID (format: user@realm!tokenid)"

validation {
condition = can(regex(".+!.+", trimspace(var.pm_api_token_id)))
error_message = "pm_api_token_id must be in format user@realm!tokenid."
}
}

variable "pm_api_token_secret" {
type = string
sensitive = true
description = "Proxmox API token secret"

validation {
condition = length(trimspace(var.pm_api_token_secret)) > 0
error_message = "pm_api_token_secret cannot be empty. Check your Gitea secret PM_API_TOKEN_SECRET."
}
}

variable "target_node" {
@@ -10,34 +27,104 @@ variable "clone_template" {
type = string
}

variable "vm_name" {
type = string
variable "control_plane_count" {
type = number
default = 3
description = "Number of control plane VMs"
}

variable "cores" {
type = number
variable "worker_count" {
type = number
default = 3
description = "Number of worker VMs"
}

variable "memory" {
type = number
variable "control_plane_vmid_start" {
type = number
default = 701
description = "Starting VMID for control plane VMs"
}

variable "disk_size" {
type = string
variable "worker_vmid_start" {
type = number
default = 711
description = "Starting VMID for worker VMs"
}

variable "sockets" {
type = number
variable "control_plane_cores" {
type = number
default = 1
description = "vCPU cores per control plane VM"
}

variable "control_plane_memory_mb" {
type = number
default = 4096
description = "Memory in MB per control plane VM"
}

variable "worker_cores" {
type = list(number)
default = [4, 4, 4]
description = "vCPU cores for each worker VM"
}

variable "worker_memory_mb" {
type = list(number)
default = [12288, 12288, 12288]
description = "Memory in MB for each worker VM"
}

variable "control_plane_disk_size" {
type = string
default = "80G"
description = "Disk size for control plane VMs"
}

variable "worker_disk_size" {
type = string
default = "120G"
description = "Disk size for worker VMs"
}

variable "network_prefix_length" {
type = number
default = 10
description = "CIDR prefix length for static VM addresses"
}

variable "network_gateway" {
type = string
default = "10.27.27.1"
description = "Gateway for static VM addresses"
}

variable "control_plane_ips" {
type = list(string)
default = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
description = "Static IPv4 addresses for control plane VMs"

validation {
condition = length(var.control_plane_ips) == 3
error_message = "control_plane_ips must contain exactly 3 IPs."
}
}

variable "worker_ips" {
type = list(string)
default = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
description = "Static IPv4 addresses for worker VMs"

validation {
condition = length(var.worker_ips) == 3
error_message = "worker_ips must contain exactly 3 IPs."
}
}

variable "bridge" {
type = string
}

variable "disk_type" {
type = string
}

variable "storage" {
type = string
}
@@ -46,42 +133,13 @@ variable "pm_api_url" {
type = string
}

variable "pm_user" {
type = string
variable "qemu_agent_enabled" {
type = bool
default = false
description = "Enable QEMU guest agent integration in Proxmox resources"
}

variable "alpaca_count" {
type = number
default = 1
description = "How many Alpaca VMs to create"
}

variable "llama_count" {
type = number
default = 1
description = "How many Llama VMs to create"
}

variable "alpaca_vm_count" {
type = number
default = 1
description = "How many Alpaca VMs to create"
}

variable "llama_vm_count" {
type = number
default = 1
description = "How many Llama VMs to create"
}

variable "TS_AUTHKEY" {
variable "SSH_KEY_PUBLIC" {
type = string
description = "Tailscale auth key used in cloud-init"
description = "Public SSH key injected via cloud-init"
}


variable "ssh_key" {
type = string
description = "Public SSH key used by cloud-init"
}
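The new token variables use validation blocks to fail fast on malformed input. As a sketch of how the same pattern could extend to the address lists (not part of this change; the variable name here is hypothetical), an IPv4 format check might look like:

```hcl
variable "worker_ips_checked" {
  type        = list(string)
  default     = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
  description = "Example variable demonstrating an IPv4 format validation"

  validation {
    # alltrue/can/regex are built-in Terraform functions; the regex accepts
    # any dotted-quad string without range-checking each octet.
    condition = alltrue([
      for ip in var.worker_ips_checked :
      can(regex("^([0-9]{1,3}\\.){3}[0-9]{1,3}$", ip))
    ])
    error_message = "Each entry must be a dotted-quad IPv4 address."
  }
}
```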