recovery(metis): restore node identity on rebuilt images
This commit is contained in:
parent
ebaa367efd
commit
17069e4677
@ -154,6 +154,10 @@ if [ -s "${sudoers_file}" ]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -x /usr/local/sbin/metis-apply-node-identity.sh ]; then
|
||||
/usr/local/sbin/metis-apply-node-identity.sh || true
|
||||
fi
|
||||
|
||||
rm -f /root/.not_logged_in_yet
|
||||
|
||||
if ! command -v k3s >/dev/null 2>&1; then
|
||||
|
||||
@ -2,8 +2,6 @@ package plan
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@ -64,6 +62,7 @@ func Files(inv *inventory.Inventory, nodeName string) ([]inject.FileSpec, error)
|
||||
cfg.Secrets = sec.Extra
|
||||
}
|
||||
}
|
||||
applyNodeMetadataEnv(cfg)
|
||||
files, err := buildFiles(cfg, sec)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -111,7 +110,9 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
|
||||
{Path: "etc/hostname", Content: []byte(cfg.Hostname + "\n"), Mode: 0o644, RootFS: true},
|
||||
{Path: "etc/hosts", Content: []byte(hostsContent(cfg.Hostname)), Mode: 0o644, RootFS: true},
|
||||
{Path: "etc/rancher/k3s/config.yaml", Content: []byte(k3sConfigContent(cfg)), Mode: 0o644, RootFS: true},
|
||||
{Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg)), Mode: 0o600, RootFS: true},
|
||||
{Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg, sec)), Mode: 0o600, RootFS: true},
|
||||
{Path: "usr/local/sbin/metis-apply-node-identity.sh", Content: []byte(nodeIdentityScriptContent()), Mode: 0o755, RootFS: true},
|
||||
{Path: "etc/cloud/cloud.cfg.d/90-metis-recovery.cfg", Content: []byte(cloudInitRootFSContent(sec)), Mode: 0o644, RootFS: true},
|
||||
}
|
||||
if cfg.IP != "" {
|
||||
files = append(files, inject.FileSpec{
|
||||
@ -148,6 +149,14 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
|
||||
RootFS: true,
|
||||
})
|
||||
}
|
||||
if passwordAuth := sshPasswordConfigContent(sec); passwordAuth != "" {
|
||||
files = append(files, inject.FileSpec{
|
||||
Path: "etc/ssh/sshd_config.d/90-metis-password-auth.conf",
|
||||
Content: []byte(passwordAuth),
|
||||
Mode: 0o644,
|
||||
RootFS: true,
|
||||
})
|
||||
}
|
||||
if cfg.SSHUser == "atlas" {
|
||||
sudoers := hecateSudoersContent(cfg.SSHUser)
|
||||
files = append(files, inject.FileSpec{
|
||||
@ -172,8 +181,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
|
||||
})
|
||||
}
|
||||
|
||||
// Store the raw config for debugging/ops.
|
||||
raw, err := json.MarshalIndent(cfg, "", " ")
|
||||
raw, err := jsonMarshalIndent(cfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -184,7 +192,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
|
||||
RootFS: true,
|
||||
})
|
||||
if sec != nil {
|
||||
secRaw, err := json.MarshalIndent(sec, "", " ")
|
||||
secRaw, err := jsonMarshalIndent(redactedSecretsForImage(sec))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -196,7 +204,6 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
|
||||
})
|
||||
}
|
||||
|
||||
// Optional cloud-init for images that honor NoCloud.
|
||||
userData := cloudInitUserData(cfg, sec)
|
||||
if userData != "" {
|
||||
files = append(files, inject.FileSpec{
|
||||
@ -267,33 +274,6 @@ func allowK3sNodeLabel(role, key string) bool {
|
||||
return !strings.HasPrefix(key, "node-role.kubernetes.io/")
|
||||
}
|
||||
|
||||
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
|
||||
if cfg == nil {
|
||||
return ""
|
||||
}
|
||||
if sec != nil && sec.CloudInit != "" {
|
||||
return sec.CloudInit
|
||||
}
|
||||
var b bytes.Buffer
|
||||
b.WriteString("#cloud-config\n")
|
||||
b.WriteString(fmt.Sprintf("hostname: %s\n", cfg.Hostname))
|
||||
if len(cfg.SSHKeys) > 0 {
|
||||
b.WriteString("ssh_authorized_keys:\n")
|
||||
for _, k := range cfg.SSHKeys {
|
||||
b.WriteString(fmt.Sprintf(" - %s\n", k))
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func firstbootEnvContent(cfg *config.NodeConfig) string {
|
||||
var b bytes.Buffer
|
||||
b.WriteString(fmt.Sprintf("METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname)))
|
||||
b.WriteString(fmt.Sprintf("METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser)))
|
||||
b.WriteString(fmt.Sprintf("METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version)))
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func networkManagerConnectionContent(id, iface, ip string) string {
|
||||
gateway := ip
|
||||
if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 {
|
||||
@ -347,7 +327,6 @@ func fstabAppendContent(cfg *config.NodeConfig) string {
|
||||
source := entry.Source
|
||||
switch {
|
||||
case source != "":
|
||||
// Use the explicit source path for bind mounts.
|
||||
case entry.UUID != "":
|
||||
source = "UUID=" + entry.UUID
|
||||
case entry.Label != "":
|
||||
@ -374,25 +353,6 @@ func hecateSudoersContent(user string) string {
|
||||
)
|
||||
}
|
||||
|
||||
func shellQuote(value string) string {
|
||||
if value == "" {
|
||||
return "''"
|
||||
}
|
||||
return "'" + strings.ReplaceAll(value, "'", `'"'"'`) + "'"
|
||||
}
|
||||
|
||||
func fetchSecrets(hostname string) *secrets.NodeSecrets {
|
||||
if os.Getenv("VAULT_ADDR") == "" {
|
||||
return nil
|
||||
}
|
||||
cli := secrets.NewFromEnv()
|
||||
sec, err := cli.FetchNode(context.Background(), hostname)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return sec
|
||||
}
|
||||
|
||||
func collectOverlays(class *inventory.NodeClass) ([]inject.FileSpec, error) {
|
||||
var files []inject.FileSpec
|
||||
if class == nil {
|
||||
|
||||
@ -174,3 +174,80 @@ func TestBuildFilesAddsHecateSudoersForAtlas(t *testing.T) {
|
||||
t.Fatalf("metis sudoers backup missing/incorrect: %s", backup)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets verifies that buildFiles
// emits the password-recovery artifacts (firstboot env with password material,
// sshd password-auth drop-in, node-identity script, cloud-init recovery hooks)
// and that the on-image secrets.json is redacted to presence metadata only.
func TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets(t *testing.T) {
	cfg := &config.NodeConfig{
		Hostname: "titan-15",
		IP:       "192.168.22.43",
		SSHUser:  "atlas",
		SSHKeys:  []string{"ssh-ed25519 AAA test"},
		K3s: config.K3sConfig{
			Role:    "agent",
			Version: "v1.31.5+k3s1",
		},
	}
	sec := &secrets.NodeSecrets{
		SSHPassword:  "atlas-pass",
		RootPassword: "root-pass",
		K3sToken:     "super-secret-token",
		Extra:        map[string]string{"api_key": "secret"},
	}
	files, err := buildFiles(cfg, sec)
	if err != nil {
		t.Fatalf("buildFiles: %v", err)
	}
	// Index the produced file specs by path for the assertions below.
	pathMap := map[string]string{}
	for _, file := range files {
		pathMap[file.Path] = string(file.Content)
	}
	firstboot := pathMap["etc/metis/firstboot.env"]
	if !strings.Contains(firstboot, "METIS_ATLAS_PASSWORD='atlas-pass'") || !strings.Contains(firstboot, "METIS_ROOT_PASSWORD='root-pass'") {
		t.Fatalf("firstboot env missing password material: %s", firstboot)
	}
	if sshd := pathMap["etc/ssh/sshd_config.d/90-metis-password-auth.conf"]; !strings.Contains(sshd, "PasswordAuthentication yes") || !strings.Contains(sshd, "PermitRootLogin yes") {
		t.Fatalf("password auth config missing: %s", sshd)
	}
	if script := pathMap["usr/local/sbin/metis-apply-node-identity.sh"]; !strings.Contains(script, "apply_password root") || !strings.Contains(script, "METIS_ATLAS_PASSWORD") {
		t.Fatalf("node identity script missing password application: %s", script)
	}
	if cloudCfg := pathMap["etc/cloud/cloud.cfg.d/90-metis-recovery.cfg"]; !strings.Contains(cloudCfg, "ssh_pwauth: true") {
		t.Fatalf("cloud recovery config missing ssh_pwauth: %s", cloudCfg)
	}
	if userData := pathMap["user-data"]; !strings.Contains(userData, "ssh_pwauth: true") || !strings.Contains(userData, "metis-apply-node-identity.sh") {
		t.Fatalf("cloud-init user-data missing recovery hooks: %s", userData)
	}
	// Raw secret values must never appear in the debug JSON written to the image.
	secretsJSON := pathMap["etc/metis/secrets.json"]
	if strings.Contains(secretsJSON, "atlas-pass") || strings.Contains(secretsJSON, "root-pass") || strings.Contains(secretsJSON, "super-secret-token") {
		t.Fatalf("secrets.json should be redacted: %s", secretsJSON)
	}
	if !strings.Contains(secretsJSON, `"has_ssh_password": true`) || !strings.Contains(secretsJSON, `"extra_keys": [`) {
		t.Fatalf("secrets.json should keep redacted debug metadata: %s", secretsJSON)
	}
}
|
||||
|
||||
// TestApplyNodeMetadataEnv checks that env-provided labels/taints are merged
// over the config (env values win on key conflicts) and that malformed JSON
// in either variable is ignored without mutating the config.
func TestApplyNodeMetadataEnv(t *testing.T) {
	cfg := &config.NodeConfig{
		Labels: map[string]string{"hardware": "rpi4"},
		Taints: []string{"flash=true:NoSchedule"},
		K3s: config.K3sConfig{
			Labels: map[string]string{"hardware": "rpi4"},
			Taints: []string{"flash=true:NoSchedule"},
		},
	}
	t.Setenv("METIS_NODE_LABELS_JSON", `{"hardware":"rpi5","maintenance.bstein.dev/role":"recovery"}`)
	t.Setenv("METIS_NODE_TAINTS_JSON", `["dedicated=recovery:NoSchedule","flash=true:NoSchedule"]`)
	applyNodeMetadataEnv(cfg)
	if cfg.Labels["hardware"] != "rpi5" || cfg.Labels["maintenance.bstein.dev/role"] != "recovery" {
		t.Fatalf("applyNodeMetadataEnv labels = %#v", cfg.Labels)
	}
	if !strings.Contains(strings.Join(cfg.Taints, ","), "dedicated=recovery:NoSchedule") {
		t.Fatalf("applyNodeMetadataEnv taints = %#v", cfg.Taints)
	}
	// Malformed JSON must leave a fresh config untouched.
	cfg = &config.NodeConfig{}
	t.Setenv("METIS_NODE_LABELS_JSON", `{bad-json`)
	t.Setenv("METIS_NODE_TAINTS_JSON", `{bad-json`)
	applyNodeMetadataEnv(cfg)
	if cfg.Labels != nil || cfg.Taints != nil {
		t.Fatalf("invalid env JSON should be ignored: %#v", cfg)
	}
}
|
||||
|
||||
262
pkg/plan/node_identity.go
Normal file
262
pkg/plan/node_identity.go
Normal file
@ -0,0 +1,262 @@
|
||||
package plan
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"metis/pkg/config"
|
||||
"metis/pkg/secrets"
|
||||
)
|
||||
|
||||
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
|
||||
if cfg == nil {
|
||||
return ""
|
||||
}
|
||||
if sec != nil && sec.CloudInit != "" {
|
||||
return sec.CloudInit
|
||||
}
|
||||
var b bytes.Buffer
|
||||
b.WriteString("#cloud-config\n")
|
||||
b.WriteString(fmt.Sprintf("hostname: %s\n", cfg.Hostname))
|
||||
if len(cfg.SSHKeys) > 0 {
|
||||
b.WriteString("ssh_authorized_keys:\n")
|
||||
for _, k := range cfg.SSHKeys {
|
||||
b.WriteString(fmt.Sprintf(" - %s\n", k))
|
||||
}
|
||||
}
|
||||
if hasNodePasswords(sec) {
|
||||
b.WriteString("ssh_pwauth: true\n")
|
||||
b.WriteString("disable_root: false\n")
|
||||
}
|
||||
b.WriteString("runcmd:\n")
|
||||
b.WriteString(" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func firstbootEnvContent(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
|
||||
var b bytes.Buffer
|
||||
b.WriteString(fmt.Sprintf("METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname)))
|
||||
b.WriteString(fmt.Sprintf("METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser)))
|
||||
b.WriteString("METIS_ATLAS_USER='atlas'\n")
|
||||
b.WriteString(fmt.Sprintf("METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version)))
|
||||
if sec != nil {
|
||||
if value := effectiveAtlasPassword(sec); value != "" {
|
||||
b.WriteString(fmt.Sprintf("METIS_ATLAS_PASSWORD=%s\n", shellQuote(value)))
|
||||
}
|
||||
if value := effectiveAtlasPasswordHash(sec); value != "" {
|
||||
b.WriteString(fmt.Sprintf("METIS_ATLAS_PASSWORD_HASH=%s\n", shellQuote(value)))
|
||||
}
|
||||
if value := strings.TrimSpace(sec.RootPassword); value != "" {
|
||||
b.WriteString(fmt.Sprintf("METIS_ROOT_PASSWORD=%s\n", shellQuote(value)))
|
||||
}
|
||||
if value := strings.TrimSpace(sec.RootPasswordHash); value != "" {
|
||||
b.WriteString(fmt.Sprintf("METIS_ROOT_PASSWORD_HASH=%s\n", shellQuote(value)))
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func cloudInitRootFSContent(sec *secrets.NodeSecrets) string {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("#cloud-config\n")
|
||||
if hasNodePasswords(sec) {
|
||||
b.WriteString("ssh_pwauth: true\n")
|
||||
b.WriteString("disable_root: false\n")
|
||||
}
|
||||
b.WriteString("runcmd:\n")
|
||||
b.WriteString(" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// nodeIdentityScriptContent returns the metis-apply-node-identity.sh helper
// installed at /usr/local/sbin. The script is idempotent (guarded by a marker
// under /var/lib/metis), sources /etc/metis/firstboot.env for identity
// material, ensures the atlas and ssh users exist with the standard group
// set, applies passwords (a hash takes precedence over a plaintext value),
// installs authorized keys for root/atlas/ssh users, installs the hecate
// sudoers drop-in (validated with visudo when available, removed if invalid),
// then restarts sshd best-effort and records the marker.
func nodeIdentityScriptContent() string {
	return `#!/usr/bin/env bash
set -euo pipefail

marker="/var/lib/metis/node-identity-applied.done"
env_file="/etc/metis/firstboot.env"
key_file="/etc/metis/authorized_keys"
sudoers_file="/etc/metis/sudoers-hecate"
default_groups=(adm sudo tty disk dialout audio video plugdev games users systemd-journal input render netdev)

if [ -f "${marker}" ]; then
exit 0
fi

mkdir -p /var/lib/metis
if [ -f "${env_file}" ]; then
# shellcheck disable=SC1090
. "${env_file}"
fi

atlas_user="${METIS_ATLAS_USER:-atlas}"
ssh_user="${METIS_SSH_USER:-${atlas_user}}"
atlas_password="${METIS_ATLAS_PASSWORD:-}"
atlas_password_hash="${METIS_ATLAS_PASSWORD_HASH:-}"
root_password="${METIS_ROOT_PASSWORD:-}"
root_password_hash="${METIS_ROOT_PASSWORD_HASH:-}"

group_list=()
for group_name in "${default_groups[@]}"; do
if getent group "${group_name}" >/dev/null 2>&1; then
group_list+=("${group_name}")
fi
done
if [ "${#group_list[@]}" -gt 0 ]; then
group_csv="$(IFS=,; printf '%s' "${group_list[*]}")"
else
group_csv=""
fi

ensure_user() {
local user_name="$1"
[ -n "${user_name}" ] || return 0
if ! id "${user_name}" >/dev/null 2>&1; then
if [ -n "${group_csv}" ]; then
useradd -m -s /bin/bash -G "${group_csv}" "${user_name}"
else
useradd -m -s /bin/bash "${user_name}"
fi
elif [ -n "${group_csv}" ]; then
usermod -a -G "${group_csv}" "${user_name}" || true
fi
}

apply_password() {
local user_name="$1"
local plain_password="$2"
local hash_password="$3"
if ! id "${user_name}" >/dev/null 2>&1; then
return 0
fi
if [ -n "${hash_password}" ]; then
usermod -p "${hash_password}" "${user_name}"
passwd -u "${user_name}" >/dev/null 2>&1 || true
return 0
fi
if [ -n "${plain_password}" ]; then
printf '%s:%s\n' "${user_name}" "${plain_password}" | chpasswd
passwd -u "${user_name}" >/dev/null 2>&1 || true
fi
}

install_keys() {
local user_name="$1"
[ -n "${user_name}" ] || return 0
[ -s "${key_file}" ] || return 0
local home_dir
home_dir="$(getent passwd "${user_name}" | cut -d: -f6)"
if [ -z "${home_dir}" ]; then
if [ "${user_name}" = "root" ]; then
home_dir="/root"
else
home_dir="/home/${user_name}"
fi
fi
install -d -m 700 "${home_dir}/.ssh"
install -m 600 "${key_file}" "${home_dir}/.ssh/authorized_keys"
chown -R "${user_name}:${user_name}" "${home_dir}/.ssh" 2>/dev/null || true
}

ensure_user "${atlas_user}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
ensure_user "${ssh_user}"
fi

apply_password root "${root_password}" "${root_password_hash}"
apply_password "${atlas_user}" "${atlas_password}" "${atlas_password_hash}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
apply_password "${ssh_user}" "${atlas_password}" "${atlas_password_hash}"
fi

if [ -s "${key_file}" ]; then
install_keys root
install_keys "${atlas_user}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
install_keys "${ssh_user}"
fi
fi

if [ -s "${sudoers_file}" ]; then
install -d -m 755 /etc/sudoers.d
install -m 440 "${sudoers_file}" /etc/sudoers.d/90-hecate-atlas
if command -v visudo >/dev/null 2>&1; then
visudo -cf /etc/sudoers.d/90-hecate-atlas >/dev/null 2>&1 || rm -f /etc/sudoers.d/90-hecate-atlas
fi
fi

systemctl restart ssh.service >/dev/null 2>&1 || systemctl restart sshd.service >/dev/null 2>&1 || systemctl restart ssh.socket >/dev/null 2>&1 || true
touch "${marker}"
`
}
|
||||
|
||||
func sshPasswordConfigContent(sec *secrets.NodeSecrets) string {
|
||||
if !hasNodePasswords(sec) {
|
||||
return ""
|
||||
}
|
||||
return "PasswordAuthentication yes\nKbdInteractiveAuthentication no\nChallengeResponseAuthentication no\nPermitRootLogin yes\nUsePAM yes\n"
|
||||
}
|
||||
|
||||
func hasNodePasswords(sec *secrets.NodeSecrets) bool {
|
||||
if sec == nil {
|
||||
return false
|
||||
}
|
||||
return effectiveAtlasPassword(sec) != "" || effectiveAtlasPasswordHash(sec) != "" || firstNonEmptyString(sec.RootPassword, sec.RootPasswordHash) != ""
|
||||
}
|
||||
|
||||
func effectiveAtlasPassword(sec *secrets.NodeSecrets) string {
|
||||
if sec == nil {
|
||||
return ""
|
||||
}
|
||||
return firstNonEmptyString(sec.AtlasPassword, sec.SSHPassword)
|
||||
}
|
||||
|
||||
func effectiveAtlasPasswordHash(sec *secrets.NodeSecrets) string {
|
||||
if sec == nil {
|
||||
return ""
|
||||
}
|
||||
return firstNonEmptyString(sec.AtlasPasswordHash, sec.SSHPasswordHash)
|
||||
}
|
||||
|
||||
// firstNonEmptyString returns the first candidate that is non-empty after
// trimming surrounding whitespace; the trimmed value is what is returned.
// Returns "" when every candidate is empty or whitespace-only.
func firstNonEmptyString(values ...string) string {
	for _, candidate := range values {
		candidate = strings.TrimSpace(candidate)
		if candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
func redactedSecretsForImage(sec *secrets.NodeSecrets) map[string]any {
|
||||
if sec == nil {
|
||||
return nil
|
||||
}
|
||||
debug := map[string]any{
|
||||
"has_ssh_password": firstNonEmptyString(sec.SSHPassword, sec.SSHPasswordHash) != "",
|
||||
"has_atlas_password": firstNonEmptyString(sec.AtlasPassword, sec.AtlasPasswordHash) != "",
|
||||
"has_root_password": firstNonEmptyString(sec.RootPassword, sec.RootPasswordHash) != "",
|
||||
"has_k3s_token": strings.TrimSpace(sec.K3sToken) != "",
|
||||
"has_cloud_init_override": strings.TrimSpace(sec.CloudInit) != "",
|
||||
}
|
||||
if len(sec.Extra) > 0 {
|
||||
keys := make([]string, 0, len(sec.Extra))
|
||||
for key := range sec.Extra {
|
||||
key = strings.TrimSpace(key)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
debug["extra_keys"] = keys
|
||||
}
|
||||
return debug
|
||||
}
|
||||
|
||||
// shellQuote wraps value in single quotes for safe use in a shell-sourced
// env file, escaping embedded single quotes with the '"'"' idiom. The
// empty string quotes to ''.
func shellQuote(value string) string {
	if value == "" {
		return "''"
	}
	escaped := strings.ReplaceAll(value, "'", `'"'"'`)
	return "'" + escaped + "'"
}
|
||||
133
pkg/plan/node_metadata.go
Normal file
133
pkg/plan/node_metadata.go
Normal file
@ -0,0 +1,133 @@
|
||||
package plan
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"metis/pkg/config"
|
||||
"metis/pkg/secrets"
|
||||
)
|
||||
|
||||
func fetchSecrets(hostname string) *secrets.NodeSecrets {
|
||||
envSecrets := nodeSecretsFromEnv()
|
||||
if os.Getenv("VAULT_ADDR") == "" {
|
||||
return envSecrets
|
||||
}
|
||||
cli := secrets.NewFromEnv()
|
||||
sec, err := cli.FetchNode(context.Background(), hostname)
|
||||
if err != nil {
|
||||
return envSecrets
|
||||
}
|
||||
return mergeNodeSecrets(sec, envSecrets)
|
||||
}
|
||||
|
||||
func nodeSecretsFromEnv() *secrets.NodeSecrets {
|
||||
sec := &secrets.NodeSecrets{
|
||||
SSHPassword: strings.TrimSpace(os.Getenv("METIS_NODE_SSH_PASSWORD")),
|
||||
SSHPasswordHash: strings.TrimSpace(os.Getenv("METIS_NODE_SSH_PASSWORD_HASH")),
|
||||
AtlasPassword: strings.TrimSpace(os.Getenv("METIS_NODE_ATLAS_PASSWORD")),
|
||||
AtlasPasswordHash: strings.TrimSpace(os.Getenv("METIS_NODE_ATLAS_PASSWORD_HASH")),
|
||||
RootPassword: strings.TrimSpace(os.Getenv("METIS_NODE_ROOT_PASSWORD")),
|
||||
RootPasswordHash: strings.TrimSpace(os.Getenv("METIS_NODE_ROOT_PASSWORD_HASH")),
|
||||
}
|
||||
if sec.SSHPassword == "" && sec.SSHPasswordHash == "" && sec.AtlasPassword == "" && sec.AtlasPasswordHash == "" && sec.RootPassword == "" && sec.RootPasswordHash == "" {
|
||||
return nil
|
||||
}
|
||||
return sec
|
||||
}
|
||||
|
||||
func mergeNodeSecrets(base, override *secrets.NodeSecrets) *secrets.NodeSecrets {
|
||||
if base == nil {
|
||||
return override
|
||||
}
|
||||
if override == nil {
|
||||
return base
|
||||
}
|
||||
merged := *base
|
||||
merged.SSHPassword = firstNonEmptyString(override.SSHPassword, base.SSHPassword)
|
||||
merged.SSHPasswordHash = firstNonEmptyString(override.SSHPasswordHash, base.SSHPasswordHash)
|
||||
merged.AtlasPassword = firstNonEmptyString(override.AtlasPassword, base.AtlasPassword)
|
||||
merged.AtlasPasswordHash = firstNonEmptyString(override.AtlasPasswordHash, base.AtlasPasswordHash)
|
||||
merged.RootPassword = firstNonEmptyString(override.RootPassword, base.RootPassword)
|
||||
merged.RootPasswordHash = firstNonEmptyString(override.RootPasswordHash, base.RootPasswordHash)
|
||||
merged.K3sToken = firstNonEmptyString(override.K3sToken, base.K3sToken)
|
||||
merged.CloudInit = firstNonEmptyString(override.CloudInit, base.CloudInit)
|
||||
if len(base.Extra) > 0 || len(override.Extra) > 0 {
|
||||
merged.Extra = map[string]string{}
|
||||
for key, value := range base.Extra {
|
||||
merged.Extra[key] = value
|
||||
}
|
||||
for key, value := range override.Extra {
|
||||
merged.Extra[key] = value
|
||||
}
|
||||
}
|
||||
return &merged
|
||||
}
|
||||
|
||||
func applyNodeMetadataEnv(cfg *config.NodeConfig) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if labels := parseEnvJSONMap(os.Getenv("METIS_NODE_LABELS_JSON")); len(labels) > 0 {
|
||||
if cfg.Labels == nil {
|
||||
cfg.Labels = map[string]string{}
|
||||
}
|
||||
for key, value := range labels {
|
||||
cfg.Labels[key] = value
|
||||
}
|
||||
cfg.K3s.Labels = cfg.Labels
|
||||
}
|
||||
if taints := parseEnvJSONList(os.Getenv("METIS_NODE_TAINTS_JSON")); len(taints) > 0 {
|
||||
cfg.Taints = uniqueStrings(append(cfg.Taints, taints...))
|
||||
cfg.K3s.Taints = append([]string{}, cfg.Taints...)
|
||||
}
|
||||
}
|
||||
|
||||
// parseEnvJSONMap decodes a JSON object of string-to-string pairs from an
// environment value. Returns nil for empty/whitespace input or on any
// decode error — invalid env metadata is ignored rather than fatal.
func parseEnvJSONMap(raw string) map[string]string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	var parsed map[string]string
	if err := json.Unmarshal([]byte(trimmed), &parsed); err != nil {
		return nil
	}
	return parsed
}
|
||||
|
||||
// parseEnvJSONList decodes a JSON array of strings from an environment
// value. Returns nil for empty/whitespace input or on any decode error —
// invalid env metadata is ignored rather than fatal.
func parseEnvJSONList(raw string) []string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	var parsed []string
	if err := json.Unmarshal([]byte(trimmed), &parsed); err != nil {
		return nil
	}
	return parsed
}
|
||||
|
||||
// uniqueStrings trims, de-duplicates, and sorts the given values,
// dropping entries that are empty after trimming. Always returns a
// non-nil slice (possibly empty).
func uniqueStrings(values []string) []string {
	seen := make(map[string]struct{}, len(values))
	result := make([]string, 0, len(values))
	for _, raw := range values {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		if _, dup := seen[trimmed]; dup {
			continue
		}
		seen[trimmed] = struct{}{}
		result = append(result, trimmed)
	}
	sort.Strings(result)
	return result
}
|
||||
|
||||
func jsonMarshalIndent(value any) ([]byte, error) {
|
||||
return json.MarshalIndent(value, "", " ")
|
||||
}
|
||||
127
pkg/plan/node_secrets_test.go
Normal file
127
pkg/plan/node_secrets_test.go
Normal file
@ -0,0 +1,127 @@
|
||||
package plan
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"metis/pkg/config"
|
||||
"metis/pkg/secrets"
|
||||
)
|
||||
|
||||
// TestNodeSecretHelpers covers the small secret-selection helpers: nil
// handling, the legacy ssh-password fallback for atlas credentials, the
// explicit atlas fields taking precedence, whitespace trimming in
// firstNonEmptyString, password-material detection, and the sorted
// extra_keys output of redactedSecretsForImage.
func TestNodeSecretHelpers(t *testing.T) {
	if got := effectiveAtlasPassword(nil); got != "" {
		t.Fatalf("effectiveAtlasPassword(nil) = %q", got)
	}
	if got := effectiveAtlasPasswordHash(nil); got != "" {
		t.Fatalf("effectiveAtlasPasswordHash(nil) = %q", got)
	}
	// Legacy ssh fields act as fallbacks when atlas fields are unset.
	sec := &secrets.NodeSecrets{SSHPassword: "ssh-pass", SSHPasswordHash: "$ssh$hash"}
	if got := effectiveAtlasPassword(sec); got != "ssh-pass" {
		t.Fatalf("effectiveAtlasPassword fallback = %q", got)
	}
	if got := effectiveAtlasPasswordHash(sec); got != "$ssh$hash" {
		t.Fatalf("effectiveAtlasPasswordHash fallback = %q", got)
	}
	// Explicit atlas fields win over the legacy fallbacks.
	sec.AtlasPassword = "atlas-pass"
	sec.AtlasPasswordHash = "$atlas$hash"
	if got := effectiveAtlasPassword(sec); got != "atlas-pass" {
		t.Fatalf("effectiveAtlasPassword explicit = %q", got)
	}
	if got := effectiveAtlasPasswordHash(sec); got != "$atlas$hash" {
		t.Fatalf("effectiveAtlasPasswordHash explicit = %q", got)
	}
	if got := firstNonEmptyString("", " value ", "ignored"); got != "value" {
		t.Fatalf("firstNonEmptyString = %q", got)
	}
	if !hasNodePasswords(&secrets.NodeSecrets{RootPasswordHash: "$root$hash"}) {
		t.Fatal("expected root password hash to count as password material")
	}
	if hasNodePasswords(&secrets.NodeSecrets{}) {
		t.Fatal("empty node secrets should not count as password material")
	}
	debug := redactedSecretsForImage(&secrets.NodeSecrets{Extra: map[string]string{"b": "2", "a": "1"}})
	if !reflect.DeepEqual(debug["extra_keys"], []string{"a", "b"}) {
		t.Fatalf("redactedSecretsForImage extra_keys = %#v", debug)
	}
}
|
||||
|
||||
// TestNodeSecretsFromEnvAndMergeNodeSecrets exercises env-sourced secrets
// (population, and collapse to nil when all variables are empty) and the
// field-wise merge semantics: override fields win, Extra maps union, and
// nil on either side returns the other operand unchanged.
func TestNodeSecretsFromEnvAndMergeNodeSecrets(t *testing.T) {
	t.Setenv("METIS_NODE_SSH_PASSWORD", "ssh-pass")
	t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "$ssh$hash")
	t.Setenv("METIS_NODE_ATLAS_PASSWORD", "atlas-pass")
	t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "$atlas$hash")
	t.Setenv("METIS_NODE_ROOT_PASSWORD", "root-pass")
	t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "$root$hash")
	envSecrets := nodeSecretsFromEnv()
	if envSecrets == nil || envSecrets.RootPassword != "root-pass" || envSecrets.AtlasPasswordHash != "$atlas$hash" {
		t.Fatalf("nodeSecretsFromEnv = %#v", envSecrets)
	}
	merged := mergeNodeSecrets(&secrets.NodeSecrets{
		SSHPassword: "base-ssh",
		K3sToken:    "base-token",
		CloudInit:   "base-cloud",
		Extra:       map[string]string{"base": "1"},
	}, &secrets.NodeSecrets{
		AtlasPassword: "override-atlas",
		RootPassword:  "override-root",
		K3sToken:      "override-token",
		CloudInit:     "override-cloud",
		Extra:         map[string]string{"override": "2"},
	})
	if merged.K3sToken != "override-token" || merged.CloudInit != "override-cloud" || merged.AtlasPassword != "override-atlas" || merged.RootPassword != "override-root" {
		t.Fatalf("mergeNodeSecrets = %#v", merged)
	}
	if merged.Extra["base"] != "1" || merged.Extra["override"] != "2" {
		t.Fatalf("mergeNodeSecrets extras = %#v", merged.Extra)
	}
	// nil on either side passes the other operand through untouched.
	if got := mergeNodeSecrets(nil, envSecrets); got.RootPasswordHash != "$root$hash" {
		t.Fatalf("mergeNodeSecrets nil base = %#v", got)
	}
	if got := mergeNodeSecrets(envSecrets, nil); got.SSHPassword != "ssh-pass" {
		t.Fatalf("mergeNodeSecrets nil override = %#v", got)
	}
	// All-empty env must collapse to nil, not an empty struct.
	t.Setenv("METIS_NODE_SSH_PASSWORD", "")
	t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "")
	t.Setenv("METIS_NODE_ATLAS_PASSWORD", "")
	t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "")
	t.Setenv("METIS_NODE_ROOT_PASSWORD", "")
	t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "")
	if got := nodeSecretsFromEnv(); got != nil {
		t.Fatalf("expected empty env secrets to collapse to nil, got %#v", got)
	}
}
|
||||
|
||||
// TestFirstbootEnvContentIncludesHashes asserts the exact set of
// shell-quoted variables firstbootEnvContent emits when only hashed
// passwords are present: identity fields plus the two *_HASH entries, and
// no plaintext password variables.
func TestFirstbootEnvContentIncludesHashes(t *testing.T) {
	cfg := &config.NodeConfig{
		Hostname: "titan-15",
		SSHUser:  "atlas",
		K3s:      config.K3sConfig{Version: "v1.31.5+k3s1"},
	}
	content := firstbootEnvContent(cfg, &secrets.NodeSecrets{
		AtlasPasswordHash: "$atlas$hash",
		RootPasswordHash:  "$root$hash",
	})
	if !reflect.DeepEqual(parseEnvLines(content), map[string]string{
		"METIS_HOSTNAME":            "'titan-15'",
		"METIS_SSH_USER":            "'atlas'",
		"METIS_ATLAS_USER":          "'atlas'",
		"METIS_K3S_VERSION":         "'v1.31.5+k3s1'",
		"METIS_ATLAS_PASSWORD_HASH": "'$atlas$hash'",
		"METIS_ROOT_PASSWORD_HASH":  "'$root$hash'",
	}) {
		t.Fatalf("firstbootEnvContent = %q", content)
	}
}
|
||||
|
||||
// parseEnvLines splits KEY=VALUE lines into a map for test assertions.
// Lines without "=" are skipped; values keep any surrounding quotes and
// everything after the first "=".
func parseEnvLines(raw string) map[string]string {
	result := map[string]string{}
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		key, value, found := strings.Cut(line, "=")
		if !found {
			continue
		}
		result[key] = value
	}
	return result
}
|
||||
@ -15,10 +15,15 @@ import (
|
||||
// NodeSecrets holds per-node secret material to inject at burn time.
|
||||
// These should live in Vault at secret/data/nodes/<hostname>.
|
||||
type NodeSecrets struct {
|
||||
SSHPassword string `json:"ssh_password,omitempty"`
|
||||
K3sToken string `json:"k3s_token,omitempty"`
|
||||
CloudInit string `json:"cloud_init,omitempty"`
|
||||
Extra map[string]string `json:"extra,omitempty"`
|
||||
SSHPassword string `json:"ssh_password,omitempty"`
|
||||
SSHPasswordHash string `json:"ssh_password_hash,omitempty"`
|
||||
AtlasPassword string `json:"atlas_password,omitempty"`
|
||||
AtlasPasswordHash string `json:"atlas_password_hash,omitempty"`
|
||||
RootPassword string `json:"root_password,omitempty"`
|
||||
RootPasswordHash string `json:"root_password_hash,omitempty"`
|
||||
K3sToken string `json:"k3s_token,omitempty"`
|
||||
CloudInit string `json:"cloud_init,omitempty"`
|
||||
Extra map[string]string `json:"extra,omitempty"`
|
||||
}
|
||||
|
||||
// Client fetches node secrets from Vault using either a token or AppRole.
|
||||
|
||||
@ -16,9 +16,11 @@ func TestFetchNodeReturnsData(t *testing.T) {
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"data": map[string]any{
|
||||
"data": map[string]any{
|
||||
"ssh_password": "p1",
|
||||
"k3s_token": "t1",
|
||||
"cloud_init": "ci",
|
||||
"ssh_password": "p1",
|
||||
"atlas_password_hash": "$atlas$hash",
|
||||
"root_password": "root-pw",
|
||||
"k3s_token": "t1",
|
||||
"cloud_init": "ci",
|
||||
},
|
||||
},
|
||||
})
|
||||
@ -33,7 +35,7 @@ func TestFetchNodeReturnsData(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("fetch: %v", err)
|
||||
}
|
||||
if sec.SSHPassword != "p1" || sec.K3sToken != "t1" || sec.CloudInit != "ci" {
|
||||
if sec.SSHPassword != "p1" || sec.AtlasPasswordHash != "$atlas$hash" || sec.RootPassword != "root-pw" || sec.K3sToken != "t1" || sec.CloudInit != "ci" {
|
||||
t.Fatalf("unexpected secrets: %+v", sec)
|
||||
}
|
||||
}
|
||||
|
||||
@ -117,16 +117,24 @@ type App struct {
|
||||
inventory *inventory.Inventory
|
||||
metrics *Metrics
|
||||
|
||||
mu sync.RWMutex
|
||||
jobs map[string]*Job
|
||||
snapshots map[string]SnapshotRecord
|
||||
targets map[string]facts.Targets
|
||||
artifactStore map[string]ArtifactSummary
|
||||
deviceStore map[string]deviceSnapshot
|
||||
mu sync.RWMutex
|
||||
jobs map[string]*Job
|
||||
snapshots map[string]SnapshotRecord
|
||||
targets map[string]facts.Targets
|
||||
artifactStore map[string]ArtifactSummary
|
||||
deviceStore map[string]deviceSnapshot
|
||||
desiredMetadata map[string]DesiredNodeMetadata
|
||||
}
|
||||
|
||||
// NewApp creates a Metis service app instance.
|
||||
func NewApp(settings Settings) (*App, error) {
|
||||
if strings.TrimSpace(settings.DesiredMetadataPath) == "" {
|
||||
baseDir := filepath.Dir(settings.SnapshotsPath)
|
||||
if strings.TrimSpace(baseDir) == "" || baseDir == "." {
|
||||
baseDir = filepath.Dir(settings.HistoryPath)
|
||||
}
|
||||
settings.DesiredMetadataPath = filepath.Join(baseDir, "desired-node-metadata.json")
|
||||
}
|
||||
if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -141,18 +149,20 @@ func NewApp(settings Settings) (*App, error) {
|
||||
return nil, err
|
||||
}
|
||||
app := &App{
|
||||
settings: settings,
|
||||
inventory: inv,
|
||||
metrics: NewMetrics(),
|
||||
jobs: map[string]*Job{},
|
||||
snapshots: map[string]SnapshotRecord{},
|
||||
targets: map[string]facts.Targets{},
|
||||
artifactStore: map[string]ArtifactSummary{},
|
||||
deviceStore: map[string]deviceSnapshot{},
|
||||
settings: settings,
|
||||
inventory: inv,
|
||||
metrics: NewMetrics(),
|
||||
jobs: map[string]*Job{},
|
||||
snapshots: map[string]SnapshotRecord{},
|
||||
targets: map[string]facts.Targets{},
|
||||
artifactStore: map[string]ArtifactSummary{},
|
||||
deviceStore: map[string]deviceSnapshot{},
|
||||
desiredMetadata: map[string]DesiredNodeMetadata{},
|
||||
}
|
||||
_ = app.loadSnapshots()
|
||||
_ = app.loadTargets()
|
||||
_ = app.loadArtifacts()
|
||||
_ = app.loadDesiredNodeMetadata()
|
||||
return app, nil
|
||||
}
|
||||
|
||||
@ -289,6 +299,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error {
|
||||
if err := a.syncScratchAnnotations(record); err != nil {
|
||||
a.appendEvent(annotationSyncEvent(record.Node, err))
|
||||
}
|
||||
if err := a.syncDesiredNodeMetadata(record); err != nil {
|
||||
a.appendEvent(desiredNodeMetadataSyncEvent(record.Node, err))
|
||||
}
|
||||
a.appendEvent(Event{
|
||||
Time: record.CollectedAt,
|
||||
Kind: "sentinel.snapshot",
|
||||
|
||||
@ -22,6 +22,9 @@ type clusterNode struct {
|
||||
Worker bool
|
||||
ControlPlane bool
|
||||
Unschedulable bool
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
Taints []string
|
||||
USBScratchStatus string
|
||||
USBScratchManagedPaths string
|
||||
}
|
||||
@ -179,6 +182,11 @@ func clusterNodes() []clusterNode {
|
||||
} `json:"metadata"`
|
||||
Spec struct {
|
||||
Unschedulable bool `json:"unschedulable"`
|
||||
Taints []struct {
|
||||
Key string `json:"key"`
|
||||
Value string `json:"value"`
|
||||
Effect string `json:"effect"`
|
||||
} `json:"taints"`
|
||||
} `json:"spec"`
|
||||
} `json:"items"`
|
||||
}
|
||||
@ -189,6 +197,28 @@ func clusterNodes() []clusterNode {
|
||||
for _, item := range payload.Items {
|
||||
labels := item.Metadata.Labels
|
||||
annotations := item.Metadata.Annotations
|
||||
if labels == nil {
|
||||
labels = map[string]string{}
|
||||
}
|
||||
if annotations == nil {
|
||||
annotations = map[string]string{}
|
||||
}
|
||||
taints := make([]string, 0, len(item.Spec.Taints))
|
||||
for _, taint := range item.Spec.Taints {
|
||||
key := strings.TrimSpace(taint.Key)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
raw := key
|
||||
if value := strings.TrimSpace(taint.Value); value != "" {
|
||||
raw += "=" + value
|
||||
}
|
||||
if effect := strings.TrimSpace(taint.Effect); effect != "" {
|
||||
raw += ":" + effect
|
||||
}
|
||||
taints = append(taints, raw)
|
||||
}
|
||||
sort.Strings(taints)
|
||||
nodes = append(nodes, clusterNode{
|
||||
Name: strings.TrimSpace(item.Metadata.Name),
|
||||
Arch: strings.TrimSpace(labels["kubernetes.io/arch"]),
|
||||
@ -196,6 +226,9 @@ func clusterNodes() []clusterNode {
|
||||
Worker: labels["node-role.kubernetes.io/worker"] == "true",
|
||||
ControlPlane: labels["node-role.kubernetes.io/control-plane"] != "" || labels["node-role.kubernetes.io/master"] != "",
|
||||
Unschedulable: item.Spec.Unschedulable,
|
||||
Labels: labels,
|
||||
Annotations: annotations,
|
||||
Taints: taints,
|
||||
USBScratchStatus: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-status"]),
|
||||
USBScratchManagedPaths: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-managed-paths"]),
|
||||
})
|
||||
|
||||
@ -37,6 +37,11 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) {
|
||||
t.Fatal("expected persistTargets to fail when parent is a file")
|
||||
}
|
||||
|
||||
app.settings.DesiredMetadataPath = filepath.Join(fileParent, "desired-node-metadata.json")
|
||||
if err := app.persistDesiredNodeMetadata(); err == nil {
|
||||
t.Fatal("expected persistDesiredNodeMetadata to fail when parent is a file")
|
||||
}
|
||||
|
||||
invalidArtifactState := filepath.Join(t.TempDir(), "artifacts.json")
|
||||
if err := os.WriteFile(invalidArtifactState, []byte("{bad-json"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
@ -45,6 +50,15 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) {
|
||||
if err := app.loadArtifacts(); err == nil {
|
||||
t.Fatal("expected loadArtifacts to reject invalid json")
|
||||
}
|
||||
|
||||
invalidDesiredState := filepath.Join(t.TempDir(), "desired-node-metadata.json")
|
||||
if err := os.WriteFile(invalidDesiredState, []byte("{bad-json"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
app.settings.DesiredMetadataPath = invalidDesiredState
|
||||
if err := app.loadDesiredNodeMetadata(); err == nil {
|
||||
t.Fatal("expected loadDesiredNodeMetadata to reject invalid json")
|
||||
}
|
||||
}
|
||||
|
||||
func TestServiceReplacementAndDeviceBranches(t *testing.T) {
|
||||
|
||||
@ -162,6 +162,7 @@ nodes:
|
||||
snapshotsPath := filepath.Join(dir, "snapshots.json")
|
||||
targetsPath := filepath.Join(dir, "targets.json")
|
||||
artifactStatePath := filepath.Join(dir, "artifacts.json")
|
||||
desiredMetadataPath := filepath.Join(dir, "desired-node-metadata.json")
|
||||
|
||||
seedSnapshots := map[string]SnapshotRecord{
|
||||
"titan-15": {
|
||||
@ -190,19 +191,33 @@ nodes:
|
||||
if err := os.WriteFile(artifactStatePath, data, 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
seedDesiredMetadata := map[string]DesiredNodeMetadata{
|
||||
"titan-15": {
|
||||
Node: "titan-15",
|
||||
Hostname: "titan-15",
|
||||
CapturedAt: testTime(t),
|
||||
Labels: map[string]string{"hardware": "rpi5"},
|
||||
Taints: []string{"dedicated=recovery:NoSchedule"},
|
||||
},
|
||||
}
|
||||
data, _ = json.MarshalIndent(seedDesiredMetadata, "", " ")
|
||||
if err := os.WriteFile(desiredMetadataPath, data, 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
app, err := NewApp(Settings{
|
||||
InventoryPath: invPath,
|
||||
CacheDir: filepath.Join(dir, "cache"),
|
||||
ArtifactDir: filepath.Join(dir, "artifacts"),
|
||||
ArtifactStatePath: artifactStatePath,
|
||||
HistoryPath: filepath.Join(dir, "history.jsonl"),
|
||||
SnapshotsPath: snapshotsPath,
|
||||
TargetsPath: targetsPath,
|
||||
DefaultFlashHost: "titan-22",
|
||||
FlashHosts: []string{"titan-22"},
|
||||
LocalHost: "titan-22",
|
||||
AllowedGroups: []string{"admin"},
|
||||
InventoryPath: invPath,
|
||||
CacheDir: filepath.Join(dir, "cache"),
|
||||
ArtifactDir: filepath.Join(dir, "artifacts"),
|
||||
ArtifactStatePath: artifactStatePath,
|
||||
HistoryPath: filepath.Join(dir, "history.jsonl"),
|
||||
SnapshotsPath: snapshotsPath,
|
||||
TargetsPath: targetsPath,
|
||||
DesiredMetadataPath: desiredMetadataPath,
|
||||
DefaultFlashHost: "titan-22",
|
||||
FlashHosts: []string{"titan-22"},
|
||||
LocalHost: "titan-22",
|
||||
AllowedGroups: []string{"admin"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("NewApp: %v", err)
|
||||
@ -211,6 +226,9 @@ nodes:
|
||||
if got := app.artifacts()["titan-15"].Ref; got != "reg/proj/titan-15:latest" {
|
||||
t.Fatalf("artifacts() = %q", got)
|
||||
}
|
||||
if desired, ok := app.desiredMetadataForNode("titan-15"); !ok || desired.Labels["hardware"] != "rpi5" {
|
||||
t.Fatalf("desiredMetadataForNode() = %#v ok=%v", desired, ok)
|
||||
}
|
||||
if err := app.recordArtifact(ArtifactSummary{Node: "titan-15", Ref: "reg/proj/titan-15:v2"}); err != nil {
|
||||
t.Fatalf("recordArtifact: %v", err)
|
||||
}
|
||||
|
||||
483
pkg/service/node_recovery.go
Normal file
483
pkg/service/node_recovery.go
Normal file
@ -0,0 +1,483 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"metis/pkg/config"
|
||||
)
|
||||
|
||||
// DesiredNodeMetadata captures the node identity Metis should preserve through
// recovery builds and re-assert after the node rejoins the cluster.
type DesiredNodeMetadata struct {
	// Node is the inventory/cluster node name this record belongs to.
	Node string `json:"node"`
	// Hostname is the desired hostname for the rebuilt node.
	Hostname string `json:"hostname,omitempty"`
	// CapturedAt records when this identity snapshot was staged (UTC).
	CapturedAt time.Time `json:"captured_at,omitempty"`
	// Labels holds restorable node labels; system-prefixed labels are
	// filtered out (see filteredRestorableLabels) before storage.
	Labels map[string]string `json:"labels,omitempty"`
	// Annotations holds restorable node annotations (see
	// filteredRestorableAnnotations for the exclusion list).
	Annotations map[string]string `json:"annotations,omitempty"`
	// Taints holds restorable taints in "key[=value][:effect]" form
	// (see splitTaint for the exact parsing rules).
	Taints []string `json:"taints,omitempty"`
	// Unschedulable mirrors the node's cordon state.
	Unschedulable bool `json:"unschedulable,omitempty"`
}
|
||||
|
||||
func (a *App) loadDesiredNodeMetadata() error {
|
||||
data, err := os.ReadFile(a.settings.DesiredMetadataPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var desired map[string]DesiredNodeMetadata
|
||||
if err := json.Unmarshal(data, &desired); err != nil {
|
||||
return err
|
||||
}
|
||||
a.mu.Lock()
|
||||
a.desiredMetadata = desired
|
||||
a.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (a *App) persistDesiredNodeMetadata() error {
|
||||
a.mu.RLock()
|
||||
data, err := json.MarshalIndent(a.desiredMetadata, "", " ")
|
||||
a.mu.RUnlock()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(a.settings.DesiredMetadataPath), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(a.settings.DesiredMetadataPath, data, 0o644)
|
||||
}
|
||||
|
||||
func (a *App) desiredMetadataForNode(node string) (DesiredNodeMetadata, bool) {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" {
|
||||
return DesiredNodeMetadata{}, false
|
||||
}
|
||||
a.mu.RLock()
|
||||
defer a.mu.RUnlock()
|
||||
desired, ok := a.desiredMetadata[node]
|
||||
if !ok {
|
||||
return DesiredNodeMetadata{}, false
|
||||
}
|
||||
return cloneDesiredNodeMetadata(desired), true
|
||||
}
|
||||
|
||||
// stageDesiredNodeMetadata captures the identity Metis should restore for a
// node after a recovery rebuild. It layers three sources, later ones winning:
// the inventory/config baseline, any previously staged record, and the live
// cluster view. The merged record is stored in memory and persisted to disk;
// a normalized copy is returned. Returns an error on an empty name, unknown
// node, config build failure, or persistence failure.
func (a *App) stageDesiredNodeMetadata(nodeName string) (DesiredNodeMetadata, error) {
	nodeName = strings.TrimSpace(nodeName)
	if nodeName == "" {
		return DesiredNodeMetadata{}, fmt.Errorf("node metadata requires a node name")
	}
	nodeSpec, _, err := a.inventory.FindNode(nodeName)
	if err != nil {
		return DesiredNodeMetadata{}, err
	}
	cfg, err := config.Build(a.inventory, nodeName)
	if err != nil {
		return DesiredNodeMetadata{}, err
	}
	// Baseline from inventory/config; only restorable labels/taints are kept.
	desired := DesiredNodeMetadata{
		Node:       nodeName,
		Hostname:   strings.TrimSpace(nodeSpec.Hostname),
		CapturedAt: time.Now().UTC(),
		Labels:     filteredRestorableLabels(cfg.Labels),
		Taints:     restorableTaints(cfg.Taints),
	}
	// A previously staged record overlays the baseline (e.g. annotations
	// that only exist in the stored record survive a re-stage).
	if existing, ok := a.desiredMetadataForNode(nodeName); ok {
		desired = mergeDesiredNodeMetadata(desired, existing)
	}
	// The live cluster, when reachable, is the most authoritative overlay.
	if live, ok := liveClusterNode(nodeName); ok {
		desired = mergeDesiredNodeMetadata(desired, desiredMetadataFromCluster(*live))
	}
	desired.Labels = normalizeStringMap(desired.Labels)
	desired.Annotations = normalizeStringMap(desired.Annotations)
	desired.Taints = normalizeTaints(desired.Taints)
	a.mu.Lock()
	if a.desiredMetadata == nil {
		a.desiredMetadata = map[string]DesiredNodeMetadata{}
	}
	a.desiredMetadata[nodeName] = desired
	a.mu.Unlock()
	// Persist outside the lock; persistDesiredNodeMetadata takes RLock itself.
	if err := a.persistDesiredNodeMetadata(); err != nil {
		return DesiredNodeMetadata{}, err
	}
	return cloneDesiredNodeMetadata(desired), nil
}
|
||||
|
||||
func (a *App) syncDesiredNodeMetadata(record SnapshotRecord) error {
|
||||
desired, ok := a.desiredMetadataForNode(record.Node)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
live, ok := liveClusterNode(record.Node)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return patchDesiredNodeMetadata(*live, desired)
|
||||
}
|
||||
|
||||
func desiredMetadataFromCluster(node clusterNode) DesiredNodeMetadata {
|
||||
return DesiredNodeMetadata{
|
||||
Node: strings.TrimSpace(node.Name),
|
||||
Labels: filteredRestorableLabels(node.Labels),
|
||||
Annotations: filteredRestorableAnnotations(node.Annotations),
|
||||
Taints: restorableTaints(node.Taints),
|
||||
Unschedulable: node.Unschedulable,
|
||||
}
|
||||
}
|
||||
|
||||
func mergeDesiredNodeMetadata(base, overlay DesiredNodeMetadata) DesiredNodeMetadata {
|
||||
merged := cloneDesiredNodeMetadata(base)
|
||||
if hostname := strings.TrimSpace(overlay.Hostname); hostname != "" {
|
||||
merged.Hostname = hostname
|
||||
}
|
||||
if !overlay.CapturedAt.IsZero() {
|
||||
merged.CapturedAt = overlay.CapturedAt
|
||||
}
|
||||
if merged.Labels == nil {
|
||||
merged.Labels = map[string]string{}
|
||||
}
|
||||
for key, value := range overlay.Labels {
|
||||
if key = strings.TrimSpace(key); key == "" {
|
||||
continue
|
||||
}
|
||||
merged.Labels[key] = strings.TrimSpace(value)
|
||||
}
|
||||
if merged.Annotations == nil {
|
||||
merged.Annotations = map[string]string{}
|
||||
}
|
||||
for key, value := range overlay.Annotations {
|
||||
if key = strings.TrimSpace(key); key == "" {
|
||||
continue
|
||||
}
|
||||
merged.Annotations[key] = strings.TrimSpace(value)
|
||||
}
|
||||
if len(overlay.Taints) > 0 {
|
||||
merged.Taints = normalizeTaints(overlay.Taints)
|
||||
}
|
||||
merged.Unschedulable = overlay.Unschedulable
|
||||
return merged
|
||||
}
|
||||
|
||||
// patchDesiredNodeMetadata computes a minimal merge patch that brings the
// live node's restorable metadata in line with the desired record and applies
// it via the kube API. When nothing has drifted, no API call is made and nil
// is returned.
func patchDesiredNodeMetadata(live clusterNode, desired DesiredNodeMetadata) error {
	// Prefer the node name stored in the desired record; fall back to the
	// live object's name. With neither, there is nothing addressable to patch.
	node := strings.TrimSpace(desired.Node)
	if node == "" {
		node = strings.TrimSpace(live.Name)
	}
	if node == "" {
		return nil
	}
	// Only restorable keys participate in the diff; system-managed keys on
	// the live node are left untouched.
	labelPatch := metadataStringPatch(live.Labels, desired.Labels, isRestorableLabel)
	annotationPatch := metadataStringPatch(live.Annotations, desired.Annotations, isRestorableAnnotation)
	// Taints: live system taints are preserved, restorable ones replaced.
	mergedTaints := mergeLiveAndDesiredTaints(live.Taints, desired.Taints)
	body := map[string]any{}
	metadata := map[string]any{}
	if len(labelPatch) > 0 {
		metadata["labels"] = labelPatch
	}
	if len(annotationPatch) > 0 {
		metadata["annotations"] = annotationPatch
	}
	if len(metadata) > 0 {
		body["metadata"] = metadata
	}
	spec := map[string]any{}
	if live.Unschedulable != desired.Unschedulable {
		spec["unschedulable"] = desired.Unschedulable
	}
	if !sameTaints(live.Taints, mergedTaints) {
		spec["taints"] = taintPatchPayload(mergedTaints)
	}
	if len(spec) > 0 {
		body["spec"] = spec
	}
	// Empty body means the node already matches the desired record.
	if len(body) == 0 {
		return nil
	}
	kube, err := kubeClientFactory()
	if err != nil {
		return err
	}
	return kube.mergePatch("/api/v1/nodes/"+node, body)
}
|
||||
|
||||
// metadataStringPatch diffs a desired string map against the live one and
// returns a merge-patch fragment: changed/new allowed keys map to their
// desired value, allowed live keys absent from desired map to nil (delete).
// Keys rejected by allow are ignored entirely; all keys/values are compared
// after whitespace trimming.
func metadataStringPatch(live, desired map[string]string, allow func(string) bool) map[string]any {
	patch := map[string]any{}
	// Pass 1: set or update every allowed desired key whose live value differs.
	for rawKey, rawValue := range desired {
		key := strings.TrimSpace(rawKey)
		if key == "" || !allow(key) {
			continue
		}
		want := strings.TrimSpace(rawValue)
		if strings.TrimSpace(live[key]) != want {
			patch[key] = want
		}
	}
	// Pass 2: null out allowed live keys that are no longer desired.
	for rawKey := range live {
		key := strings.TrimSpace(rawKey)
		if key == "" || !allow(key) {
			continue
		}
		if _, keep := desired[key]; !keep {
			patch[key] = nil
		}
	}
	return patch
}
|
||||
|
||||
func liveClusterNode(node string) (*clusterNode, bool) {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" {
|
||||
return nil, false
|
||||
}
|
||||
for _, live := range clusterNodes() {
|
||||
if strings.TrimSpace(live.Name) == node {
|
||||
copyNode := live
|
||||
return ©Node, true
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
func filteredRestorableLabels(values map[string]string) map[string]string {
|
||||
filtered := map[string]string{}
|
||||
for key, value := range values {
|
||||
key = strings.TrimSpace(key)
|
||||
if key == "" || !isRestorableLabel(key) {
|
||||
continue
|
||||
}
|
||||
filtered[key] = strings.TrimSpace(value)
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
func filteredRestorableAnnotations(values map[string]string) map[string]string {
|
||||
filtered := map[string]string{}
|
||||
for key, value := range values {
|
||||
key = strings.TrimSpace(key)
|
||||
if key == "" || !isRestorableAnnotation(key) {
|
||||
continue
|
||||
}
|
||||
filtered[key] = strings.TrimSpace(value)
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
// normalizeStringMap trims every key and value and drops blank keys.
// An empty input or an input that normalizes to nothing yields nil, so
// empty maps never serialize into the persisted JSON.
func normalizeStringMap(values map[string]string) map[string]string {
	if len(values) == 0 {
		return nil
	}
	out := make(map[string]string, len(values))
	for rawKey, rawValue := range values {
		key := strings.TrimSpace(rawKey)
		if key == "" {
			continue
		}
		out[key] = strings.TrimSpace(rawValue)
	}
	if len(out) == 0 {
		return nil
	}
	return out
}
|
||||
|
||||
func restorableTaints(values []string) []string {
|
||||
filtered := make([]string, 0, len(values))
|
||||
for _, value := range values {
|
||||
value = normalizeTaint(value)
|
||||
if value == "" || !isRestorableTaint(value) {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, value)
|
||||
}
|
||||
return normalizeTaints(filtered)
|
||||
}
|
||||
|
||||
func normalizeTaints(values []string) []string {
|
||||
if len(values) == 0 {
|
||||
return nil
|
||||
}
|
||||
seen := map[string]struct{}{}
|
||||
out := make([]string, 0, len(values))
|
||||
for _, value := range values {
|
||||
value = normalizeTaint(value)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[value]; ok {
|
||||
continue
|
||||
}
|
||||
seen[value] = struct{}{}
|
||||
out = append(out, value)
|
||||
}
|
||||
sort.Strings(out)
|
||||
if len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// normalizeTaint canonicalizes one raw taint string. Today this is only
// whitespace trimming; it is kept as a named helper so the normalization
// policy lives in a single place.
func normalizeTaint(value string) string {
	trimmed := strings.TrimSpace(value)
	return trimmed
}
|
||||
|
||||
func sameTaints(left, right []string) bool {
|
||||
left = normalizeTaints(left)
|
||||
right = normalizeTaints(right)
|
||||
if len(left) != len(right) {
|
||||
return false
|
||||
}
|
||||
for idx := range left {
|
||||
if left[idx] != right[idx] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func mergeLiveAndDesiredTaints(live, desired []string) []string {
|
||||
merged := make([]string, 0, len(live)+len(desired))
|
||||
for _, taint := range live {
|
||||
taint = normalizeTaint(taint)
|
||||
if taint == "" || isRestorableTaint(taint) {
|
||||
continue
|
||||
}
|
||||
merged = append(merged, taint)
|
||||
}
|
||||
merged = append(merged, restorableTaints(desired)...)
|
||||
return normalizeTaints(merged)
|
||||
}
|
||||
|
||||
func taintPatchPayload(values []string) []map[string]string {
|
||||
payload := make([]map[string]string, 0, len(values))
|
||||
for _, value := range normalizeTaints(values) {
|
||||
key, taintValue, effect := splitTaint(value)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
entry := map[string]string{"key": key}
|
||||
if taintValue != "" {
|
||||
entry["value"] = taintValue
|
||||
}
|
||||
if effect != "" {
|
||||
entry["effect"] = effect
|
||||
}
|
||||
payload = append(payload, entry)
|
||||
}
|
||||
return payload
|
||||
}
|
||||
|
||||
// splitTaint decomposes "key[=value][:effect]" into (key, value, effect).
// The effect is whatever follows the LAST colon (kubectl taint syntax); the
// value is whatever follows the FIRST '=' within the remaining body. All
// parts are whitespace-trimmed; an empty input yields three empty strings.
func splitTaint(raw string) (string, string, string) {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return "", "", ""
	}
	body := trimmed
	effect := ""
	if colon := strings.LastIndex(trimmed, ":"); colon >= 0 {
		body = strings.TrimSpace(trimmed[:colon])
		effect = strings.TrimSpace(trimmed[colon+1:])
	}
	key, value, hasValue := strings.Cut(body, "=")
	if !hasValue {
		return strings.TrimSpace(body), "", effect
	}
	return strings.TrimSpace(key), strings.TrimSpace(value), effect
}
|
||||
|
||||
func isRestorableTaint(raw string) bool {
|
||||
key, _, _ := splitTaint(raw)
|
||||
if key == "" {
|
||||
return false
|
||||
}
|
||||
for _, prefix := range []string{
|
||||
"node.kubernetes.io/",
|
||||
"node.cloudprovider.kubernetes.io/",
|
||||
"ToBeDeletedByClusterAutoscaler",
|
||||
} {
|
||||
if strings.HasPrefix(key, prefix) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// isRestorableLabel reports whether a node label is user-managed and thus
// safe for Metis to re-assert after a rebuild. Role labels
// (node-role.kubernetes.io/*) are explicitly restorable even though other
// kubernetes.io-family prefixes are system-owned and excluded.
func isRestorableLabel(key string) bool {
	name := strings.TrimSpace(key)
	if name == "" {
		return false
	}
	// Role labels are assigned by operators and must survive a rebuild.
	if strings.HasPrefix(name, "node-role.kubernetes.io/") {
		return true
	}
	systemPrefixes := [...]string{
		"kubernetes.io/",
		"beta.kubernetes.io/",
		"node.kubernetes.io/",
		"topology.kubernetes.io/",
		"feature.node.kubernetes.io/",
		"failure-domain.beta.kubernetes.io/",
		"nvidia.com/",
		"k3s.io/",
		"rke2.io/",
		"volumes.kubernetes.io/",
		"node.cloudprovider.kubernetes.io/",
	}
	for _, prefix := range systemPrefixes {
		if strings.HasPrefix(name, prefix) {
			return false
		}
	}
	return true
}
|
||||
|
||||
// isRestorableAnnotation reports whether a node annotation is user-managed
// and thus safe for Metis to re-assert after a rebuild. Annotations owned by
// kubelet, kubeadm, CSI, CNI plugins (flannel/calico), and the k3s/rke2
// distributions are excluded.
func isRestorableAnnotation(key string) bool {
	name := strings.TrimSpace(key)
	if name == "" {
		return false
	}
	systemPrefixes := [...]string{
		"kubectl.kubernetes.io/",
		"kubeadm.alpha.kubernetes.io/",
		"kubernetes.io/",
		"node.alpha.kubernetes.io/",
		"node.kubernetes.io/",
		"volumes.kubernetes.io/",
		"csi.volume.kubernetes.io/",
		"csi.storage.k8s.io/",
		"flannel.alpha.coreos.com/",
		"projectcalico.org/",
		"rke2.io/",
		"k3s.io/",
		"nvidia.com/",
	}
	for _, prefix := range systemPrefixes {
		if strings.HasPrefix(name, prefix) {
			return false
		}
	}
	return true
}
|
||||
|
||||
func cloneDesiredNodeMetadata(value DesiredNodeMetadata) DesiredNodeMetadata {
|
||||
clone := value
|
||||
clone.Labels = normalizeStringMap(value.Labels)
|
||||
clone.Annotations = normalizeStringMap(value.Annotations)
|
||||
clone.Taints = normalizeTaints(value.Taints)
|
||||
return clone
|
||||
}
|
||||
|
||||
func desiredNodeMetadataSyncEvent(node string, err error) Event {
|
||||
return Event{
|
||||
Time: time.Now().UTC(),
|
||||
Kind: "sentinel.node-metadata",
|
||||
Summary: fmt.Sprintf("Could not restore desired node metadata for %s", node),
|
||||
Details: map[string]any{
|
||||
"node": node,
|
||||
"error": err.Error(),
|
||||
},
|
||||
}
|
||||
}
|
||||
254
pkg/service/node_recovery_test.go
Normal file
254
pkg/service/node_recovery_test.go
Normal file
@ -0,0 +1,254 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"metis/pkg/sentinel"
|
||||
)
|
||||
|
||||
// TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster verifies that
// stageDesiredNodeMetadata layers three sources (inventory config, the
// previously staged record, and the live cluster view), filters out
// system-managed labels/annotations/taints, and persists the merged record.
func TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster(t *testing.T) {
	// Fake kube API serving one live node with a mix of user-managed and
	// system-managed metadata.
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": []any{
					map[string]any{
						"metadata": map[string]any{
							"name": "titan-15",
							"labels": map[string]string{
								"hardware":                       "rpi5",
								"rack":                           "a1",
								"maintenance.bstein.dev/color":   "blue",
								"kubernetes.io/arch":             "arm64",
								"node-role.kubernetes.io/worker": "true",
							},
							"annotations": map[string]string{
								"maintenance.bstein.dev/owner": "atlas",
								"volumes.kubernetes.io/controller-managed-attach-detach": "true",
							},
						},
						"spec": map[string]any{
							"unschedulable": true,
							"taints": []any{
								map[string]any{"key": "dedicated", "value": "recovery", "effect": "NoSchedule"},
								map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"},
							},
						},
					},
				},
			})
		default:
			http.NotFound(w, r)
		}
	}))
	defer kube.Close()
	installKubeFactory(t, kube)

	app := newTestApp(t)
	// Inventory deliberately disagrees with the live cluster ("rpi4" vs
	// "rpi5") so the assertions can show which source wins the merge.
	app.inventory.Nodes[0].Labels = map[string]string{"hardware": "rpi4", "rack": "a1"}
	app.inventory.Nodes[0].Taints = []string{"flash=true:NoSchedule"}
	// A pre-existing staged record contributes an annotation that exists
	// nowhere else; it must survive re-staging.
	app.desiredMetadata["titan-15"] = DesiredNodeMetadata{
		Node:        "titan-15",
		Annotations: map[string]string{"maintenance.bstein.dev/legacy": "keep"},
	}

	desired, err := app.stageDesiredNodeMetadata("titan-15")
	if err != nil {
		t.Fatalf("stageDesiredNodeMetadata: %v", err)
	}
	if desired.Hostname != "titan-15" || !desired.Unschedulable {
		t.Fatalf("unexpected desired metadata header: %#v", desired)
	}
	// Live cluster labels override inventory values.
	if desired.Labels["hardware"] != "rpi5" || desired.Labels["rack"] != "a1" || desired.Labels["maintenance.bstein.dev/color"] != "blue" {
		t.Fatalf("unexpected desired labels: %#v", desired.Labels)
	}
	if _, ok := desired.Labels["kubernetes.io/arch"]; ok {
		t.Fatalf("system labels should not be persisted: %#v", desired.Labels)
	}
	if desired.Annotations["maintenance.bstein.dev/owner"] != "atlas" || desired.Annotations["maintenance.bstein.dev/legacy"] != "keep" {
		t.Fatalf("unexpected desired annotations: %#v", desired.Annotations)
	}
	if _, ok := desired.Annotations["volumes.kubernetes.io/controller-managed-attach-detach"]; ok {
		t.Fatalf("controller annotations should not be persisted: %#v", desired.Annotations)
	}
	// Only the user taint survives; the unreachable system taint is filtered.
	if !reflect.DeepEqual(desired.Taints, []string{"dedicated=recovery:NoSchedule"}) {
		t.Fatalf("unexpected desired taints: %#v", desired.Taints)
	}
	// Staging must also persist the merged record to disk.
	data, err := os.ReadFile(app.settings.DesiredMetadataPath)
	if err != nil {
		t.Fatalf("read desired metadata file: %v", err)
	}
	if !strings.Contains(string(data), "titan-15") {
		t.Fatalf("desired metadata file missing titan-15: %s", string(data))
	}
}
|
||||
|
||||
// TestStoreSnapshotRestoresDesiredNodeMetadata verifies the restore path:
// when a sentinel snapshot arrives for a node with a staged desired record,
// StoreSnapshot issues a node patch that restores user labels/annotations/
// taints and cordon state while preserving live system taints.
func TestStoreSnapshotRestoresDesiredNodeMetadata(t *testing.T) {
	var patchBody map[string]any
	// Fake kube API: serves a drifted live node and captures the PATCH body.
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": []any{
					map[string]any{
						"metadata": map[string]any{
							"name": "titan-15",
							"labels": map[string]string{
								"hardware":                   "rpi4",
								"maintenance.bstein.dev/old": "1",
							},
							"annotations": map[string]string{
								"maintenance.bstein.dev/mode": "old",
							},
						},
						"spec": map[string]any{
							"unschedulable": true,
							"taints": []any{
								map[string]any{"key": "dedicated", "value": "old", "effect": "NoSchedule"},
								map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"},
							},
						},
					},
				},
			})
		case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/nodes/titan-15":
			if err := json.NewDecoder(r.Body).Decode(&patchBody); err != nil {
				t.Fatalf("decode patch: %v", err)
			}
			_ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"})
		default:
			http.NotFound(w, r)
		}
	}))
	defer kube.Close()
	installKubeFactory(t, kube)

	app := newTestApp(t)
	// Desired record differs from the live node in every dimension.
	app.desiredMetadata["titan-15"] = DesiredNodeMetadata{
		Node:          "titan-15",
		Hostname:      "titan-15",
		Labels:        map[string]string{"hardware": "rpi5"},
		Annotations:   map[string]string{"maintenance.bstein.dev/mode": "recovery"},
		Taints:        []string{"dedicated=recovery:NoSchedule"},
		Unschedulable: false,
	}

	if err := app.StoreSnapshot(SnapshotRecord{
		Node:        "titan-15",
		CollectedAt: time.Date(2026, 4, 24, 6, 0, 0, 0, time.UTC),
		Snapshot:    sentinel.Snapshot{Hostname: "titan-15"},
	}); err != nil {
		t.Fatalf("StoreSnapshot: %v", err)
	}
	if patchBody == nil {
		t.Fatal("expected desired metadata patch")
	}
	metadata := patchBody["metadata"].(map[string]any)
	// Stale restorable label is nulled out; desired value is set.
	labels := metadata["labels"].(map[string]any)
	if labels["hardware"] != "rpi5" || labels["maintenance.bstein.dev/old"] != nil {
		t.Fatalf("unexpected label patch: %#v", labels)
	}
	annotations := metadata["annotations"].(map[string]any)
	if annotations["maintenance.bstein.dev/mode"] != "recovery" {
		t.Fatalf("unexpected annotation patch: %#v", annotations)
	}
	spec := patchBody["spec"].(map[string]any)
	// Node should be uncordoned to match the desired record.
	if spec["unschedulable"] != false {
		t.Fatalf("unexpected spec patch: %#v", spec)
	}
	taints := spec["taints"].([]any)
	if len(taints) != 2 {
		t.Fatalf("unexpected taint payload: %#v", taints)
	}
	entries := map[string]map[string]any{}
	for _, raw := range taints {
		entry := raw.(map[string]any)
		key := entry["key"].(string)
		entries[key] = entry
	}
	// Restorable taint replaced wholesale with the desired value...
	if entries["dedicated"]["value"] != "recovery" || entries["dedicated"]["effect"] != "NoSchedule" {
		t.Fatalf("missing desired taint replacement: %#v", entries)
	}
	// ...while the live system taint is carried over untouched.
	if entries["node.kubernetes.io/unreachable"]["effect"] != "NoExecute" {
		t.Fatalf("system taint should be preserved: %#v", entries)
	}
}
|
||||
|
||||
func TestDesiredNodeMetadataHelpers(t *testing.T) {
|
||||
app := newTestApp(t)
|
||||
if _, ok := app.desiredMetadataForNode("missing"); ok {
|
||||
t.Fatal("expected no desired metadata for missing node")
|
||||
}
|
||||
if err := app.syncDesiredNodeMetadata(SnapshotRecord{Node: "missing"}); err != nil {
|
||||
t.Fatalf("syncDesiredNodeMetadata missing should noop: %v", err)
|
||||
}
|
||||
if _, ok := liveClusterNode(""); ok {
|
||||
t.Fatal("empty liveClusterNode lookup should fail")
|
||||
}
|
||||
if !isRestorableLabel("maintenance.bstein.dev/role") || isRestorableLabel("kubernetes.io/arch") {
|
||||
t.Fatal("unexpected label restoration filter")
|
||||
}
|
||||
if !isRestorableAnnotation("maintenance.bstein.dev/state") || isRestorableAnnotation("volumes.kubernetes.io/foo") {
|
||||
t.Fatal("unexpected annotation restoration filter")
|
||||
}
|
||||
if !isRestorableTaint("dedicated=recovery:NoSchedule") || isRestorableTaint("node.kubernetes.io/not-ready:NoExecute") {
|
||||
t.Fatal("unexpected taint restoration filter")
|
||||
}
|
||||
key, value, effect := splitTaint("dedicated=recovery:NoSchedule")
|
||||
if key != "dedicated" || value != "recovery" || effect != "NoSchedule" {
|
||||
t.Fatalf("splitTaint mismatch: %q %q %q", key, value, effect)
|
||||
}
|
||||
if key, value, effect := splitTaint("just-a-key"); key != "just-a-key" || value != "" || effect != "" {
|
||||
t.Fatalf("splitTaint key-only mismatch: %q %q %q", key, value, effect)
|
||||
}
|
||||
labels := filteredRestorableLabels(map[string]string{"hardware": "rpi5", "kubernetes.io/arch": "arm64"})
|
||||
if !reflect.DeepEqual(labels, map[string]string{"hardware": "rpi5"}) {
|
||||
t.Fatalf("filteredRestorableLabels = %#v", labels)
|
||||
}
|
||||
annotations := filteredRestorableAnnotations(map[string]string{"maintenance.bstein.dev/state": "ok", "volumes.kubernetes.io/foo": "bar"})
|
||||
if !reflect.DeepEqual(annotations, map[string]string{"maintenance.bstein.dev/state": "ok"}) {
|
||||
t.Fatalf("filteredRestorableAnnotations = %#v", annotations)
|
||||
}
|
||||
patch := metadataStringPatch(
|
||||
map[string]string{"hardware": "rpi4", "maintenance.bstein.dev/old": "1"},
|
||||
map[string]string{"hardware": "rpi5"},
|
||||
isRestorableLabel,
|
||||
)
|
||||
if patch["hardware"] != "rpi5" || patch["maintenance.bstein.dev/old"] != nil {
|
||||
t.Fatalf("metadataStringPatch = %#v", patch)
|
||||
}
|
||||
mergedTaints := mergeLiveAndDesiredTaints(
|
||||
[]string{"node.kubernetes.io/unreachable:NoExecute", "dedicated=old:NoSchedule"},
|
||||
[]string{"dedicated=new:NoSchedule", "dedicated=new:NoSchedule"},
|
||||
)
|
||||
if !reflect.DeepEqual(mergedTaints, []string{"dedicated=new:NoSchedule", "node.kubernetes.io/unreachable:NoExecute"}) {
|
||||
t.Fatalf("mergeLiveAndDesiredTaints = %#v", mergedTaints)
|
||||
}
|
||||
payload := taintPatchPayload([]string{"dedicated=new:NoSchedule"})
|
||||
if len(payload) != 1 || payload[0]["key"] != "dedicated" || payload[0]["value"] != "new" || payload[0]["effect"] != "NoSchedule" {
|
||||
t.Fatalf("taintPatchPayload = %#v", payload)
|
||||
}
|
||||
original := DesiredNodeMetadata{Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}}
|
||||
cloned := cloneDesiredNodeMetadata(original)
|
||||
cloned.Labels["hardware"] = "mutated"
|
||||
cloned.Taints[0] = "changed"
|
||||
if original.Labels["hardware"] != "rpi5" || original.Taints[0] != "dedicated=new:NoSchedule" {
|
||||
t.Fatalf("cloneDesiredNodeMetadata should deep-copy slices/maps: %#v %#v", original, cloned)
|
||||
}
|
||||
if err := patchDesiredNodeMetadata(
|
||||
clusterNode{Name: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}},
|
||||
DesiredNodeMetadata{Node: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}},
|
||||
); err != nil {
|
||||
t.Fatalf("patchDesiredNodeMetadata should noop when already in sync: %v", err)
|
||||
}
|
||||
if event := desiredNodeMetadataSyncEvent("titan-15", os.ErrPermission); event.Kind != "sentinel.node-metadata" || event.Details["node"] != "titan-15" {
|
||||
t.Fatalf("desiredNodeMetadataSyncEvent = %#v", event)
|
||||
}
|
||||
}
|
||||
@ -74,12 +74,17 @@ func (a *App) RefreshDevices(host string) ([]Device, error) {
|
||||
}
|
||||
|
||||
func (a *App) runBuild(job *Job, flash bool) {
|
||||
_, class, err := a.inventory.FindNode(job.Node)
|
||||
nodeSpec, class, err := a.inventory.FindNode(job.Node)
|
||||
if err != nil {
|
||||
a.failJob(job.ID, err)
|
||||
a.metrics.RecordBuild(job.Node, "error")
|
||||
return
|
||||
}
|
||||
if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil {
|
||||
a.failJob(job.ID, err)
|
||||
a.metrics.RecordBuild(job.Node, "error")
|
||||
return
|
||||
}
|
||||
if err := a.ensureHarborProject(); err != nil {
|
||||
a.failJob(job.ID, err)
|
||||
a.metrics.RecordBuild(job.Node, "error")
|
||||
@ -112,7 +117,8 @@ func (a *App) runBuild(job *Job, flash bool) {
|
||||
return
|
||||
}
|
||||
buildPod := fmt.Sprintf("metis-build-%d", time.Now().UTC().UnixNano())
|
||||
logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, artifactRef, buildTag))
|
||||
job.Builder = builder.Name
|
||||
logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, strings.TrimSpace(nodeSpec.Hostname), artifactRef, buildTag))
|
||||
if err != nil {
|
||||
a.failJob(job.ID, err)
|
||||
a.metrics.RecordBuild(job.Node, "error")
|
||||
@ -183,6 +189,9 @@ func (a *App) runFlash(job *Job) {
|
||||
}
|
||||
|
||||
func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) {
|
||||
if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil {
|
||||
return RemoteFlashResult{}, err
|
||||
}
|
||||
a.setJob(job.ID, func(j *Job) {
|
||||
j.Status = JobRunning
|
||||
j.Stage = "preflight"
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"path/filepath"
|
||||
@ -246,8 +247,9 @@ func (a *App) remoteDevicePodSpec(name, host, image string) map[string]any {
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag string) map[string]any {
|
||||
func (a *App) remoteBuildPodSpec(name, host, image, node, nodeHostname, artifactRef, buildTag string) map[string]any {
|
||||
workspaceHostPath := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name)
|
||||
desiredEnv := remoteDesiredMetadataEnv(a, node)
|
||||
return map[string]any{
|
||||
"apiVersion": "v1",
|
||||
"kind": "Pod",
|
||||
@ -255,7 +257,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag
|
||||
"name": name,
|
||||
"namespace": a.settings.Namespace,
|
||||
"labels": map[string]string{"app": "metis-remote", "metis-run": "build"},
|
||||
"annotations": vaultRuntimeAnnotations(true),
|
||||
"annotations": vaultRuntimeAnnotations(true, nodeHostname),
|
||||
},
|
||||
"spec": map[string]any{
|
||||
"restartPolicy": "Never",
|
||||
@ -283,6 +285,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag
|
||||
),
|
||||
},
|
||||
"securityContext": map[string]any{"runAsUser": 0, "runAsGroup": 0},
|
||||
"env": desiredEnv,
|
||||
"envFrom": []map[string]any{
|
||||
{"configMapRef": map[string]any{"name": "metis"}},
|
||||
},
|
||||
@ -309,7 +312,7 @@ func (a *App) remoteFlashPodSpec(name, host, image, node, device, artifactRef st
|
||||
"name": name,
|
||||
"namespace": a.settings.Namespace,
|
||||
"labels": map[string]string{"app": "metis-remote", "metis-run": "flash"},
|
||||
"annotations": vaultRuntimeAnnotations(false),
|
||||
"annotations": vaultRuntimeAnnotations(false, ""),
|
||||
},
|
||||
"spec": map[string]any{
|
||||
"restartPolicy": "Never",
|
||||
@ -378,7 +381,46 @@ func mountedHostTmpDir(path string) string {
|
||||
return "/host-tmp"
|
||||
}
|
||||
|
||||
func vaultRuntimeAnnotations(includeSSHKeys bool) map[string]string {
|
||||
func remoteDesiredMetadataEnv(a *App, node string) []map[string]any {
|
||||
desired, ok := a.desiredMetadataForNode(node)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
labelsJSON, _ := jsonMarshalStringMap(desired.Labels)
|
||||
taintsJSON, _ := jsonMarshalStringSlice(desired.Taints)
|
||||
env := []map[string]any{}
|
||||
if labelsJSON != "" {
|
||||
env = append(env, map[string]any{"name": "METIS_NODE_LABELS_JSON", "value": labelsJSON})
|
||||
}
|
||||
if taintsJSON != "" {
|
||||
env = append(env, map[string]any{"name": "METIS_NODE_TAINTS_JSON", "value": taintsJSON})
|
||||
}
|
||||
return env
|
||||
}
|
||||
|
||||
func jsonMarshalStringMap(values map[string]string) (string, error) {
|
||||
if len(values) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
data, err := json.Marshal(values)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
func jsonMarshalStringSlice(values []string) (string, error) {
|
||||
if len(values) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
data, err := json.Marshal(values)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
func vaultRuntimeAnnotations(includeSSHKeys bool, nodeHostname string) map[string]string {
|
||||
annotations := map[string]string{
|
||||
"vault.hashicorp.com/agent-inject": "true",
|
||||
"vault.hashicorp.com/agent-pre-populate-only": "true",
|
||||
@ -399,6 +441,19 @@ export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
|
||||
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
|
||||
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
|
||||
export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
|
||||
{{ end }}`
|
||||
}
|
||||
nodeHostname = strings.TrimSpace(nodeHostname)
|
||||
if nodeHostname != "" {
|
||||
secretPath := fmt.Sprintf("secret/data/nodes/%s", nodeHostname)
|
||||
annotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] = secretPath
|
||||
annotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"] = `{{ with secret "` + secretPath + `" }}
|
||||
export METIS_NODE_SSH_PASSWORD="{{ .Data.data.ssh_password }}"
|
||||
export METIS_NODE_SSH_PASSWORD_HASH="{{ .Data.data.ssh_password_hash }}"
|
||||
export METIS_NODE_ATLAS_PASSWORD="{{ .Data.data.atlas_password }}"
|
||||
export METIS_NODE_ATLAS_PASSWORD_HASH="{{ .Data.data.atlas_password_hash }}"
|
||||
export METIS_NODE_ROOT_PASSWORD="{{ .Data.data.root_password }}"
|
||||
export METIS_NODE_ROOT_PASSWORD_HASH="{{ .Data.data.root_password_hash }}"
|
||||
{{ end }}`
|
||||
}
|
||||
return annotations
|
||||
@ -413,6 +468,7 @@ func remoteWorkerEntrypoint(includeSSHKeys bool, args ...string) string {
|
||||
if includeSSHKeys {
|
||||
lines = append(lines, ". /vault/secrets/metis-ssh-env.sh")
|
||||
}
|
||||
lines = append(lines, "if [ -f /vault/secrets/metis-node-secrets-env.sh ]; then . /vault/secrets/metis-node-secrets-env.sh; fi")
|
||||
lines = append(lines, "exec "+shellJoin(append([]string{"metis"}, args...)...))
|
||||
return strings.Join(lines, "\n")
|
||||
}
|
||||
|
||||
@ -251,8 +251,13 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) {
|
||||
app := newTestApp(t)
|
||||
app.settings.RemoteWorkspaceDir = "/var/tmp/metis-workspace"
|
||||
app.settings.HostTmpDir = "/var/tmp/metis-flash-test"
|
||||
app.desiredMetadata["titan-10"] = DesiredNodeMetadata{
|
||||
Node: "titan-10",
|
||||
Labels: map[string]string{"hardware": "rpi5"},
|
||||
Taints: []string{"dedicated=recovery:NoSchedule"},
|
||||
}
|
||||
|
||||
buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "registry.example/metis/titan-10", "build-1")
|
||||
buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "titan-10", "registry.example/metis/titan-10", "build-1")
|
||||
buildBody := buildSpec["spec"].(map[string]any)
|
||||
buildVolumes := buildBody["volumes"].([]map[string]any)
|
||||
workspaceVolume := buildVolumes[0]["hostPath"].(map[string]any)
|
||||
@ -260,6 +265,17 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) {
|
||||
t.Fatalf("build workspace hostPath = %v", got)
|
||||
}
|
||||
buildContainer := buildBody["containers"].([]map[string]any)[0]
|
||||
buildEnv := buildContainer["env"].([]map[string]any)
|
||||
if len(buildEnv) != 2 {
|
||||
t.Fatalf("expected desired metadata env, got %#v", buildEnv)
|
||||
}
|
||||
metadataAnnotations := buildSpec["metadata"].(map[string]any)["annotations"].(map[string]string)
|
||||
if metadataAnnotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] != "secret/data/nodes/titan-10" {
|
||||
t.Fatalf("unexpected node secret annotation: %#v", metadataAnnotations)
|
||||
}
|
||||
if !strings.Contains(metadataAnnotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"], "METIS_NODE_ROOT_PASSWORD") {
|
||||
t.Fatalf("expected node password exports in vault template: %#v", metadataAnnotations)
|
||||
}
|
||||
buildSecurity := buildContainer["securityContext"].(map[string]any)
|
||||
if got := buildSecurity["runAsUser"]; got != 0 {
|
||||
t.Fatalf("build runAsUser = %v", got)
|
||||
|
||||
@ -15,7 +15,7 @@ func TestMountedHostTmpDirMapsConfiguredTmpPathIntoMount(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) {
|
||||
withKeys := vaultRuntimeAnnotations(true)
|
||||
withKeys := vaultRuntimeAnnotations(true, "titan-15")
|
||||
template := withKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]
|
||||
if !strings.Contains(template, "METIS_SSH_KEY_HECATE_TETHYS") {
|
||||
t.Fatalf("expected tethys hecate key export in vault template: %q", template)
|
||||
@ -24,7 +24,7 @@ func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) {
|
||||
t.Fatalf("expected db hecate key export in vault template: %q", template)
|
||||
}
|
||||
|
||||
withoutKeys := vaultRuntimeAnnotations(false)
|
||||
withoutKeys := vaultRuntimeAnnotations(false, "")
|
||||
if _, ok := withoutKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]; ok {
|
||||
t.Fatalf("did not expect ssh key template when includeSSHKeys=false")
|
||||
}
|
||||
|
||||
@ -11,30 +11,31 @@ var hostNameLookup = os.Hostname
|
||||
|
||||
// Settings configures the Metis service runtime.
|
||||
type Settings struct {
|
||||
BindAddr string
|
||||
InventoryPath string
|
||||
CacheDir string
|
||||
ArtifactDir string
|
||||
ArtifactStatePath string
|
||||
HistoryPath string
|
||||
SnapshotsPath string
|
||||
TargetsPath string
|
||||
DefaultFlashHost string
|
||||
FlashHosts []string
|
||||
LocalHost string
|
||||
AllowedGroups []string
|
||||
MaxDeviceBytes int64
|
||||
Namespace string
|
||||
RunnerImageAMD64 string
|
||||
RunnerImageARM64 string
|
||||
HarborRegistry string
|
||||
HarborProject string
|
||||
HarborAPIBase string
|
||||
HarborUsername string
|
||||
HarborPassword string
|
||||
HostTmpDir string
|
||||
RemoteWorkspaceDir string
|
||||
RemotePodTimeout int64
|
||||
BindAddr string
|
||||
InventoryPath string
|
||||
CacheDir string
|
||||
ArtifactDir string
|
||||
ArtifactStatePath string
|
||||
HistoryPath string
|
||||
SnapshotsPath string
|
||||
TargetsPath string
|
||||
DesiredMetadataPath string
|
||||
DefaultFlashHost string
|
||||
FlashHosts []string
|
||||
LocalHost string
|
||||
AllowedGroups []string
|
||||
MaxDeviceBytes int64
|
||||
Namespace string
|
||||
RunnerImageAMD64 string
|
||||
RunnerImageARM64 string
|
||||
HarborRegistry string
|
||||
HarborProject string
|
||||
HarborAPIBase string
|
||||
HarborUsername string
|
||||
HarborPassword string
|
||||
HostTmpDir string
|
||||
RemoteWorkspaceDir string
|
||||
RemotePodTimeout int64
|
||||
}
|
||||
|
||||
// FromEnv builds service settings with sensible defaults for local dev and in-cluster use.
|
||||
@ -44,30 +45,31 @@ func FromEnv() Settings {
|
||||
defaultFlashHost := getenvDefault("METIS_DEFAULT_FLASH_HOST", localHost)
|
||||
flashHosts := splitList(getenvDefault("METIS_FLASH_HOSTS", defaultFlashHost))
|
||||
return Settings{
|
||||
BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"),
|
||||
InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"),
|
||||
CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")),
|
||||
ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")),
|
||||
ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")),
|
||||
HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")),
|
||||
SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")),
|
||||
TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")),
|
||||
DefaultFlashHost: defaultFlashHost,
|
||||
FlashHosts: flashHosts,
|
||||
LocalHost: localHost,
|
||||
AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintenance")),
|
||||
MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000),
|
||||
Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"),
|
||||
RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""),
|
||||
RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""),
|
||||
HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"),
|
||||
HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"),
|
||||
HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"),
|
||||
HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""),
|
||||
HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""),
|
||||
HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/var/tmp/metis-flash-test"),
|
||||
RemoteWorkspaceDir: getenvDefault("METIS_REMOTE_WORKSPACE_DIR", "/var/tmp/metis-workspace"),
|
||||
RemotePodTimeout: getenvInt64("METIS_REMOTE_POD_TIMEOUT_SEC", 1800),
|
||||
BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"),
|
||||
InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"),
|
||||
CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")),
|
||||
ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")),
|
||||
ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")),
|
||||
HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")),
|
||||
SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")),
|
||||
TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")),
|
||||
DesiredMetadataPath: getenvDefault("METIS_DESIRED_METADATA_PATH", filepath.Join(dataDir, "desired-node-metadata.json")),
|
||||
DefaultFlashHost: defaultFlashHost,
|
||||
FlashHosts: flashHosts,
|
||||
LocalHost: localHost,
|
||||
AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintenance")),
|
||||
MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000),
|
||||
Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"),
|
||||
RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""),
|
||||
RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""),
|
||||
HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"),
|
||||
HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"),
|
||||
HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"),
|
||||
HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""),
|
||||
HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""),
|
||||
HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/var/tmp/metis-flash-test"),
|
||||
RemoteWorkspaceDir: getenvDefault("METIS_REMOTE_WORKSPACE_DIR", "/var/tmp/metis-workspace"),
|
||||
RemotePodTimeout: getenvInt64("METIS_REMOTE_POD_TIMEOUT_SEC", 1800),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user