diff --git a/overlays/rpi4-armbian-longhorn-root/usr/local/sbin/metis-rpi4-longhorn-firstboot.sh b/overlays/rpi4-armbian-longhorn-root/usr/local/sbin/metis-rpi4-longhorn-firstboot.sh index c1bd093..e4fb2b3 100755 --- a/overlays/rpi4-armbian-longhorn-root/usr/local/sbin/metis-rpi4-longhorn-firstboot.sh +++ b/overlays/rpi4-armbian-longhorn-root/usr/local/sbin/metis-rpi4-longhorn-firstboot.sh @@ -154,6 +154,10 @@ if [ -s "${sudoers_file}" ]; then fi fi +if [ -x /usr/local/sbin/metis-apply-node-identity.sh ]; then + /usr/local/sbin/metis-apply-node-identity.sh || true +fi + rm -f /root/.not_logged_in_yet if ! command -v k3s >/dev/null 2>&1; then diff --git a/pkg/plan/inject.go b/pkg/plan/inject.go index c5728e4..0d4f63c 100644 --- a/pkg/plan/inject.go +++ b/pkg/plan/inject.go @@ -2,8 +2,6 @@ package plan import ( "bytes" - "context" - "encoding/json" "fmt" "os" "path/filepath" @@ -64,6 +62,7 @@ func Files(inv *inventory.Inventory, nodeName string) ([]inject.FileSpec, error) cfg.Secrets = sec.Extra } } + applyNodeMetadataEnv(cfg) files, err := buildFiles(cfg, sec) if err != nil { return nil, err @@ -111,7 +110,9 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File {Path: "etc/hostname", Content: []byte(cfg.Hostname + "\n"), Mode: 0o644, RootFS: true}, {Path: "etc/hosts", Content: []byte(hostsContent(cfg.Hostname)), Mode: 0o644, RootFS: true}, {Path: "etc/rancher/k3s/config.yaml", Content: []byte(k3sConfigContent(cfg)), Mode: 0o644, RootFS: true}, - {Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg)), Mode: 0o600, RootFS: true}, + {Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg, sec)), Mode: 0o600, RootFS: true}, + {Path: "usr/local/sbin/metis-apply-node-identity.sh", Content: []byte(nodeIdentityScriptContent()), Mode: 0o755, RootFS: true}, + {Path: "etc/cloud/cloud.cfg.d/90-metis-recovery.cfg", Content: []byte(cloudInitRootFSContent(sec)), Mode: 0o644, RootFS: true}, } if cfg.IP != "" { files = append(files, inject.FileSpec{ @@ -148,6 +149,14 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File RootFS: true, }) } + if passwordAuth := sshPasswordConfigContent(sec); passwordAuth != "" { + files = append(files, inject.FileSpec{ + Path: "etc/ssh/sshd_config.d/90-metis-password-auth.conf", + Content: []byte(passwordAuth), + Mode: 0o644, + RootFS: true, + }) + } if cfg.SSHUser == "atlas" { sudoers := hecateSudoersContent(cfg.SSHUser) files = append(files, inject.FileSpec{ @@ -172,8 +181,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File }) } - // Store the raw config for debugging/ops. - raw, err := json.MarshalIndent(cfg, "", " ") + raw, err := jsonMarshalIndent(cfg) if err != nil { return nil, err } @@ -184,7 +192,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File RootFS: true, }) if sec != nil { - secRaw, err := json.MarshalIndent(sec, "", " ") + secRaw, err := jsonMarshalIndent(redactedSecretsForImage(sec)) if err != nil { return nil, err } @@ -196,7 +204,6 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File }) } - // Optional cloud-init for images that honor NoCloud. userData := cloudInitUserData(cfg, sec) if userData != "" { files = append(files, inject.FileSpec{ @@ -267,33 +274,6 @@ func allowK3sNodeLabel(role, key string) bool { return !strings.HasPrefix(key, "node-role.kubernetes.io/") } -func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string { - if cfg == nil { - return "" - } - if sec != nil && sec.CloudInit != "" { - return sec.CloudInit - } - var b bytes.Buffer - b.WriteString("#cloud-config\n") - b.WriteString(fmt.Sprintf("hostname: %s\n", cfg.Hostname)) - if len(cfg.SSHKeys) > 0 { - b.WriteString("ssh_authorized_keys:\n") - for _, k := range cfg.SSHKeys { - b.WriteString(fmt.Sprintf(" - %s\n", k)) - } - } - return b.String() -} - -func firstbootEnvContent(cfg *config.NodeConfig) string { - var b bytes.Buffer - b.WriteString(fmt.Sprintf("METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname))) - b.WriteString(fmt.Sprintf("METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser))) - b.WriteString(fmt.Sprintf("METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version))) - return b.String() -} - func networkManagerConnectionContent(id, iface, ip string) string { gateway := ip if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 { @@ -347,7 +327,6 @@ func fstabAppendContent(cfg *config.NodeConfig) string { source := entry.Source switch { case source != "": - // Use the explicit source path for bind mounts. case entry.UUID != "": source = "UUID=" + entry.UUID case entry.Label != "": @@ -374,25 +353,6 @@ func hecateSudoersContent(user string) string { ) } -func shellQuote(value string) string { - if value == "" { - return "''" - } - return "'" + strings.ReplaceAll(value, "'", `'"'"'`) + "'" -} - -func fetchSecrets(hostname string) *secrets.NodeSecrets { - if os.Getenv("VAULT_ADDR") == "" { - return nil - } - cli := secrets.NewFromEnv() - sec, err := cli.FetchNode(context.Background(), hostname) - if err != nil { - return nil - } - return sec -} - func collectOverlays(class *inventory.NodeClass) ([]inject.FileSpec, error) { var files []inject.FileSpec if class == nil { diff --git a/pkg/plan/inject_test.go b/pkg/plan/inject_test.go index e42ba88..20b2f0d 100644 --- a/pkg/plan/inject_test.go +++ b/pkg/plan/inject_test.go @@ -174,3 +174,80 @@ func TestBuildFilesAddsHecateSudoersForAtlas(t *testing.T) { t.Fatalf("metis sudoers backup missing/incorrect: %s", backup) } } + +func TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets(t *testing.T) { + cfg := &config.NodeConfig{ + Hostname: "titan-15", + IP: "192.168.22.43", + SSHUser: "atlas", + SSHKeys: []string{"ssh-ed25519 AAA test"}, + K3s: config.K3sConfig{ + Role: "agent", + Version: "v1.31.5+k3s1", + }, + } + sec := &secrets.NodeSecrets{ + SSHPassword: "atlas-pass", + RootPassword: "root-pass", + K3sToken: "super-secret-token", + Extra: map[string]string{"api_key": "secret"}, + } + files, err := buildFiles(cfg, sec) + if err != nil { + t.Fatalf("buildFiles: %v", err) + } + pathMap := map[string]string{} + for _, file := range files { + pathMap[file.Path] = string(file.Content) + } + firstboot := pathMap["etc/metis/firstboot.env"] + if !strings.Contains(firstboot, "METIS_ATLAS_PASSWORD='atlas-pass'") || !strings.Contains(firstboot, "METIS_ROOT_PASSWORD='root-pass'") { + t.Fatalf("firstboot env missing password material: %s", firstboot) + } + if sshd := pathMap["etc/ssh/sshd_config.d/90-metis-password-auth.conf"]; !strings.Contains(sshd, "PasswordAuthentication yes") || !strings.Contains(sshd, "PermitRootLogin yes") { + t.Fatalf("password auth config missing: %s", sshd) + } + if script := pathMap["usr/local/sbin/metis-apply-node-identity.sh"]; !strings.Contains(script, "apply_password root") || !strings.Contains(script, "METIS_ATLAS_PASSWORD") { + t.Fatalf("node identity script missing password application: %s", script) + } + if cloudCfg := pathMap["etc/cloud/cloud.cfg.d/90-metis-recovery.cfg"]; !strings.Contains(cloudCfg, "ssh_pwauth: true") { + t.Fatalf("cloud recovery config missing ssh_pwauth: %s", cloudCfg) + } + if userData := pathMap["user-data"]; !strings.Contains(userData, "ssh_pwauth: true") || !strings.Contains(userData, "metis-apply-node-identity.sh") { + t.Fatalf("cloud-init user-data missing recovery hooks: %s", userData) + } + secretsJSON := pathMap["etc/metis/secrets.json"] + if strings.Contains(secretsJSON, "atlas-pass") || strings.Contains(secretsJSON, "root-pass") || strings.Contains(secretsJSON, "super-secret-token") { + t.Fatalf("secrets.json should be redacted: %s", secretsJSON) + } + if !strings.Contains(secretsJSON, `"has_ssh_password": true`) || !strings.Contains(secretsJSON, `"extra_keys": [`) { + t.Fatalf("secrets.json should keep redacted debug metadata: %s", secretsJSON) + } +} + +func TestApplyNodeMetadataEnv(t *testing.T) { + cfg := &config.NodeConfig{ + Labels: map[string]string{"hardware": "rpi4"}, + Taints: []string{"flash=true:NoSchedule"}, + K3s: config.K3sConfig{ + Labels: map[string]string{"hardware": "rpi4"}, + Taints: []string{"flash=true:NoSchedule"}, + }, + } + t.Setenv("METIS_NODE_LABELS_JSON", `{"hardware":"rpi5","maintenance.bstein.dev/role":"recovery"}`) + t.Setenv("METIS_NODE_TAINTS_JSON", `["dedicated=recovery:NoSchedule","flash=true:NoSchedule"]`) + applyNodeMetadataEnv(cfg) + if cfg.Labels["hardware"] != "rpi5" || cfg.Labels["maintenance.bstein.dev/role"] != "recovery" { + t.Fatalf("applyNodeMetadataEnv labels = %#v", cfg.Labels) + } + if !strings.Contains(strings.Join(cfg.Taints, ","), "dedicated=recovery:NoSchedule") { + t.Fatalf("applyNodeMetadataEnv taints = %#v", cfg.Taints) + } + cfg = &config.NodeConfig{} + t.Setenv("METIS_NODE_LABELS_JSON", `{bad-json`) + t.Setenv("METIS_NODE_TAINTS_JSON", `{bad-json`) + applyNodeMetadataEnv(cfg) + if cfg.Labels != nil || cfg.Taints != nil { + t.Fatalf("invalid env JSON should be ignored: %#v", cfg) + } +} diff --git a/pkg/plan/node_identity.go b/pkg/plan/node_identity.go new file mode 100644 index 0000000..7813fff --- /dev/null +++ b/pkg/plan/node_identity.go @@ -0,0 +1,262 @@ +package plan + +import ( + "bytes" + "fmt" + "sort" + "strings" + + "metis/pkg/config" + "metis/pkg/secrets" +) + +func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string { + if cfg == nil { + return "" + } + if sec != nil && sec.CloudInit != "" { + return sec.CloudInit + } + var b bytes.Buffer + b.WriteString("#cloud-config\n") + b.WriteString(fmt.Sprintf("hostname: %s\n", cfg.Hostname)) + if len(cfg.SSHKeys) > 0 { + b.WriteString("ssh_authorized_keys:\n") + for _, k := range cfg.SSHKeys { + b.WriteString(fmt.Sprintf(" - %s\n", k)) + } + } + if hasNodePasswords(sec) { + b.WriteString("ssh_pwauth: true\n") + b.WriteString("disable_root: false\n") + } + b.WriteString("runcmd:\n") + b.WriteString(" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n") + return b.String() +} + +func firstbootEnvContent(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string { + var b bytes.Buffer + b.WriteString(fmt.Sprintf("METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname))) + b.WriteString(fmt.Sprintf("METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser))) + b.WriteString("METIS_ATLAS_USER='atlas'\n") + b.WriteString(fmt.Sprintf("METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version))) + if sec != nil { + if value := effectiveAtlasPassword(sec); value != "" { + b.WriteString(fmt.Sprintf("METIS_ATLAS_PASSWORD=%s\n", shellQuote(value))) + } + if value := effectiveAtlasPasswordHash(sec); value != "" { + b.WriteString(fmt.Sprintf("METIS_ATLAS_PASSWORD_HASH=%s\n", shellQuote(value))) + } + if value := strings.TrimSpace(sec.RootPassword); value != "" { + b.WriteString(fmt.Sprintf("METIS_ROOT_PASSWORD=%s\n", shellQuote(value))) + } + if value := strings.TrimSpace(sec.RootPasswordHash); value != "" { + b.WriteString(fmt.Sprintf("METIS_ROOT_PASSWORD_HASH=%s\n", shellQuote(value))) + } + } + return b.String() +} + +func cloudInitRootFSContent(sec *secrets.NodeSecrets) string { + var b bytes.Buffer + b.WriteString("#cloud-config\n") + if hasNodePasswords(sec) { + b.WriteString("ssh_pwauth: true\n") + b.WriteString("disable_root: false\n") + } + b.WriteString("runcmd:\n") + b.WriteString(" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n") + return b.String() +} + +func nodeIdentityScriptContent() string { + return `#!/usr/bin/env bash +set -euo pipefail + +marker="/var/lib/metis/node-identity-applied.done" +env_file="/etc/metis/firstboot.env" +key_file="/etc/metis/authorized_keys" +sudoers_file="/etc/metis/sudoers-hecate" +default_groups=(adm sudo tty disk dialout audio video plugdev games users systemd-journal input render netdev) + +if [ -f "${marker}" ]; then + exit 0 +fi + +mkdir -p /var/lib/metis +if [ -f "${env_file}" ]; then + # shellcheck disable=SC1090 + . "${env_file}" +fi + +atlas_user="${METIS_ATLAS_USER:-atlas}" +ssh_user="${METIS_SSH_USER:-${atlas_user}}" +atlas_password="${METIS_ATLAS_PASSWORD:-}" +atlas_password_hash="${METIS_ATLAS_PASSWORD_HASH:-}" +root_password="${METIS_ROOT_PASSWORD:-}" +root_password_hash="${METIS_ROOT_PASSWORD_HASH:-}" + +group_list=() +for group_name in "${default_groups[@]}"; do + if getent group "${group_name}" >/dev/null 2>&1; then + group_list+=("${group_name}") + fi +done +if [ "${#group_list[@]}" -gt 0 ]; then + group_csv="$(IFS=,; printf '%s' "${group_list[*]}")" +else + group_csv="" +fi + +ensure_user() { + local user_name="$1" + [ -n "${user_name}" ] || return 0 + if ! id "${user_name}" >/dev/null 2>&1; then + if [ -n "${group_csv}" ]; then + useradd -m -s /bin/bash -G "${group_csv}" "${user_name}" + else + useradd -m -s /bin/bash "${user_name}" + fi + elif [ -n "${group_csv}" ]; then + usermod -a -G "${group_csv}" "${user_name}" || true + fi +} + +apply_password() { + local user_name="$1" + local plain_password="$2" + local hash_password="$3" + if ! id "${user_name}" >/dev/null 2>&1; then + return 0 + fi + if [ -n "${hash_password}" ]; then + usermod -p "${hash_password}" "${user_name}" + passwd -u "${user_name}" >/dev/null 2>&1 || true + return 0 + fi + if [ -n "${plain_password}" ]; then + printf '%s:%s\n' "${user_name}" "${plain_password}" | chpasswd + passwd -u "${user_name}" >/dev/null 2>&1 || true + fi +} + +install_keys() { + local user_name="$1" + [ -n "${user_name}" ] || return 0 + [ -s "${key_file}" ] || return 0 + local home_dir + home_dir="$(getent passwd "${user_name}" | cut -d: -f6)" + if [ -z "${home_dir}" ]; then + if [ "${user_name}" = "root" ]; then + home_dir="/root" + else + home_dir="/home/${user_name}" + fi + fi + install -d -m 700 "${home_dir}/.ssh" + install -m 600 "${key_file}" "${home_dir}/.ssh/authorized_keys" + chown -R "${user_name}:${user_name}" "${home_dir}/.ssh" 2>/dev/null || true +} + +ensure_user "${atlas_user}" +if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then + ensure_user "${ssh_user}" +fi + +apply_password root "${root_password}" "${root_password_hash}" +apply_password "${atlas_user}" "${atlas_password}" "${atlas_password_hash}" +if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then + apply_password "${ssh_user}" "${atlas_password}" "${atlas_password_hash}" +fi + +if [ -s "${key_file}" ]; then + install_keys root + install_keys "${atlas_user}" + if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then + install_keys "${ssh_user}" + fi +fi + +if [ -s "${sudoers_file}" ]; then + install -d -m 755 /etc/sudoers.d + install -m 440 "${sudoers_file}" /etc/sudoers.d/90-hecate-atlas + if command -v visudo >/dev/null 2>&1; then + visudo -cf /etc/sudoers.d/90-hecate-atlas >/dev/null 2>&1 || rm -f /etc/sudoers.d/90-hecate-atlas + fi +fi + +systemctl restart ssh.service >/dev/null 2>&1 || systemctl restart sshd.service >/dev/null 2>&1 || systemctl restart ssh.socket >/dev/null 2>&1 || true +touch "${marker}" +` +} + +func sshPasswordConfigContent(sec *secrets.NodeSecrets) string { + if !hasNodePasswords(sec) { + return "" + } + return "PasswordAuthentication yes\nKbdInteractiveAuthentication no\nChallengeResponseAuthentication no\nPermitRootLogin yes\nUsePAM yes\n" +} + +func hasNodePasswords(sec *secrets.NodeSecrets) bool { + if sec == nil { + return false + } + return effectiveAtlasPassword(sec) != "" || effectiveAtlasPasswordHash(sec) != "" || firstNonEmptyString(sec.RootPassword, sec.RootPasswordHash) != "" +} + +func effectiveAtlasPassword(sec *secrets.NodeSecrets) string { + if sec == nil { + return "" + } + return firstNonEmptyString(sec.AtlasPassword, sec.SSHPassword) +} + +func effectiveAtlasPasswordHash(sec *secrets.NodeSecrets) string { + if sec == nil { + return "" + } + return firstNonEmptyString(sec.AtlasPasswordHash, sec.SSHPasswordHash) +} + +func firstNonEmptyString(values ...string) string { + for _, value := range values { + if trimmed := strings.TrimSpace(value); trimmed != "" { + return trimmed + } + } + return "" +} + +func redactedSecretsForImage(sec *secrets.NodeSecrets) map[string]any { + if sec == nil { + return nil + } + debug := map[string]any{ + "has_ssh_password": firstNonEmptyString(sec.SSHPassword, sec.SSHPasswordHash) != "", + "has_atlas_password": firstNonEmptyString(sec.AtlasPassword, sec.AtlasPasswordHash) != "", + "has_root_password": firstNonEmptyString(sec.RootPassword, sec.RootPasswordHash) != "", + "has_k3s_token": strings.TrimSpace(sec.K3sToken) != "", + "has_cloud_init_override": strings.TrimSpace(sec.CloudInit) != "", + } + if len(sec.Extra) > 0 { + keys := make([]string, 0, len(sec.Extra)) + for key := range sec.Extra { + key = strings.TrimSpace(key) + if key == "" { + continue + } + keys = append(keys, key) + } + sort.Strings(keys) + debug["extra_keys"] = keys + } + return debug +} + +func shellQuote(value string) string { + if value == "" { + return "''" + } + return "'" + strings.ReplaceAll(value, "'", `'"'"'`) + "'" +} diff --git a/pkg/plan/node_metadata.go b/pkg/plan/node_metadata.go new file mode 100644 index 0000000..29bd172 --- /dev/null +++ b/pkg/plan/node_metadata.go @@ -0,0 +1,133 @@ +package plan + +import ( + "context" + "encoding/json" + "os" + "sort" + "strings" + + "metis/pkg/config" + "metis/pkg/secrets" +) + +func fetchSecrets(hostname string) *secrets.NodeSecrets { + envSecrets := nodeSecretsFromEnv() + if os.Getenv("VAULT_ADDR") == "" { + return envSecrets + } + cli := secrets.NewFromEnv() + sec, err := cli.FetchNode(context.Background(), hostname) + if err != nil { + return envSecrets + } + return mergeNodeSecrets(sec, envSecrets) +} + +func nodeSecretsFromEnv() *secrets.NodeSecrets { + sec := &secrets.NodeSecrets{ + SSHPassword: strings.TrimSpace(os.Getenv("METIS_NODE_SSH_PASSWORD")), + SSHPasswordHash: strings.TrimSpace(os.Getenv("METIS_NODE_SSH_PASSWORD_HASH")), + AtlasPassword: strings.TrimSpace(os.Getenv("METIS_NODE_ATLAS_PASSWORD")), + AtlasPasswordHash: strings.TrimSpace(os.Getenv("METIS_NODE_ATLAS_PASSWORD_HASH")), + RootPassword: strings.TrimSpace(os.Getenv("METIS_NODE_ROOT_PASSWORD")), + RootPasswordHash: strings.TrimSpace(os.Getenv("METIS_NODE_ROOT_PASSWORD_HASH")), + } + if sec.SSHPassword == "" && sec.SSHPasswordHash == "" && sec.AtlasPassword == "" && sec.AtlasPasswordHash == "" && sec.RootPassword == "" && sec.RootPasswordHash == "" { + return nil + } + return sec +} + +func mergeNodeSecrets(base, override *secrets.NodeSecrets) *secrets.NodeSecrets { + if base == nil { + return override + } + if override == nil { + return base + } + merged := *base + merged.SSHPassword = firstNonEmptyString(override.SSHPassword, base.SSHPassword) + merged.SSHPasswordHash = firstNonEmptyString(override.SSHPasswordHash, base.SSHPasswordHash) + merged.AtlasPassword = firstNonEmptyString(override.AtlasPassword, base.AtlasPassword) + merged.AtlasPasswordHash = firstNonEmptyString(override.AtlasPasswordHash, base.AtlasPasswordHash) + merged.RootPassword = firstNonEmptyString(override.RootPassword, base.RootPassword) + merged.RootPasswordHash = firstNonEmptyString(override.RootPasswordHash, base.RootPasswordHash) + merged.K3sToken = firstNonEmptyString(override.K3sToken, base.K3sToken) + merged.CloudInit = firstNonEmptyString(override.CloudInit, base.CloudInit) + if len(base.Extra) > 0 || len(override.Extra) > 0 { + merged.Extra = map[string]string{} + for key, value := range base.Extra { + merged.Extra[key] = value + } + for key, value := range override.Extra { + merged.Extra[key] = value + } + } + return &merged +} + +func applyNodeMetadataEnv(cfg *config.NodeConfig) { + if cfg == nil { + return + } + if labels := parseEnvJSONMap(os.Getenv("METIS_NODE_LABELS_JSON")); len(labels) > 0 { + if cfg.Labels == nil { + cfg.Labels = map[string]string{} + } + for key, value := range labels { + cfg.Labels[key] = value + } + cfg.K3s.Labels = cfg.Labels + } + if taints := parseEnvJSONList(os.Getenv("METIS_NODE_TAINTS_JSON")); len(taints) > 0 { + cfg.Taints = uniqueStrings(append(cfg.Taints, taints...)) + cfg.K3s.Taints = append([]string{}, cfg.Taints...) + } +} + +func parseEnvJSONMap(raw string) map[string]string { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + var values map[string]string + if err := json.Unmarshal([]byte(raw), &values); err != nil { + return nil + } + return values +} + +func parseEnvJSONList(raw string) []string { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + var values []string + if err := json.Unmarshal([]byte(raw), &values); err != nil { + return nil + } + return values +} + +func uniqueStrings(values []string) []string { + seen := map[string]struct{}{} + out := make([]string, 0, len(values)) + for _, value := range values { + value = strings.TrimSpace(value) + if value == "" { + continue + } + if _, ok := seen[value]; ok { + continue + } + seen[value] = struct{}{} + out = append(out, value) + } + sort.Strings(out) + return out +} + +func jsonMarshalIndent(value any) ([]byte, error) { + return json.MarshalIndent(value, "", " ") +} diff --git a/pkg/plan/node_secrets_test.go b/pkg/plan/node_secrets_test.go new file mode 100644 index 0000000..9772c3e --- /dev/null +++ b/pkg/plan/node_secrets_test.go @@ -0,0 +1,127 @@ +package plan + +import ( + "reflect" + "strings" + "testing" + + "metis/pkg/config" + "metis/pkg/secrets" +) + +func TestNodeSecretHelpers(t *testing.T) { + if got := effectiveAtlasPassword(nil); got != "" { + t.Fatalf("effectiveAtlasPassword(nil) = %q", got) + } + if got := effectiveAtlasPasswordHash(nil); got != "" { + t.Fatalf("effectiveAtlasPasswordHash(nil) = %q", got) + } + sec := &secrets.NodeSecrets{SSHPassword: "ssh-pass", SSHPasswordHash: "$ssh$hash"} + if got := effectiveAtlasPassword(sec); got != "ssh-pass" { + t.Fatalf("effectiveAtlasPassword fallback = %q", got) + } + if got := effectiveAtlasPasswordHash(sec); got != "$ssh$hash" { + t.Fatalf("effectiveAtlasPasswordHash fallback = %q", got) + } + sec.AtlasPassword = "atlas-pass" + sec.AtlasPasswordHash = "$atlas$hash" + if got := effectiveAtlasPassword(sec); got != "atlas-pass" { + t.Fatalf("effectiveAtlasPassword explicit = %q", got) + } + if got := effectiveAtlasPasswordHash(sec); got != "$atlas$hash" { + t.Fatalf("effectiveAtlasPasswordHash explicit = %q", got) + } + if got := firstNonEmptyString("", " value ", "ignored"); got != "value" { + t.Fatalf("firstNonEmptyString = %q", got) + } + if !hasNodePasswords(&secrets.NodeSecrets{RootPasswordHash: "$root$hash"}) { + t.Fatal("expected root password hash to count as password material") + } + if hasNodePasswords(&secrets.NodeSecrets{}) { + t.Fatal("empty node secrets should not count as password material") + } + debug := redactedSecretsForImage(&secrets.NodeSecrets{Extra: map[string]string{"b": "2", "a": "1"}}) + if !reflect.DeepEqual(debug["extra_keys"], []string{"a", "b"}) { + t.Fatalf("redactedSecretsForImage extra_keys = %#v", debug) + } +} + +func TestNodeSecretsFromEnvAndMergeNodeSecrets(t *testing.T) { + t.Setenv("METIS_NODE_SSH_PASSWORD", "ssh-pass") + t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "$ssh$hash") + t.Setenv("METIS_NODE_ATLAS_PASSWORD", "atlas-pass") + t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "$atlas$hash") + t.Setenv("METIS_NODE_ROOT_PASSWORD", "root-pass") + t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "$root$hash") + envSecrets := nodeSecretsFromEnv() + if envSecrets == nil || envSecrets.RootPassword != "root-pass" || envSecrets.AtlasPasswordHash != "$atlas$hash" { + t.Fatalf("nodeSecretsFromEnv = %#v", envSecrets) + } + merged := mergeNodeSecrets(&secrets.NodeSecrets{ + SSHPassword: "base-ssh", + K3sToken: "base-token", + CloudInit: "base-cloud", + Extra: map[string]string{"base": "1"}, + }, &secrets.NodeSecrets{ + AtlasPassword: "override-atlas", + RootPassword: "override-root", + K3sToken: "override-token", + CloudInit: "override-cloud", + Extra: map[string]string{"override": "2"}, + }) + if merged.K3sToken != "override-token" || merged.CloudInit != "override-cloud" || merged.AtlasPassword != "override-atlas" || merged.RootPassword != "override-root" { + t.Fatalf("mergeNodeSecrets = %#v", merged) + } + if merged.Extra["base"] != "1" || merged.Extra["override"] != "2" { + t.Fatalf("mergeNodeSecrets extras = %#v", merged.Extra) + } + if got := mergeNodeSecrets(nil, envSecrets); got.RootPasswordHash != "$root$hash" { + t.Fatalf("mergeNodeSecrets nil base = %#v", got) + } + if got := mergeNodeSecrets(envSecrets, nil); got.SSHPassword != "ssh-pass" { + t.Fatalf("mergeNodeSecrets nil override = %#v", got) + } + t.Setenv("METIS_NODE_SSH_PASSWORD", "") + t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "") + t.Setenv("METIS_NODE_ATLAS_PASSWORD", "") + t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "") + t.Setenv("METIS_NODE_ROOT_PASSWORD", "") + t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "") + if got := nodeSecretsFromEnv(); got != nil { + t.Fatalf("expected empty env secrets to collapse to nil, got %#v", got) + } +} + +func TestFirstbootEnvContentIncludesHashes(t *testing.T) { + cfg := &config.NodeConfig{ + Hostname: "titan-15", + SSHUser: "atlas", + K3s: config.K3sConfig{Version: "v1.31.5+k3s1"}, + } + content := firstbootEnvContent(cfg, &secrets.NodeSecrets{ + AtlasPasswordHash: "$atlas$hash", + RootPasswordHash: "$root$hash", + }) + if !reflect.DeepEqual(parseEnvLines(content), map[string]string{ + "METIS_HOSTNAME": "'titan-15'", + "METIS_SSH_USER": "'atlas'", + "METIS_ATLAS_USER": "'atlas'", + "METIS_K3S_VERSION": "'v1.31.5+k3s1'", + "METIS_ATLAS_PASSWORD_HASH": "'$atlas$hash'", + "METIS_ROOT_PASSWORD_HASH": "'$root$hash'", + }) { + t.Fatalf("firstbootEnvContent = %q", content) + } +} + +func parseEnvLines(raw string) map[string]string { + result := map[string]string{} + for _, line := range strings.Split(strings.TrimSpace(raw), "\n") { + parts := strings.SplitN(line, "=", 2) + if len(parts) != 2 { + continue + } + result[parts[0]] = parts[1] + } + return result +} diff --git a/pkg/secrets/vault.go b/pkg/secrets/vault.go index 2028f82..5653681 100644 --- a/pkg/secrets/vault.go +++ b/pkg/secrets/vault.go @@ -15,10 +15,15 @@ import ( // NodeSecrets holds per-node secret material to inject at burn time. // These should live in Vault at secret/data/nodes/. type NodeSecrets struct { - SSHPassword string `json:"ssh_password,omitempty"` - K3sToken string `json:"k3s_token,omitempty"` - CloudInit string `json:"cloud_init,omitempty"` - Extra map[string]string `json:"extra,omitempty"` + SSHPassword string `json:"ssh_password,omitempty"` + SSHPasswordHash string `json:"ssh_password_hash,omitempty"` + AtlasPassword string `json:"atlas_password,omitempty"` + AtlasPasswordHash string `json:"atlas_password_hash,omitempty"` + RootPassword string `json:"root_password,omitempty"` + RootPasswordHash string `json:"root_password_hash,omitempty"` + K3sToken string `json:"k3s_token,omitempty"` + CloudInit string `json:"cloud_init,omitempty"` + Extra map[string]string `json:"extra,omitempty"` } // Client fetches node secrets from Vault using either a token or AppRole. diff --git a/pkg/secrets/vault_test.go b/pkg/secrets/vault_test.go index 9b8ed4d..deb2cf1 100644 --- a/pkg/secrets/vault_test.go +++ b/pkg/secrets/vault_test.go @@ -16,9 +16,11 @@ func TestFetchNodeReturnsData(t *testing.T) { _ = json.NewEncoder(w).Encode(map[string]any{ "data": map[string]any{ "data": map[string]any{ - "ssh_password": "p1", - "k3s_token": "t1", - "cloud_init": "ci", + "ssh_password": "p1", + "atlas_password_hash": "$atlas$hash", + "root_password": "root-pw", + "k3s_token": "t1", + "cloud_init": "ci", }, }, }) @@ -33,7 +35,7 @@ func TestFetchNodeReturnsData(t *testing.T) { if err != nil { t.Fatalf("fetch: %v", err) } - if sec.SSHPassword != "p1" || sec.K3sToken != "t1" || sec.CloudInit != "ci" { + if sec.SSHPassword != "p1" || sec.AtlasPasswordHash != "$atlas$hash" || sec.RootPassword != "root-pw" || sec.K3sToken != "t1" || sec.CloudInit != "ci" { t.Fatalf("unexpected secrets: %+v", sec) } } diff --git a/pkg/service/app.go b/pkg/service/app.go index c5c6652..f45ea44 100644 --- a/pkg/service/app.go +++ b/pkg/service/app.go @@ -117,16 +117,24 @@ type App struct { inventory *inventory.Inventory metrics *Metrics - mu sync.RWMutex - jobs map[string]*Job - snapshots map[string]SnapshotRecord - targets map[string]facts.Targets - artifactStore map[string]ArtifactSummary - deviceStore map[string]deviceSnapshot + mu sync.RWMutex + jobs map[string]*Job + snapshots map[string]SnapshotRecord + targets map[string]facts.Targets + artifactStore map[string]ArtifactSummary + deviceStore map[string]deviceSnapshot + desiredMetadata map[string]DesiredNodeMetadata } // NewApp creates a Metis service app instance. func NewApp(settings Settings) (*App, error) { + if strings.TrimSpace(settings.DesiredMetadataPath) == "" { + baseDir := filepath.Dir(settings.SnapshotsPath) + if strings.TrimSpace(baseDir) == "" || baseDir == "." { + baseDir = filepath.Dir(settings.HistoryPath) + } + settings.DesiredMetadataPath = filepath.Join(baseDir, "desired-node-metadata.json") + } if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil { return nil, err } @@ -141,18 +149,20 @@ func NewApp(settings Settings) (*App, error) { return nil, err } app := &App{ - settings: settings, - inventory: inv, - metrics: NewMetrics(), - jobs: map[string]*Job{}, - snapshots: map[string]SnapshotRecord{}, - targets: map[string]facts.Targets{}, - artifactStore: map[string]ArtifactSummary{}, - deviceStore: map[string]deviceSnapshot{}, + settings: settings, + inventory: inv, + metrics: NewMetrics(), + jobs: map[string]*Job{}, + snapshots: map[string]SnapshotRecord{}, + targets: map[string]facts.Targets{}, + artifactStore: map[string]ArtifactSummary{}, + deviceStore: map[string]deviceSnapshot{}, + desiredMetadata: map[string]DesiredNodeMetadata{}, } _ = app.loadSnapshots() _ = app.loadTargets() _ = app.loadArtifacts() + _ = app.loadDesiredNodeMetadata() return app, nil } @@ -289,6 +299,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error { if err := a.syncScratchAnnotations(record); err != nil { a.appendEvent(annotationSyncEvent(record.Node, err)) } + if err := a.syncDesiredNodeMetadata(record); err != nil { + a.appendEvent(desiredNodeMetadataSyncEvent(record.Node, err)) + } a.appendEvent(Event{ Time: record.CollectedAt, Kind: "sentinel.snapshot", diff --git a/pkg/service/cluster.go b/pkg/service/cluster.go index 38fca25..b3d5e47 100644 --- a/pkg/service/cluster.go +++ b/pkg/service/cluster.go @@ -22,6 +22,9 @@ type clusterNode struct { Worker bool ControlPlane bool Unschedulable bool + Labels map[string]string + Annotations map[string]string + Taints []string USBScratchStatus string USBScratchManagedPaths string } @@ -179,6 +182,11 @@ func clusterNodes() []clusterNode { } `json:"metadata"` Spec struct { Unschedulable bool `json:"unschedulable"` + Taints []struct { + Key string `json:"key"` + Value string `json:"value"` + Effect string `json:"effect"` + } `json:"taints"` } `json:"spec"` } `json:"items"` } @@ -189,6 +197,28 @@ func clusterNodes() []clusterNode { for _, item := range payload.Items { labels := item.Metadata.Labels annotations := item.Metadata.Annotations + if labels == nil { + labels = map[string]string{} + } + if annotations == nil { + annotations = map[string]string{} + } + taints := make([]string, 0, len(item.Spec.Taints)) + for _, taint := range item.Spec.Taints { + key := strings.TrimSpace(taint.Key) + if key == "" { + continue + } + raw := key + if value := strings.TrimSpace(taint.Value); value != "" { + raw += "=" + value + } + if effect := strings.TrimSpace(taint.Effect); effect != "" { + raw += ":" + effect + } + taints = append(taints, raw) + } + sort.Strings(taints) nodes = append(nodes, clusterNode{ Name: strings.TrimSpace(item.Metadata.Name), Arch: strings.TrimSpace(labels["kubernetes.io/arch"]), @@ -196,6 +226,9 @@ func clusterNodes() []clusterNode { Worker: labels["node-role.kubernetes.io/worker"] == "true", ControlPlane: labels["node-role.kubernetes.io/control-plane"] != "" || labels["node-role.kubernetes.io/master"] != "", Unschedulable: item.Spec.Unschedulable, + Labels: labels, + Annotations: annotations, + Taints: taints, USBScratchStatus: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-status"]), USBScratchManagedPaths: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-managed-paths"]), }) diff --git a/pkg/service/coverage_more_test.go b/pkg/service/coverage_more_test.go index 1b757b2..1d5fb1a 100644 --- a/pkg/service/coverage_more_test.go +++ b/pkg/service/coverage_more_test.go @@ -37,6 +37,11 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) { t.Fatal("expected persistTargets to fail when parent is a file") } + app.settings.DesiredMetadataPath = filepath.Join(fileParent, "desired-node-metadata.json") + if err := app.persistDesiredNodeMetadata(); err == nil { + t.Fatal("expected persistDesiredNodeMetadata to fail when parent is a file") + } + invalidArtifactState := filepath.Join(t.TempDir(), "artifacts.json") if err := os.WriteFile(invalidArtifactState, []byte("{bad-json"), 0o644); err != nil { t.Fatal(err) @@ -45,6 +50,15 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) { if err := app.loadArtifacts(); err == nil { t.Fatal("expected loadArtifacts to reject invalid json") } + + invalidDesiredState := filepath.Join(t.TempDir(), "desired-node-metadata.json") + if err := os.WriteFile(invalidDesiredState, []byte("{bad-json"), 0o644); err != nil { + t.Fatal(err) + } + app.settings.DesiredMetadataPath = invalidDesiredState + if err := app.loadDesiredNodeMetadata(); err == nil { + t.Fatal("expected loadDesiredNodeMetadata to reject invalid json") + } } func TestServiceReplacementAndDeviceBranches(t *testing.T) { diff --git a/pkg/service/helpers_test.go b/pkg/service/helpers_test.go index 9714e8d..b798ac2 100644 --- a/pkg/service/helpers_test.go +++ b/pkg/service/helpers_test.go @@ -162,6 +162,7 @@ nodes: snapshotsPath := filepath.Join(dir, "snapshots.json") targetsPath := filepath.Join(dir, "targets.json") artifactStatePath := filepath.Join(dir, "artifacts.json") + desiredMetadataPath := filepath.Join(dir, "desired-node-metadata.json") seedSnapshots := map[string]SnapshotRecord{ "titan-15": { @@ -190,19 +191,33 @@ nodes: if err := os.WriteFile(artifactStatePath, data, 0o644); err != nil { t.Fatal(err) } + seedDesiredMetadata := map[string]DesiredNodeMetadata{ + "titan-15": { + Node: "titan-15", + Hostname: "titan-15", + CapturedAt: testTime(t), + Labels: map[string]string{"hardware": "rpi5"}, + Taints: []string{"dedicated=recovery:NoSchedule"}, + }, + } + data, _ = json.MarshalIndent(seedDesiredMetadata, "", " ") + if err := os.WriteFile(desiredMetadataPath, data, 0o644); err != nil { + t.Fatal(err) + } app, err := NewApp(Settings{ - InventoryPath: invPath, - CacheDir: filepath.Join(dir, "cache"), - ArtifactDir: filepath.Join(dir, "artifacts"), - ArtifactStatePath: artifactStatePath, - HistoryPath: filepath.Join(dir, "history.jsonl"), - SnapshotsPath: snapshotsPath, - TargetsPath: targetsPath, - DefaultFlashHost: "titan-22", - FlashHosts: []string{"titan-22"}, - LocalHost: "titan-22", - AllowedGroups: []string{"admin"}, + InventoryPath: invPath, + CacheDir: filepath.Join(dir, "cache"), + ArtifactDir: filepath.Join(dir, "artifacts"), + ArtifactStatePath: artifactStatePath, + HistoryPath: filepath.Join(dir, "history.jsonl"), + SnapshotsPath: snapshotsPath, + TargetsPath: targetsPath, + DesiredMetadataPath: desiredMetadataPath, + DefaultFlashHost: "titan-22", + FlashHosts: []string{"titan-22"}, + LocalHost: "titan-22", + AllowedGroups: []string{"admin"}, }) if err != nil { t.Fatalf("NewApp: %v", err) @@ -211,6 +226,9 @@ nodes: if got := app.artifacts()["titan-15"].Ref; got != "reg/proj/titan-15:latest" { t.Fatalf("artifacts() = %q", got) } + if desired, ok := app.desiredMetadataForNode("titan-15"); !ok || desired.Labels["hardware"] != "rpi5" { + t.Fatalf("desiredMetadataForNode() = %#v ok=%v", desired, ok) + } if err := app.recordArtifact(ArtifactSummary{Node: "titan-15", Ref: "reg/proj/titan-15:v2"}); err != nil { t.Fatalf("recordArtifact: %v", err) } diff --git a/pkg/service/node_recovery.go b/pkg/service/node_recovery.go new file mode 100644 index 0000000..8d9fd87 --- /dev/null +++ b/pkg/service/node_recovery.go @@ -0,0 +1,483 @@ +package service + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "time" + + "metis/pkg/config" +) + +// DesiredNodeMetadata captures the node identity Metis should preserve through +// recovery builds and re-assert after the node rejoins the cluster. +type DesiredNodeMetadata struct { + Node string `json:"node"` + Hostname string `json:"hostname,omitempty"` + CapturedAt time.Time `json:"captured_at,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + Annotations map[string]string `json:"annotations,omitempty"` + Taints []string `json:"taints,omitempty"` + Unschedulable bool `json:"unschedulable,omitempty"` +} + +func (a *App) loadDesiredNodeMetadata() error { + data, err := os.ReadFile(a.settings.DesiredMetadataPath) + if err != nil { + return err + } + var desired map[string]DesiredNodeMetadata + if err := json.Unmarshal(data, &desired); err != nil { + return err + } + a.mu.Lock() + a.desiredMetadata = desired + a.mu.Unlock() + return nil +} + +func (a *App) persistDesiredNodeMetadata() error { + a.mu.RLock() + data, err := json.MarshalIndent(a.desiredMetadata, "", " ") + a.mu.RUnlock() + if err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(a.settings.DesiredMetadataPath), 0o755); err != nil { + return err + } + return os.WriteFile(a.settings.DesiredMetadataPath, data, 0o644) +} + +func (a *App) desiredMetadataForNode(node string) (DesiredNodeMetadata, bool) { + node = strings.TrimSpace(node) + if node == "" { + return DesiredNodeMetadata{}, false + } + a.mu.RLock() + defer a.mu.RUnlock() + desired, ok := a.desiredMetadata[node] + if !ok { + return DesiredNodeMetadata{}, false + } + return cloneDesiredNodeMetadata(desired), true +} + +func (a *App) stageDesiredNodeMetadata(nodeName string) (DesiredNodeMetadata, error) { + nodeName = strings.TrimSpace(nodeName) + if nodeName == "" { + return DesiredNodeMetadata{}, fmt.Errorf("node metadata requires a node name") + } + nodeSpec, _, err := a.inventory.FindNode(nodeName) + if err != nil { + return DesiredNodeMetadata{}, err + } + cfg, err := config.Build(a.inventory, nodeName) + if err != nil { + return DesiredNodeMetadata{}, err + } + desired := DesiredNodeMetadata{ + Node: nodeName, + Hostname: strings.TrimSpace(nodeSpec.Hostname), + CapturedAt: time.Now().UTC(), + Labels: filteredRestorableLabels(cfg.Labels), + Taints: restorableTaints(cfg.Taints), + } + if existing, ok := a.desiredMetadataForNode(nodeName); ok { + desired = mergeDesiredNodeMetadata(desired, existing) + } + if live, ok := liveClusterNode(nodeName); ok { + desired = mergeDesiredNodeMetadata(desired, desiredMetadataFromCluster(*live)) + } + desired.Labels = normalizeStringMap(desired.Labels) + desired.Annotations = normalizeStringMap(desired.Annotations) + desired.Taints = normalizeTaints(desired.Taints) + a.mu.Lock() + if a.desiredMetadata == nil { + a.desiredMetadata = map[string]DesiredNodeMetadata{} + } + a.desiredMetadata[nodeName] = desired + a.mu.Unlock() + if err := a.persistDesiredNodeMetadata(); err != nil { + return DesiredNodeMetadata{}, err + } + return cloneDesiredNodeMetadata(desired), nil +} + +func (a *App) syncDesiredNodeMetadata(record SnapshotRecord) error { + desired, ok := a.desiredMetadataForNode(record.Node) + if !ok { + return nil + } + live, ok := liveClusterNode(record.Node) + if !ok { + return nil + } + return patchDesiredNodeMetadata(*live, desired) +} + +func desiredMetadataFromCluster(node clusterNode) DesiredNodeMetadata { + return DesiredNodeMetadata{ + Node: strings.TrimSpace(node.Name), + Labels: filteredRestorableLabels(node.Labels), + Annotations: filteredRestorableAnnotations(node.Annotations), + Taints: restorableTaints(node.Taints), + Unschedulable: node.Unschedulable, + } +} + +func mergeDesiredNodeMetadata(base, overlay DesiredNodeMetadata) DesiredNodeMetadata { + merged := cloneDesiredNodeMetadata(base) + if hostname := strings.TrimSpace(overlay.Hostname); hostname != "" { + merged.Hostname = hostname + } + if !overlay.CapturedAt.IsZero() { + merged.CapturedAt = overlay.CapturedAt + } + if merged.Labels == nil { + merged.Labels = map[string]string{} + } + for key, value := range overlay.Labels { + if key = strings.TrimSpace(key); key == "" { + continue + } + merged.Labels[key] = strings.TrimSpace(value) + } + if merged.Annotations == nil { + merged.Annotations = map[string]string{} + } + for key, value := range overlay.Annotations { + if key = strings.TrimSpace(key); key == "" { + continue + } + merged.Annotations[key] = strings.TrimSpace(value) + } + if len(overlay.Taints) > 0 { + merged.Taints = normalizeTaints(overlay.Taints) + } + merged.Unschedulable = overlay.Unschedulable + return merged +} + +func patchDesiredNodeMetadata(live clusterNode, desired DesiredNodeMetadata) error { + node := strings.TrimSpace(desired.Node) + if node == "" { + node = strings.TrimSpace(live.Name) + } + if node == "" { + return nil + } + labelPatch := metadataStringPatch(live.Labels, desired.Labels, isRestorableLabel) + annotationPatch := metadataStringPatch(live.Annotations, desired.Annotations, isRestorableAnnotation) + mergedTaints := mergeLiveAndDesiredTaints(live.Taints, desired.Taints) + body := map[string]any{} + metadata := map[string]any{} + if len(labelPatch) > 0 { + metadata["labels"] = labelPatch + } + if len(annotationPatch) > 0 { + metadata["annotations"] = annotationPatch + } + if len(metadata) > 0 { + body["metadata"] = metadata + } + spec := map[string]any{} + if live.Unschedulable != desired.Unschedulable { + spec["unschedulable"] = desired.Unschedulable + } + if !sameTaints(live.Taints, mergedTaints) { + spec["taints"] = taintPatchPayload(mergedTaints) + } + if len(spec) > 0 { + body["spec"] = spec + } + if len(body) == 0 { + return nil + } + kube, err := kubeClientFactory() + if err != nil { + return err + } + return kube.mergePatch("/api/v1/nodes/"+node, body) +} + +func metadataStringPatch(live, desired map[string]string, allow func(string) bool) map[string]any { + patch := map[string]any{} + for key, value := range desired { + key = strings.TrimSpace(key) + if key == "" || !allow(key) { + continue + } + value = strings.TrimSpace(value) + if strings.TrimSpace(live[key]) != value { + patch[key] = value + } + } + for key := range live { + key = strings.TrimSpace(key) + if key == "" || !allow(key) { + continue + } + if _, ok := desired[key]; !ok { + patch[key] = nil + } + } + return patch +} + +func liveClusterNode(node string) (*clusterNode, bool) { + node = strings.TrimSpace(node) + if node == "" { + return nil, false + } + for _, live := range clusterNodes() { + if strings.TrimSpace(live.Name) == node { + copyNode := live + return ©Node, true + } + } + return nil, false +} + +func filteredRestorableLabels(values map[string]string) map[string]string { + filtered := map[string]string{} + for key, value := range values { + key = strings.TrimSpace(key) + if key == "" || !isRestorableLabel(key) { + continue + } + filtered[key] = strings.TrimSpace(value) + } + return filtered +} + +func filteredRestorableAnnotations(values map[string]string) map[string]string { + filtered := map[string]string{} + for key, value := range values { + key = strings.TrimSpace(key) + if key == "" || !isRestorableAnnotation(key) { + continue + } + filtered[key] = strings.TrimSpace(value) + } + return filtered +} + +func normalizeStringMap(values map[string]string) map[string]string { + if len(values) == 0 { + return nil + } + normalized := map[string]string{} + for key, value := range values { + key = strings.TrimSpace(key) + if key == "" { + continue + } + normalized[key] = strings.TrimSpace(value) + } + if len(normalized) == 0 { + return nil + } + return normalized +} + +func restorableTaints(values []string) []string { + filtered := make([]string, 0, len(values)) + for _, value := range values { + value = normalizeTaint(value) + if value == "" || !isRestorableTaint(value) { + continue + } + filtered = append(filtered, value) + } + return normalizeTaints(filtered) +} + +func normalizeTaints(values []string) []string { + if len(values) == 0 { + return nil + } + seen := map[string]struct{}{} + out := make([]string, 0, len(values)) + for _, value := range values { + value = normalizeTaint(value) + if value == "" { + continue + } + if _, ok := seen[value]; ok { + continue + } + seen[value] = struct{}{} + out = append(out, value) + } + sort.Strings(out) + if len(out) == 0 { + return nil + } + return out +} + +func normalizeTaint(value string) string { + return strings.TrimSpace(value) +} + +func sameTaints(left, right []string) bool { + left = normalizeTaints(left) + right = normalizeTaints(right) + if len(left) != len(right) { + return false + } + for idx := range left { + if left[idx] != right[idx] { + return false + } + } + return true +} + +func mergeLiveAndDesiredTaints(live, desired []string) []string { + merged := make([]string, 0, len(live)+len(desired)) + for _, taint := range live { + taint = normalizeTaint(taint) + if taint == "" || isRestorableTaint(taint) { + continue + } + merged = append(merged, taint) + } + merged = append(merged, restorableTaints(desired)...) + return normalizeTaints(merged) +} + +func taintPatchPayload(values []string) []map[string]string { + payload := make([]map[string]string, 0, len(values)) + for _, value := range normalizeTaints(values) { + key, taintValue, effect := splitTaint(value) + if key == "" { + continue + } + entry := map[string]string{"key": key} + if taintValue != "" { + entry["value"] = taintValue + } + if effect != "" { + entry["effect"] = effect + } + payload = append(payload, entry) + } + return payload +} + +func splitTaint(raw string) (string, string, string) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", "", "" + } + effect := "" + body := raw + if idx := strings.LastIndex(raw, ":"); idx >= 0 { + body = strings.TrimSpace(raw[:idx]) + effect = strings.TrimSpace(raw[idx+1:]) + } + key := body + value := "" + if idx := strings.Index(body, "="); idx >= 0 { + key = strings.TrimSpace(body[:idx]) + value = strings.TrimSpace(body[idx+1:]) + } + return strings.TrimSpace(key), value, effect +} + +func isRestorableTaint(raw string) bool { + key, _, _ := splitTaint(raw) + if key == "" { + return false + } + for _, prefix := range []string{ + "node.kubernetes.io/", + "node.cloudprovider.kubernetes.io/", + "ToBeDeletedByClusterAutoscaler", + } { + if strings.HasPrefix(key, prefix) { + return false + } + } + return true +} + +func isRestorableLabel(key string) bool { + key = strings.TrimSpace(key) + if key == "" { + return false + } + if strings.HasPrefix(key, "node-role.kubernetes.io/") { + return true + } + for _, prefix := range []string{ + "kubernetes.io/", + "beta.kubernetes.io/", + "node.kubernetes.io/", + "topology.kubernetes.io/", + "feature.node.kubernetes.io/", + "failure-domain.beta.kubernetes.io/", + "nvidia.com/", + "k3s.io/", + "rke2.io/", + "volumes.kubernetes.io/", + "node.cloudprovider.kubernetes.io/", + } { + if strings.HasPrefix(key, prefix) { + return false + } + } + return true +} + +func isRestorableAnnotation(key string) bool { + key = strings.TrimSpace(key) + if key == "" { + return false + } + for _, prefix := range []string{ + "kubectl.kubernetes.io/", + "kubeadm.alpha.kubernetes.io/", + "kubernetes.io/", + "node.alpha.kubernetes.io/", + "node.kubernetes.io/", + "volumes.kubernetes.io/", + "csi.volume.kubernetes.io/", + "csi.storage.k8s.io/", + "flannel.alpha.coreos.com/", + "projectcalico.org/", + "rke2.io/", + "k3s.io/", + "nvidia.com/", + } { + if strings.HasPrefix(key, prefix) { + return false + } + } + return true +} + +func cloneDesiredNodeMetadata(value DesiredNodeMetadata) DesiredNodeMetadata { + clone := value + clone.Labels = normalizeStringMap(value.Labels) + clone.Annotations = normalizeStringMap(value.Annotations) + clone.Taints = normalizeTaints(value.Taints) + return clone +} + +func desiredNodeMetadataSyncEvent(node string, err error) Event { + return Event{ + Time: time.Now().UTC(), + Kind: "sentinel.node-metadata", + Summary: fmt.Sprintf("Could not restore desired node metadata for %s", node), + Details: map[string]any{ + "node": node, + "error": err.Error(), + }, + } +} diff --git a/pkg/service/node_recovery_test.go b/pkg/service/node_recovery_test.go new file mode 100644 index 0000000..e0b92fd --- /dev/null +++ b/pkg/service/node_recovery_test.go @@ -0,0 +1,254 @@ +package service + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "reflect" + "strings" + "testing" + "time" + + "metis/pkg/sentinel" +) + +func TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster(t *testing.T) { + kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes": + _ = json.NewEncoder(w).Encode(map[string]any{ + "items": []any{ + map[string]any{ + "metadata": map[string]any{ + "name": "titan-15", + "labels": map[string]string{ + "hardware": "rpi5", + "rack": "a1", + "maintenance.bstein.dev/color": "blue", + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true", + }, + "annotations": map[string]string{ + "maintenance.bstein.dev/owner": "atlas", + "volumes.kubernetes.io/controller-managed-attach-detach": "true", + }, + }, + "spec": map[string]any{ + "unschedulable": true, + "taints": []any{ + map[string]any{"key": "dedicated", "value": "recovery", "effect": "NoSchedule"}, + map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"}, + }, + }, + }, + }, + }) + default: + http.NotFound(w, r) + } + })) + defer kube.Close() + installKubeFactory(t, kube) + + app := newTestApp(t) + app.inventory.Nodes[0].Labels = map[string]string{"hardware": "rpi4", "rack": "a1"} + app.inventory.Nodes[0].Taints = []string{"flash=true:NoSchedule"} + app.desiredMetadata["titan-15"] = DesiredNodeMetadata{ + Node: "titan-15", + Annotations: map[string]string{"maintenance.bstein.dev/legacy": "keep"}, + } + + desired, err := app.stageDesiredNodeMetadata("titan-15") + if err != nil { + t.Fatalf("stageDesiredNodeMetadata: %v", err) + } + if desired.Hostname != "titan-15" || !desired.Unschedulable { + t.Fatalf("unexpected desired metadata header: %#v", desired) + } + if desired.Labels["hardware"] != "rpi5" || desired.Labels["rack"] != "a1" || desired.Labels["maintenance.bstein.dev/color"] != "blue" { + t.Fatalf("unexpected desired labels: %#v", desired.Labels) + } + if _, ok := desired.Labels["kubernetes.io/arch"]; ok { + t.Fatalf("system labels should not be persisted: %#v", desired.Labels) + } + if desired.Annotations["maintenance.bstein.dev/owner"] != "atlas" || desired.Annotations["maintenance.bstein.dev/legacy"] != "keep" { + t.Fatalf("unexpected desired annotations: %#v", desired.Annotations) + } + if _, ok := desired.Annotations["volumes.kubernetes.io/controller-managed-attach-detach"]; ok { + t.Fatalf("controller annotations should not be persisted: %#v", desired.Annotations) + } + if !reflect.DeepEqual(desired.Taints, []string{"dedicated=recovery:NoSchedule"}) { + t.Fatalf("unexpected desired taints: %#v", desired.Taints) + } + data, err := os.ReadFile(app.settings.DesiredMetadataPath) + if err != nil { + t.Fatalf("read desired metadata file: %v", err) + } + if !strings.Contains(string(data), "titan-15") { + t.Fatalf("desired metadata file missing titan-15: %s", string(data)) + } +} + +func TestStoreSnapshotRestoresDesiredNodeMetadata(t *testing.T) { + var patchBody map[string]any + kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes": + _ = json.NewEncoder(w).Encode(map[string]any{ + "items": []any{ + map[string]any{ + "metadata": map[string]any{ + "name": "titan-15", + "labels": map[string]string{ + "hardware": "rpi4", + "maintenance.bstein.dev/old": "1", + }, + "annotations": map[string]string{ + "maintenance.bstein.dev/mode": "old", + }, + }, + "spec": map[string]any{ + "unschedulable": true, + "taints": []any{ + map[string]any{"key": "dedicated", "value": "old", "effect": "NoSchedule"}, + map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"}, + }, + }, + }, + }, + }) + case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/nodes/titan-15": + if err := json.NewDecoder(r.Body).Decode(&patchBody); err != nil { + t.Fatalf("decode patch: %v", err) + } + _ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"}) + default: + http.NotFound(w, r) + } + })) + defer kube.Close() + installKubeFactory(t, kube) + + app := newTestApp(t) + app.desiredMetadata["titan-15"] = DesiredNodeMetadata{ + Node: "titan-15", + Hostname: "titan-15", + Labels: map[string]string{"hardware": "rpi5"}, + Annotations: map[string]string{"maintenance.bstein.dev/mode": "recovery"}, + Taints: []string{"dedicated=recovery:NoSchedule"}, + Unschedulable: false, + } + + if err := app.StoreSnapshot(SnapshotRecord{ + Node: "titan-15", + CollectedAt: time.Date(2026, 4, 24, 6, 0, 0, 0, time.UTC), + Snapshot: sentinel.Snapshot{Hostname: "titan-15"}, + }); err != nil { + t.Fatalf("StoreSnapshot: %v", err) + } + if patchBody == nil { + t.Fatal("expected desired metadata patch") + } + metadata := patchBody["metadata"].(map[string]any) + labels := metadata["labels"].(map[string]any) + if labels["hardware"] != "rpi5" || labels["maintenance.bstein.dev/old"] != nil { + t.Fatalf("unexpected label patch: %#v", labels) + } + annotations := metadata["annotations"].(map[string]any) + if annotations["maintenance.bstein.dev/mode"] != "recovery" { + t.Fatalf("unexpected annotation patch: %#v", annotations) + } + spec := patchBody["spec"].(map[string]any) + if spec["unschedulable"] != false { + t.Fatalf("unexpected spec patch: %#v", spec) + } + taints := spec["taints"].([]any) + if len(taints) != 2 { + t.Fatalf("unexpected taint payload: %#v", taints) + } + entries := map[string]map[string]any{} + for _, raw := range taints { + entry := raw.(map[string]any) + key := entry["key"].(string) + entries[key] = entry + } + if entries["dedicated"]["value"] != "recovery" || entries["dedicated"]["effect"] != "NoSchedule" { + t.Fatalf("missing desired taint replacement: %#v", entries) + } + if entries["node.kubernetes.io/unreachable"]["effect"] != "NoExecute" { + t.Fatalf("system taint should be preserved: %#v", entries) + } +} + +func TestDesiredNodeMetadataHelpers(t *testing.T) { + app := newTestApp(t) + if _, ok := app.desiredMetadataForNode("missing"); ok { + t.Fatal("expected no desired metadata for missing node") + } + if err := app.syncDesiredNodeMetadata(SnapshotRecord{Node: "missing"}); err != nil { + t.Fatalf("syncDesiredNodeMetadata missing should noop: %v", err) + } + if _, ok := liveClusterNode(""); ok { + t.Fatal("empty liveClusterNode lookup should fail") + } + if !isRestorableLabel("maintenance.bstein.dev/role") || isRestorableLabel("kubernetes.io/arch") { + t.Fatal("unexpected label restoration filter") + } + if !isRestorableAnnotation("maintenance.bstein.dev/state") || isRestorableAnnotation("volumes.kubernetes.io/foo") { + t.Fatal("unexpected annotation restoration filter") + } + if !isRestorableTaint("dedicated=recovery:NoSchedule") || isRestorableTaint("node.kubernetes.io/not-ready:NoExecute") { + t.Fatal("unexpected taint restoration filter") + } + key, value, effect := splitTaint("dedicated=recovery:NoSchedule") + if key != "dedicated" || value != "recovery" || effect != "NoSchedule" { + t.Fatalf("splitTaint mismatch: %q %q %q", key, value, effect) + } + if key, value, effect := splitTaint("just-a-key"); key != "just-a-key" || value != "" || effect != "" { + t.Fatalf("splitTaint key-only mismatch: %q %q %q", key, value, effect) + } + labels := filteredRestorableLabels(map[string]string{"hardware": "rpi5", "kubernetes.io/arch": "arm64"}) + if !reflect.DeepEqual(labels, map[string]string{"hardware": "rpi5"}) { + t.Fatalf("filteredRestorableLabels = %#v", labels) + } + annotations := filteredRestorableAnnotations(map[string]string{"maintenance.bstein.dev/state": "ok", "volumes.kubernetes.io/foo": "bar"}) + if !reflect.DeepEqual(annotations, map[string]string{"maintenance.bstein.dev/state": "ok"}) { + t.Fatalf("filteredRestorableAnnotations = %#v", annotations) + } + patch := metadataStringPatch( + map[string]string{"hardware": "rpi4", "maintenance.bstein.dev/old": "1"}, + map[string]string{"hardware": "rpi5"}, + isRestorableLabel, + ) + if patch["hardware"] != "rpi5" || patch["maintenance.bstein.dev/old"] != nil { + t.Fatalf("metadataStringPatch = %#v", patch) + } + mergedTaints := mergeLiveAndDesiredTaints( + []string{"node.kubernetes.io/unreachable:NoExecute", "dedicated=old:NoSchedule"}, + []string{"dedicated=new:NoSchedule", "dedicated=new:NoSchedule"}, + ) + if !reflect.DeepEqual(mergedTaints, []string{"dedicated=new:NoSchedule", "node.kubernetes.io/unreachable:NoExecute"}) { + t.Fatalf("mergeLiveAndDesiredTaints = %#v", mergedTaints) + } + payload := taintPatchPayload([]string{"dedicated=new:NoSchedule"}) + if len(payload) != 1 || payload[0]["key"] != "dedicated" || payload[0]["value"] != "new" || payload[0]["effect"] != "NoSchedule" { + t.Fatalf("taintPatchPayload = %#v", payload) + } + original := DesiredNodeMetadata{Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}} + cloned := cloneDesiredNodeMetadata(original) + cloned.Labels["hardware"] = "mutated" + cloned.Taints[0] = "changed" + if original.Labels["hardware"] != "rpi5" || original.Taints[0] != "dedicated=new:NoSchedule" { + t.Fatalf("cloneDesiredNodeMetadata should deep-copy slices/maps: %#v %#v", original, cloned) + } + if err := patchDesiredNodeMetadata( + clusterNode{Name: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}}, + DesiredNodeMetadata{Node: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}}, + ); err != nil { + t.Fatalf("patchDesiredNodeMetadata should noop when already in sync: %v", err) + } + if event := desiredNodeMetadataSyncEvent("titan-15", os.ErrPermission); event.Kind != "sentinel.node-metadata" || event.Details["node"] != "titan-15" { + t.Fatalf("desiredNodeMetadataSyncEvent = %#v", event) + } +} diff --git a/pkg/service/remote.go b/pkg/service/remote.go index a1b29a0..ae70308 100644 --- a/pkg/service/remote.go +++ b/pkg/service/remote.go @@ -74,12 +74,17 @@ func (a *App) RefreshDevices(host string) ([]Device, error) { } func (a *App) runBuild(job *Job, flash bool) { - _, class, err := a.inventory.FindNode(job.Node) + nodeSpec, class, err := a.inventory.FindNode(job.Node) if err != nil { a.failJob(job.ID, err) a.metrics.RecordBuild(job.Node, "error") return } + if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil { + a.failJob(job.ID, err) + a.metrics.RecordBuild(job.Node, "error") + return + } if err := a.ensureHarborProject(); err != nil { a.failJob(job.ID, err) a.metrics.RecordBuild(job.Node, "error") @@ -112,7 +117,8 @@ func (a *App) runBuild(job *Job, flash bool) { return } buildPod := fmt.Sprintf("metis-build-%d", time.Now().UTC().UnixNano()) - logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, artifactRef, buildTag)) + job.Builder = builder.Name + logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, strings.TrimSpace(nodeSpec.Hostname), artifactRef, buildTag)) if err != nil { a.failJob(job.ID, err) a.metrics.RecordBuild(job.Node, "error") @@ -183,6 +189,9 @@ func (a *App) runFlash(job *Job) { } func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) { + if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil { + return RemoteFlashResult{}, err + } a.setJob(job.ID, func(j *Job) { j.Status = JobRunning j.Stage = "preflight" diff --git a/pkg/service/remote_helpers.go b/pkg/service/remote_helpers.go index f520f52..a1f2ddd 100644 --- a/pkg/service/remote_helpers.go +++ b/pkg/service/remote_helpers.go @@ -1,6 +1,7 @@ package service import ( + "encoding/json" "fmt" "math" "path/filepath" @@ -246,8 +247,9 @@ func (a *App) remoteDevicePodSpec(name, host, image string) map[string]any { } } -func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag string) map[string]any { +func (a *App) remoteBuildPodSpec(name, host, image, node, nodeHostname, artifactRef, buildTag string) map[string]any { workspaceHostPath := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name) + desiredEnv := remoteDesiredMetadataEnv(a, node) return map[string]any{ "apiVersion": "v1", "kind": "Pod", @@ -255,7 +257,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag "name": name, "namespace": a.settings.Namespace, "labels": map[string]string{"app": "metis-remote", "metis-run": "build"}, - "annotations": vaultRuntimeAnnotations(true), + "annotations": vaultRuntimeAnnotations(true, nodeHostname), }, "spec": map[string]any{ "restartPolicy": "Never", @@ -283,6 +285,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag ), }, "securityContext": map[string]any{"runAsUser": 0, "runAsGroup": 0}, + "env": desiredEnv, "envFrom": []map[string]any{ {"configMapRef": map[string]any{"name": "metis"}}, }, @@ -309,7 +312,7 @@ func (a *App) remoteFlashPodSpec(name, host, image, node, device, artifactRef st "name": name, "namespace": a.settings.Namespace, "labels": map[string]string{"app": "metis-remote", "metis-run": "flash"}, - "annotations": vaultRuntimeAnnotations(false), + "annotations": vaultRuntimeAnnotations(false, ""), }, "spec": map[string]any{ "restartPolicy": "Never", @@ -378,7 +381,46 @@ func mountedHostTmpDir(path string) string { return "/host-tmp" } -func vaultRuntimeAnnotations(includeSSHKeys bool) map[string]string { +func remoteDesiredMetadataEnv(a *App, node string) []map[string]any { + desired, ok := a.desiredMetadataForNode(node) + if !ok { + return nil + } + labelsJSON, _ := jsonMarshalStringMap(desired.Labels) + taintsJSON, _ := jsonMarshalStringSlice(desired.Taints) + env := []map[string]any{} + if labelsJSON != "" { + env = append(env, map[string]any{"name": "METIS_NODE_LABELS_JSON", "value": labelsJSON}) + } + if taintsJSON != "" { + env = append(env, map[string]any{"name": "METIS_NODE_TAINTS_JSON", "value": taintsJSON}) + } + return env +} + +func jsonMarshalStringMap(values map[string]string) (string, error) { + if len(values) == 0 { + return "", nil + } + data, err := json.Marshal(values) + if err != nil { + return "", err + } + return string(data), nil +} + +func jsonMarshalStringSlice(values []string) (string, error) { + if len(values) == 0 { + return "", nil + } + data, err := json.Marshal(values) + if err != nil { + return "", err + } + return string(data), nil +} + +func vaultRuntimeAnnotations(includeSSHKeys bool, nodeHostname string) map[string]string { annotations := map[string]string{ "vault.hashicorp.com/agent-inject": "true", "vault.hashicorp.com/agent-pre-populate-only": "true", @@ -399,6 +441,19 @@ export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}" export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}" export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}" export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}" +{{ end }}` + } + nodeHostname = strings.TrimSpace(nodeHostname) + if nodeHostname != "" { + secretPath := fmt.Sprintf("secret/data/nodes/%s", nodeHostname) + annotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] = secretPath + annotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"] = `{{ with secret "` + secretPath + `" }} +export METIS_NODE_SSH_PASSWORD="{{ .Data.data.ssh_password }}" +export METIS_NODE_SSH_PASSWORD_HASH="{{ .Data.data.ssh_password_hash }}" +export METIS_NODE_ATLAS_PASSWORD="{{ .Data.data.atlas_password }}" +export METIS_NODE_ATLAS_PASSWORD_HASH="{{ .Data.data.atlas_password_hash }}" +export METIS_NODE_ROOT_PASSWORD="{{ .Data.data.root_password }}" +export METIS_NODE_ROOT_PASSWORD_HASH="{{ .Data.data.root_password_hash }}" {{ end }}` } return annotations @@ -413,6 +468,7 @@ func remoteWorkerEntrypoint(includeSSHKeys bool, args ...string) string { if includeSSHKeys { lines = append(lines, ". /vault/secrets/metis-ssh-env.sh") } + lines = append(lines, "if [ -f /vault/secrets/metis-node-secrets-env.sh ]; then . /vault/secrets/metis-node-secrets-env.sh; fi") lines = append(lines, "exec "+shellJoin(append([]string{"metis"}, args...)...)) return strings.Join(lines, "\n") } diff --git a/pkg/service/remote_helpers_test.go b/pkg/service/remote_helpers_test.go index f237d68..b45c5cc 100644 --- a/pkg/service/remote_helpers_test.go +++ b/pkg/service/remote_helpers_test.go @@ -251,8 +251,13 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) { app := newTestApp(t) app.settings.RemoteWorkspaceDir = "/var/tmp/metis-workspace" app.settings.HostTmpDir = "/var/tmp/metis-flash-test" + app.desiredMetadata["titan-10"] = DesiredNodeMetadata{ + Node: "titan-10", + Labels: map[string]string{"hardware": "rpi5"}, + Taints: []string{"dedicated=recovery:NoSchedule"}, + } - buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "registry.example/metis/titan-10", "build-1") + buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "titan-10", "registry.example/metis/titan-10", "build-1") buildBody := buildSpec["spec"].(map[string]any) buildVolumes := buildBody["volumes"].([]map[string]any) workspaceVolume := buildVolumes[0]["hostPath"].(map[string]any) @@ -260,6 +265,17 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) { t.Fatalf("build workspace hostPath = %v", got) } buildContainer := buildBody["containers"].([]map[string]any)[0] + buildEnv := buildContainer["env"].([]map[string]any) + if len(buildEnv) != 2 { + t.Fatalf("expected desired metadata env, got %#v", buildEnv) + } + metadataAnnotations := buildSpec["metadata"].(map[string]any)["annotations"].(map[string]string) + if metadataAnnotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] != "secret/data/nodes/titan-10" { + t.Fatalf("unexpected node secret annotation: %#v", metadataAnnotations) + } + if !strings.Contains(metadataAnnotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"], "METIS_NODE_ROOT_PASSWORD") { + t.Fatalf("expected node password exports in vault template: %#v", metadataAnnotations) + } buildSecurity := buildContainer["securityContext"].(map[string]any) if got := buildSecurity["runAsUser"]; got != 0 { t.Fatalf("build runAsUser = %v", got) diff --git a/pkg/service/remote_test.go b/pkg/service/remote_test.go index 5ae4c6b..5542f77 100644 --- a/pkg/service/remote_test.go +++ b/pkg/service/remote_test.go @@ -15,7 +15,7 @@ func TestMountedHostTmpDirMapsConfiguredTmpPathIntoMount(t *testing.T) { } func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) { - withKeys := vaultRuntimeAnnotations(true) + withKeys := vaultRuntimeAnnotations(true, "titan-15") template := withKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"] if !strings.Contains(template, "METIS_SSH_KEY_HECATE_TETHYS") { t.Fatalf("expected tethys hecate key export in vault template: %q", template) @@ -24,7 +24,7 @@ func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) { t.Fatalf("expected db hecate key export in vault template: %q", template) } - withoutKeys := vaultRuntimeAnnotations(false) + withoutKeys := vaultRuntimeAnnotations(false, "") if _, ok := withoutKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]; ok { t.Fatalf("did not expect ssh key template when includeSSHKeys=false") } diff --git a/pkg/service/settings.go b/pkg/service/settings.go index b2acaa2..f3f5242 100644 --- a/pkg/service/settings.go +++ b/pkg/service/settings.go @@ -11,30 +11,31 @@ var hostNameLookup = os.Hostname // Settings configures the Metis service runtime. type Settings struct { - BindAddr string - InventoryPath string - CacheDir string - ArtifactDir string - ArtifactStatePath string - HistoryPath string - SnapshotsPath string - TargetsPath string - DefaultFlashHost string - FlashHosts []string - LocalHost string - AllowedGroups []string - MaxDeviceBytes int64 - Namespace string - RunnerImageAMD64 string - RunnerImageARM64 string - HarborRegistry string - HarborProject string - HarborAPIBase string - HarborUsername string - HarborPassword string - HostTmpDir string - RemoteWorkspaceDir string - RemotePodTimeout int64 + BindAddr string + InventoryPath string + CacheDir string + ArtifactDir string + ArtifactStatePath string + HistoryPath string + SnapshotsPath string + TargetsPath string + DesiredMetadataPath string + DefaultFlashHost string + FlashHosts []string + LocalHost string + AllowedGroups []string + MaxDeviceBytes int64 + Namespace string + RunnerImageAMD64 string + RunnerImageARM64 string + HarborRegistry string + HarborProject string + HarborAPIBase string + HarborUsername string + HarborPassword string + HostTmpDir string + RemoteWorkspaceDir string + RemotePodTimeout int64 } // FromEnv builds service settings with sensible defaults for local dev and in-cluster use. @@ -44,30 +45,31 @@ func FromEnv() Settings { defaultFlashHost := getenvDefault("METIS_DEFAULT_FLASH_HOST", localHost) flashHosts := splitList(getenvDefault("METIS_FLASH_HOSTS", defaultFlashHost)) return Settings{ - BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"), - InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"), - CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")), - ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")), - ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")), - HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")), - SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")), - TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")), - DefaultFlashHost: defaultFlashHost, - FlashHosts: flashHosts, - LocalHost: localHost, - AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintenance")), - MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000), - Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"), - RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""), - RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""), - HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), - HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"), - HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"), - HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""), - HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""), - HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/var/tmp/metis-flash-test"), - RemoteWorkspaceDir: getenvDefault("METIS_REMOTE_WORKSPACE_DIR", "/var/tmp/metis-workspace"), - RemotePodTimeout: getenvInt64("METIS_REMOTE_POD_TIMEOUT_SEC", 1800), + BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"), + InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"), + CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")), + ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")), + ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")), + HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")), + SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")), + TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")), + DesiredMetadataPath: getenvDefault("METIS_DESIRED_METADATA_PATH", filepath.Join(dataDir, "desired-node-metadata.json")), + DefaultFlashHost: defaultFlashHost, + FlashHosts: flashHosts, + LocalHost: localHost, + AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintenance")), + MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000), + Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"), + RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""), + RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""), + HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), + HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"), + HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"), + HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""), + HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""), + HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/var/tmp/metis-flash-test"), + RemoteWorkspaceDir: getenvDefault("METIS_REMOTE_WORKSPACE_DIR", "/var/tmp/metis-workspace"), + RemotePodTimeout: getenvInt64("METIS_REMOTE_POD_TIMEOUT_SEC", 1800), } }