recovery(metis): restore node identity on rebuilt images

This commit is contained in:
codex 2026-04-24 16:57:34 -03:00
parent ebaa367efd
commit 17069e4677
19 changed files with 1612 additions and 144 deletions

View File

@ -154,6 +154,10 @@ if [ -s "${sudoers_file}" ]; then
fi fi
fi fi
if [ -x /usr/local/sbin/metis-apply-node-identity.sh ]; then
/usr/local/sbin/metis-apply-node-identity.sh || true
fi
rm -f /root/.not_logged_in_yet rm -f /root/.not_logged_in_yet
if ! command -v k3s >/dev/null 2>&1; then if ! command -v k3s >/dev/null 2>&1; then

View File

@ -2,8 +2,6 @@ package plan
import ( import (
"bytes" "bytes"
"context"
"encoding/json"
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
@ -64,6 +62,7 @@ func Files(inv *inventory.Inventory, nodeName string) ([]inject.FileSpec, error)
cfg.Secrets = sec.Extra cfg.Secrets = sec.Extra
} }
} }
applyNodeMetadataEnv(cfg)
files, err := buildFiles(cfg, sec) files, err := buildFiles(cfg, sec)
if err != nil { if err != nil {
return nil, err return nil, err
@ -111,7 +110,9 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
{Path: "etc/hostname", Content: []byte(cfg.Hostname + "\n"), Mode: 0o644, RootFS: true}, {Path: "etc/hostname", Content: []byte(cfg.Hostname + "\n"), Mode: 0o644, RootFS: true},
{Path: "etc/hosts", Content: []byte(hostsContent(cfg.Hostname)), Mode: 0o644, RootFS: true}, {Path: "etc/hosts", Content: []byte(hostsContent(cfg.Hostname)), Mode: 0o644, RootFS: true},
{Path: "etc/rancher/k3s/config.yaml", Content: []byte(k3sConfigContent(cfg)), Mode: 0o644, RootFS: true}, {Path: "etc/rancher/k3s/config.yaml", Content: []byte(k3sConfigContent(cfg)), Mode: 0o644, RootFS: true},
{Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg)), Mode: 0o600, RootFS: true}, {Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg, sec)), Mode: 0o600, RootFS: true},
{Path: "usr/local/sbin/metis-apply-node-identity.sh", Content: []byte(nodeIdentityScriptContent()), Mode: 0o755, RootFS: true},
{Path: "etc/cloud/cloud.cfg.d/90-metis-recovery.cfg", Content: []byte(cloudInitRootFSContent(sec)), Mode: 0o644, RootFS: true},
} }
if cfg.IP != "" { if cfg.IP != "" {
files = append(files, inject.FileSpec{ files = append(files, inject.FileSpec{
@ -148,6 +149,14 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
RootFS: true, RootFS: true,
}) })
} }
if passwordAuth := sshPasswordConfigContent(sec); passwordAuth != "" {
files = append(files, inject.FileSpec{
Path: "etc/ssh/sshd_config.d/90-metis-password-auth.conf",
Content: []byte(passwordAuth),
Mode: 0o644,
RootFS: true,
})
}
if cfg.SSHUser == "atlas" { if cfg.SSHUser == "atlas" {
sudoers := hecateSudoersContent(cfg.SSHUser) sudoers := hecateSudoersContent(cfg.SSHUser)
files = append(files, inject.FileSpec{ files = append(files, inject.FileSpec{
@ -172,8 +181,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
}) })
} }
// Store the raw config for debugging/ops. raw, err := jsonMarshalIndent(cfg)
raw, err := json.MarshalIndent(cfg, "", " ")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -184,7 +192,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
RootFS: true, RootFS: true,
}) })
if sec != nil { if sec != nil {
secRaw, err := json.MarshalIndent(sec, "", " ") secRaw, err := jsonMarshalIndent(redactedSecretsForImage(sec))
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -196,7 +204,6 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
}) })
} }
// Optional cloud-init for images that honor NoCloud.
userData := cloudInitUserData(cfg, sec) userData := cloudInitUserData(cfg, sec)
if userData != "" { if userData != "" {
files = append(files, inject.FileSpec{ files = append(files, inject.FileSpec{
@ -267,33 +274,6 @@ func allowK3sNodeLabel(role, key string) bool {
return !strings.HasPrefix(key, "node-role.kubernetes.io/") return !strings.HasPrefix(key, "node-role.kubernetes.io/")
} }
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
if cfg == nil {
return ""
}
if sec != nil && sec.CloudInit != "" {
return sec.CloudInit
}
var b bytes.Buffer
b.WriteString("#cloud-config\n")
b.WriteString(fmt.Sprintf("hostname: %s\n", cfg.Hostname))
if len(cfg.SSHKeys) > 0 {
b.WriteString("ssh_authorized_keys:\n")
for _, k := range cfg.SSHKeys {
b.WriteString(fmt.Sprintf(" - %s\n", k))
}
}
return b.String()
}
func firstbootEnvContent(cfg *config.NodeConfig) string {
var b bytes.Buffer
b.WriteString(fmt.Sprintf("METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname)))
b.WriteString(fmt.Sprintf("METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser)))
b.WriteString(fmt.Sprintf("METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version)))
return b.String()
}
func networkManagerConnectionContent(id, iface, ip string) string { func networkManagerConnectionContent(id, iface, ip string) string {
gateway := ip gateway := ip
if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 { if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 {
@ -347,7 +327,6 @@ func fstabAppendContent(cfg *config.NodeConfig) string {
source := entry.Source source := entry.Source
switch { switch {
case source != "": case source != "":
// Use the explicit source path for bind mounts.
case entry.UUID != "": case entry.UUID != "":
source = "UUID=" + entry.UUID source = "UUID=" + entry.UUID
case entry.Label != "": case entry.Label != "":
@ -374,25 +353,6 @@ func hecateSudoersContent(user string) string {
) )
} }
func shellQuote(value string) string {
if value == "" {
return "''"
}
return "'" + strings.ReplaceAll(value, "'", `'"'"'`) + "'"
}
func fetchSecrets(hostname string) *secrets.NodeSecrets {
if os.Getenv("VAULT_ADDR") == "" {
return nil
}
cli := secrets.NewFromEnv()
sec, err := cli.FetchNode(context.Background(), hostname)
if err != nil {
return nil
}
return sec
}
func collectOverlays(class *inventory.NodeClass) ([]inject.FileSpec, error) { func collectOverlays(class *inventory.NodeClass) ([]inject.FileSpec, error) {
var files []inject.FileSpec var files []inject.FileSpec
if class == nil { if class == nil {

View File

@ -174,3 +174,80 @@ func TestBuildFilesAddsHecateSudoersForAtlas(t *testing.T) {
t.Fatalf("metis sudoers backup missing/incorrect: %s", backup) t.Fatalf("metis sudoers backup missing/incorrect: %s", backup)
} }
} }
// TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets builds the file set for
// a node that has password secrets and checks that every password-related
// artifact is emitted while etc/metis/secrets.json contains no secret values.
func TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets(t *testing.T) {
cfg := &config.NodeConfig{
Hostname: "titan-15",
IP: "192.168.22.43",
SSHUser: "atlas",
SSHKeys: []string{"ssh-ed25519 AAA test"},
K3s: config.K3sConfig{
Role: "agent",
Version: "v1.31.5+k3s1",
},
}
sec := &secrets.NodeSecrets{
SSHPassword: "atlas-pass",
RootPassword: "root-pass",
K3sToken: "super-secret-token",
Extra: map[string]string{"api_key": "secret"},
}
files, err := buildFiles(cfg, sec)
if err != nil {
t.Fatalf("buildFiles: %v", err)
}
// Index the generated files by path so each artifact can be inspected directly.
pathMap := map[string]string{}
for _, file := range files {
pathMap[file.Path] = string(file.Content)
}
// SSHPassword is expected to surface as the atlas password in firstboot.env.
firstboot := pathMap["etc/metis/firstboot.env"]
if !strings.Contains(firstboot, "METIS_ATLAS_PASSWORD='atlas-pass'") || !strings.Contains(firstboot, "METIS_ROOT_PASSWORD='root-pass'") {
t.Fatalf("firstboot env missing password material: %s", firstboot)
}
if sshd := pathMap["etc/ssh/sshd_config.d/90-metis-password-auth.conf"]; !strings.Contains(sshd, "PasswordAuthentication yes") || !strings.Contains(sshd, "PermitRootLogin yes") {
t.Fatalf("password auth config missing: %s", sshd)
}
if script := pathMap["usr/local/sbin/metis-apply-node-identity.sh"]; !strings.Contains(script, "apply_password root") || !strings.Contains(script, "METIS_ATLAS_PASSWORD") {
t.Fatalf("node identity script missing password application: %s", script)
}
if cloudCfg := pathMap["etc/cloud/cloud.cfg.d/90-metis-recovery.cfg"]; !strings.Contains(cloudCfg, "ssh_pwauth: true") {
t.Fatalf("cloud recovery config missing ssh_pwauth: %s", cloudCfg)
}
if userData := pathMap["user-data"]; !strings.Contains(userData, "ssh_pwauth: true") || !strings.Contains(userData, "metis-apply-node-identity.sh") {
t.Fatalf("cloud-init user-data missing recovery hooks: %s", userData)
}
// The debug copy of the secrets must not contain any secret value, only
// presence flags and the names of the extra keys.
secretsJSON := pathMap["etc/metis/secrets.json"]
if strings.Contains(secretsJSON, "atlas-pass") || strings.Contains(secretsJSON, "root-pass") || strings.Contains(secretsJSON, "super-secret-token") {
t.Fatalf("secrets.json should be redacted: %s", secretsJSON)
}
if !strings.Contains(secretsJSON, `"has_ssh_password": true`) || !strings.Contains(secretsJSON, `"extra_keys": [`) {
t.Fatalf("secrets.json should keep redacted debug metadata: %s", secretsJSON)
}
}
// TestApplyNodeMetadataEnv checks that METIS_NODE_LABELS_JSON and
// METIS_NODE_TAINTS_JSON are merged into the node config, and that malformed
// JSON in either variable leaves the config untouched.
func TestApplyNodeMetadataEnv(t *testing.T) {
cfg := &config.NodeConfig{
Labels: map[string]string{"hardware": "rpi4"},
Taints: []string{"flash=true:NoSchedule"},
K3s: config.K3sConfig{
Labels: map[string]string{"hardware": "rpi4"},
Taints: []string{"flash=true:NoSchedule"},
},
}
// The env labels override the existing "hardware" value and add a new key;
// the env taints repeat one existing taint and add a new one.
t.Setenv("METIS_NODE_LABELS_JSON", `{"hardware":"rpi5","maintenance.bstein.dev/role":"recovery"}`)
t.Setenv("METIS_NODE_TAINTS_JSON", `["dedicated=recovery:NoSchedule","flash=true:NoSchedule"]`)
applyNodeMetadataEnv(cfg)
if cfg.Labels["hardware"] != "rpi5" || cfg.Labels["maintenance.bstein.dev/role"] != "recovery" {
t.Fatalf("applyNodeMetadataEnv labels = %#v", cfg.Labels)
}
if !strings.Contains(strings.Join(cfg.Taints, ","), "dedicated=recovery:NoSchedule") {
t.Fatalf("applyNodeMetadataEnv taints = %#v", cfg.Taints)
}
// Invalid JSON must be ignored: a fresh config keeps nil labels and taints.
cfg = &config.NodeConfig{}
t.Setenv("METIS_NODE_LABELS_JSON", `{bad-json`)
t.Setenv("METIS_NODE_TAINTS_JSON", `{bad-json`)
applyNodeMetadataEnv(cfg)
if cfg.Labels != nil || cfg.Taints != nil {
t.Fatalf("invalid env JSON should be ignored: %#v", cfg)
}
}

262
pkg/plan/node_identity.go Normal file
View File

@ -0,0 +1,262 @@
package plan
import (
"bytes"
"fmt"
"sort"
"strings"
"metis/pkg/config"
"metis/pkg/secrets"
)
// cloudInitUserData renders the cloud-init user-data document for a node.
//
// A nil cfg yields an empty string. When sec carries a non-empty CloudInit
// payload it is returned verbatim. Otherwise a #cloud-config document is
// generated containing the hostname, any configured SSH keys, the
// password-login switches when password material is present, and a runcmd
// entry that invokes the node identity script.
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
	switch {
	case cfg == nil:
		return ""
	case sec != nil && sec.CloudInit != "":
		return sec.CloudInit
	}

	var doc bytes.Buffer
	doc.WriteString("#cloud-config\n")
	fmt.Fprintf(&doc, "hostname: %s\n", cfg.Hostname)
	if len(cfg.SSHKeys) > 0 {
		doc.WriteString("ssh_authorized_keys:\n")
		for i := range cfg.SSHKeys {
			fmt.Fprintf(&doc, " - %s\n", cfg.SSHKeys[i])
		}
	}
	if hasNodePasswords(sec) {
		doc.WriteString("ssh_pwauth: true\n")
		doc.WriteString("disable_root: false\n")
	}
	doc.WriteString("runcmd:\n")
	doc.WriteString(" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n")
	return doc.String()
}
// firstbootEnvContent renders the shell-sourceable env file read on first
// boot.
//
// Hostname, SSH user, the fixed atlas user name and the k3s version are always
// written. Password and password-hash variables are appended only when sec is
// non-nil and the corresponding value is non-empty after trimming. Every
// dynamic value is single-quoted with shellQuote.
func firstbootEnvContent(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
	var env bytes.Buffer
	fmt.Fprintf(&env, "METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname))
	fmt.Fprintf(&env, "METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser))
	env.WriteString("METIS_ATLAS_USER='atlas'\n")
	fmt.Fprintf(&env, "METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version))
	if sec == nil {
		return env.String()
	}

	if atlasPassword := effectiveAtlasPassword(sec); atlasPassword != "" {
		fmt.Fprintf(&env, "METIS_ATLAS_PASSWORD=%s\n", shellQuote(atlasPassword))
	}
	if atlasHash := effectiveAtlasPasswordHash(sec); atlasHash != "" {
		fmt.Fprintf(&env, "METIS_ATLAS_PASSWORD_HASH=%s\n", shellQuote(atlasHash))
	}
	if rootPassword := strings.TrimSpace(sec.RootPassword); rootPassword != "" {
		fmt.Fprintf(&env, "METIS_ROOT_PASSWORD=%s\n", shellQuote(rootPassword))
	}
	if rootHash := strings.TrimSpace(sec.RootPasswordHash); rootHash != "" {
		fmt.Fprintf(&env, "METIS_ROOT_PASSWORD_HASH=%s\n", shellQuote(rootHash))
	}
	return env.String()
}
// cloudInitRootFSContent renders the cloud-config drop-in written into the
// root filesystem. It always schedules the node identity script through
// runcmd, and additionally enables SSH password auth and root login when sec
// carries password material.
func cloudInitRootFSContent(sec *secrets.NodeSecrets) string {
	var doc bytes.Buffer
	doc.WriteString("#cloud-config\n")
	if allowPasswords := hasNodePasswords(sec); allowPasswords {
		doc.WriteString("ssh_pwauth: true\n")
		doc.WriteString("disable_root: false\n")
	}
	doc.WriteString("runcmd:\n")
	doc.WriteString(" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n")
	return doc.String()
}
// nodeIdentityScriptContent returns the bash script installed as
// metis-apply-node-identity.sh.
//
// The script exits immediately when its marker file already exists, otherwise
// it:
//   - sources /etc/metis/firstboot.env when present;
//   - ensures the atlas user (and a distinct non-root SSH user, if configured)
//     exists and belongs to whichever of the default groups exist on the host;
//   - applies root and atlas passwords, preferring a password hash over a
//     plaintext password when both are supplied;
//   - installs /etc/metis/authorized_keys for root, the atlas user and the SSH
//     user;
//   - installs the hecate sudoers file and removes it again if visudo rejects it;
//   - restarts the SSH service and touches the marker file.
//
// NOTE(review): the script never removes or scrubs the env file, so any
// plaintext passwords written there remain on disk after being applied.
// NOTE(review): the chown in install_keys uses the user name as the group name;
// a failure there is ignored.
func nodeIdentityScriptContent() string {
return `#!/usr/bin/env bash
set -euo pipefail
marker="/var/lib/metis/node-identity-applied.done"
env_file="/etc/metis/firstboot.env"
key_file="/etc/metis/authorized_keys"
sudoers_file="/etc/metis/sudoers-hecate"
default_groups=(adm sudo tty disk dialout audio video plugdev games users systemd-journal input render netdev)
if [ -f "${marker}" ]; then
exit 0
fi
mkdir -p /var/lib/metis
if [ -f "${env_file}" ]; then
# shellcheck disable=SC1090
. "${env_file}"
fi
atlas_user="${METIS_ATLAS_USER:-atlas}"
ssh_user="${METIS_SSH_USER:-${atlas_user}}"
atlas_password="${METIS_ATLAS_PASSWORD:-}"
atlas_password_hash="${METIS_ATLAS_PASSWORD_HASH:-}"
root_password="${METIS_ROOT_PASSWORD:-}"
root_password_hash="${METIS_ROOT_PASSWORD_HASH:-}"
group_list=()
for group_name in "${default_groups[@]}"; do
if getent group "${group_name}" >/dev/null 2>&1; then
group_list+=("${group_name}")
fi
done
if [ "${#group_list[@]}" -gt 0 ]; then
group_csv="$(IFS=,; printf '%s' "${group_list[*]}")"
else
group_csv=""
fi
ensure_user() {
local user_name="$1"
[ -n "${user_name}" ] || return 0
if ! id "${user_name}" >/dev/null 2>&1; then
if [ -n "${group_csv}" ]; then
useradd -m -s /bin/bash -G "${group_csv}" "${user_name}"
else
useradd -m -s /bin/bash "${user_name}"
fi
elif [ -n "${group_csv}" ]; then
usermod -a -G "${group_csv}" "${user_name}" || true
fi
}
apply_password() {
local user_name="$1"
local plain_password="$2"
local hash_password="$3"
if ! id "${user_name}" >/dev/null 2>&1; then
return 0
fi
if [ -n "${hash_password}" ]; then
usermod -p "${hash_password}" "${user_name}"
passwd -u "${user_name}" >/dev/null 2>&1 || true
return 0
fi
if [ -n "${plain_password}" ]; then
printf '%s:%s\n' "${user_name}" "${plain_password}" | chpasswd
passwd -u "${user_name}" >/dev/null 2>&1 || true
fi
}
install_keys() {
local user_name="$1"
[ -n "${user_name}" ] || return 0
[ -s "${key_file}" ] || return 0
local home_dir
home_dir="$(getent passwd "${user_name}" | cut -d: -f6)"
if [ -z "${home_dir}" ]; then
if [ "${user_name}" = "root" ]; then
home_dir="/root"
else
home_dir="/home/${user_name}"
fi
fi
install -d -m 700 "${home_dir}/.ssh"
install -m 600 "${key_file}" "${home_dir}/.ssh/authorized_keys"
chown -R "${user_name}:${user_name}" "${home_dir}/.ssh" 2>/dev/null || true
}
ensure_user "${atlas_user}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
ensure_user "${ssh_user}"
fi
apply_password root "${root_password}" "${root_password_hash}"
apply_password "${atlas_user}" "${atlas_password}" "${atlas_password_hash}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
apply_password "${ssh_user}" "${atlas_password}" "${atlas_password_hash}"
fi
if [ -s "${key_file}" ]; then
install_keys root
install_keys "${atlas_user}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
install_keys "${ssh_user}"
fi
fi
if [ -s "${sudoers_file}" ]; then
install -d -m 755 /etc/sudoers.d
install -m 440 "${sudoers_file}" /etc/sudoers.d/90-hecate-atlas
if command -v visudo >/dev/null 2>&1; then
visudo -cf /etc/sudoers.d/90-hecate-atlas >/dev/null 2>&1 || rm -f /etc/sudoers.d/90-hecate-atlas
fi
fi
systemctl restart ssh.service >/dev/null 2>&1 || systemctl restart sshd.service >/dev/null 2>&1 || systemctl restart ssh.socket >/dev/null 2>&1 || true
touch "${marker}"
`
}
// sshPasswordConfigContent returns the sshd_config drop-in that enables
// password and root logins. It returns an empty string when sec carries no
// password material, so callers can skip writing the file.
func sshPasswordConfigContent(sec *secrets.NodeSecrets) string {
	const passwordAuthConfig = "PasswordAuthentication yes\n" +
		"KbdInteractiveAuthentication no\n" +
		"ChallengeResponseAuthentication no\n" +
		"PermitRootLogin yes\n" +
		"UsePAM yes\n"

	if hasNodePasswords(sec) {
		return passwordAuthConfig
	}
	return ""
}
// hasNodePasswords reports whether sec carries any atlas or root password
// material, plaintext or hashed. A nil sec reports false.
func hasNodePasswords(sec *secrets.NodeSecrets) bool {
	if sec == nil {
		return false
	}
	switch {
	case effectiveAtlasPassword(sec) != "":
		return true
	case effectiveAtlasPasswordHash(sec) != "":
		return true
	default:
		return firstNonEmptyString(sec.RootPassword, sec.RootPasswordHash) != ""
	}
}
// effectiveAtlasPassword returns the plaintext password to use for the atlas
// user: AtlasPassword when set, otherwise SSHPassword. It returns "" for a nil
// sec or when neither field has non-whitespace content.
func effectiveAtlasPassword(sec *secrets.NodeSecrets) string {
	if sec != nil {
		return firstNonEmptyString(sec.AtlasPassword, sec.SSHPassword)
	}
	return ""
}
// effectiveAtlasPasswordHash returns the password hash to use for the atlas
// user: AtlasPasswordHash when set, otherwise SSHPasswordHash. It returns ""
// for a nil sec or when neither field has non-whitespace content.
func effectiveAtlasPasswordHash(sec *secrets.NodeSecrets) string {
	if sec != nil {
		return firstNonEmptyString(sec.AtlasPasswordHash, sec.SSHPasswordHash)
	}
	return ""
}
// firstNonEmptyString returns the first argument that still has content after
// surrounding whitespace is trimmed. The trimmed form is what gets returned;
// "" is returned when no argument qualifies.
func firstNonEmptyString(values ...string) string {
	for i := range values {
		candidate := strings.TrimSpace(values[i])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
// redactedSecretsForImage builds the debug summary of sec that is safe to
// write into an image: boolean presence flags for each secret plus the sorted,
// trimmed names of the Extra keys. No secret value is included. A nil sec
// yields a nil map, and "extra_keys" is only set when Extra is non-empty.
func redactedSecretsForImage(sec *secrets.NodeSecrets) map[string]any {
	if sec == nil {
		return nil
	}

	present := func(values ...string) bool {
		return firstNonEmptyString(values...) != ""
	}
	summary := map[string]any{}
	summary["has_ssh_password"] = present(sec.SSHPassword, sec.SSHPasswordHash)
	summary["has_atlas_password"] = present(sec.AtlasPassword, sec.AtlasPasswordHash)
	summary["has_root_password"] = present(sec.RootPassword, sec.RootPasswordHash)
	summary["has_k3s_token"] = present(sec.K3sToken)
	summary["has_cloud_init_override"] = present(sec.CloudInit)
	if len(sec.Extra) == 0 {
		return summary
	}

	names := make([]string, 0, len(sec.Extra))
	for name := range sec.Extra {
		if name = strings.TrimSpace(name); name != "" {
			names = append(names, name)
		}
	}
	// Map iteration order is random; sort so the output is deterministic.
	sort.Strings(names)
	summary["extra_keys"] = names
	return summary
}
// shellQuote wraps value in single quotes for use in a POSIX shell
// assignment. Each embedded single quote is rewritten as '"'"' (close the
// quote, emit a double-quoted single quote, reopen). The empty string becomes
// ''.
func shellQuote(value string) string {
	const quote = "'"
	escaped := strings.ReplaceAll(value, quote, `'"'"'`)
	return quote + escaped + quote
}

133
pkg/plan/node_metadata.go Normal file
View File

@ -0,0 +1,133 @@
package plan
import (
"context"
"encoding/json"
"os"
"sort"
"strings"
"metis/pkg/config"
"metis/pkg/secrets"
)
// fetchSecrets gathers the secrets for hostname.
//
// Secrets supplied through METIS_NODE_* environment variables are always
// read. When VAULT_ADDR is set the node's Vault secrets are fetched as well
// and the environment values are layered on top of them. Lookup is
// best-effort: if Vault is not configured or the fetch fails, only the
// environment secrets (possibly nil) are returned.
func fetchSecrets(hostname string) *secrets.NodeSecrets {
	fromEnv := nodeSecretsFromEnv()
	if os.Getenv("VAULT_ADDR") != "" {
		fromVault, err := secrets.NewFromEnv().FetchNode(context.Background(), hostname)
		if err == nil {
			return mergeNodeSecrets(fromVault, fromEnv)
		}
	}
	return fromEnv
}
// nodeSecretsFromEnv reads password material from the METIS_NODE_*
// environment variables, trimming surrounding whitespace from each value. It
// returns nil when none of the six variables holds a value.
func nodeSecretsFromEnv() *secrets.NodeSecrets {
	readEnv := func(name string) string {
		return strings.TrimSpace(os.Getenv(name))
	}
	sec := secrets.NodeSecrets{
		SSHPassword:       readEnv("METIS_NODE_SSH_PASSWORD"),
		SSHPasswordHash:   readEnv("METIS_NODE_SSH_PASSWORD_HASH"),
		AtlasPassword:     readEnv("METIS_NODE_ATLAS_PASSWORD"),
		AtlasPasswordHash: readEnv("METIS_NODE_ATLAS_PASSWORD_HASH"),
		RootPassword:      readEnv("METIS_NODE_ROOT_PASSWORD"),
		RootPasswordHash:  readEnv("METIS_NODE_ROOT_PASSWORD_HASH"),
	}
	populated := []string{
		sec.SSHPassword, sec.SSHPasswordHash,
		sec.AtlasPassword, sec.AtlasPasswordHash,
		sec.RootPassword, sec.RootPasswordHash,
	}
	for _, value := range populated {
		if value != "" {
			return &sec
		}
	}
	return nil
}
// mergeNodeSecrets layers override on top of base and returns the result.
//
// If either argument is nil the other is returned as-is. Otherwise a copy of
// base is made and, for each scalar field, the override value wins when it is
// non-empty after trimming (the chosen value is stored trimmed). Extra maps
// are combined into a new map, with override entries replacing base entries
// that share a key.
func mergeNodeSecrets(base, override *secrets.NodeSecrets) *secrets.NodeSecrets {
	switch {
	case base == nil:
		return override
	case override == nil:
		return base
	}

	result := *base
	pick := firstNonEmptyString
	result.SSHPassword = pick(override.SSHPassword, base.SSHPassword)
	result.SSHPasswordHash = pick(override.SSHPasswordHash, base.SSHPasswordHash)
	result.AtlasPassword = pick(override.AtlasPassword, base.AtlasPassword)
	result.AtlasPasswordHash = pick(override.AtlasPasswordHash, base.AtlasPasswordHash)
	result.RootPassword = pick(override.RootPassword, base.RootPassword)
	result.RootPasswordHash = pick(override.RootPasswordHash, base.RootPasswordHash)
	result.K3sToken = pick(override.K3sToken, base.K3sToken)
	result.CloudInit = pick(override.CloudInit, base.CloudInit)

	if total := len(base.Extra) + len(override.Extra); total > 0 {
		combined := make(map[string]string, total)
		for _, source := range []map[string]string{base.Extra, override.Extra} {
			for key, value := range source {
				combined[key] = value
			}
		}
		result.Extra = combined
	}
	return &result
}
// applyNodeMetadataEnv overlays node labels and taints supplied through the
// environment onto cfg.
//
// METIS_NODE_LABELS_JSON is parsed as a JSON object of string values; its
// entries are written into cfg.Labels (overriding existing keys) and the
// merged label set is then copied into cfg.K3s.Labels. METIS_NODE_TAINTS_JSON
// is parsed as a JSON array of strings; its entries are appended to cfg.Taints,
// the result is trimmed, de-duplicated and sorted, and a copy is stored in
// cfg.K3s.Taints. Empty or malformed variables are ignored and leave the
// corresponding fields untouched. A nil cfg is a no-op.
func applyNodeMetadataEnv(cfg *config.NodeConfig) {
	if cfg == nil {
		return
	}
	if labels := parseEnvJSONMap(os.Getenv("METIS_NODE_LABELS_JSON")); len(labels) > 0 {
		if cfg.Labels == nil {
			cfg.Labels = make(map[string]string, len(labels))
		}
		for key, value := range labels {
			cfg.Labels[key] = value
		}
		// Give K3s its own copy, as is done for taints below. Sharing one map
		// between cfg.Labels and cfg.K3s.Labels would make a later write to
		// either field silently change the other.
		k3sLabels := make(map[string]string, len(cfg.Labels))
		for key, value := range cfg.Labels {
			k3sLabels[key] = value
		}
		cfg.K3s.Labels = k3sLabels
	}
	if taints := parseEnvJSONList(os.Getenv("METIS_NODE_TAINTS_JSON")); len(taints) > 0 {
		// Build the combined list in a fresh slice so that append never writes
		// into spare capacity of the caller's cfg.Taints backing array.
		combined := make([]string, 0, len(cfg.Taints)+len(taints))
		combined = append(combined, cfg.Taints...)
		combined = append(combined, taints...)
		cfg.Taints = uniqueStrings(combined)
		cfg.K3s.Taints = append([]string{}, cfg.Taints...)
	}
}
// parseEnvJSONMap decodes raw as a JSON object with string values. Blank
// input and input that fails to decode both yield nil, so callers can treat
// an unset and a malformed variable the same way.
func parseEnvJSONMap(raw string) map[string]string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	var parsed map[string]string
	if json.Unmarshal([]byte(trimmed), &parsed) != nil {
		return nil
	}
	return parsed
}
// parseEnvJSONList decodes raw as a JSON array of strings. Blank input and
// input that fails to decode both yield nil.
func parseEnvJSONList(raw string) []string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	var parsed []string
	if json.Unmarshal([]byte(trimmed), &parsed) != nil {
		return nil
	}
	return parsed
}
// uniqueStrings returns the distinct, non-blank entries of values with
// surrounding whitespace trimmed, sorted in ascending order. The input slice
// is not modified; the result is always a freshly allocated, non-nil slice.
func uniqueStrings(values []string) []string {
	result := make([]string, 0, len(values))
	known := map[string]struct{}{}
	for i := range values {
		item := strings.TrimSpace(values[i])
		if item == "" {
			continue
		}
		if _, duplicate := known[item]; !duplicate {
			known[item] = struct{}{}
			result = append(result, item)
		}
	}
	sort.Strings(result)
	return result
}
// jsonMarshalIndent encodes value as indented JSON using the package's
// standard layout: no line prefix and the indent string below per level.
func jsonMarshalIndent(value any) ([]byte, error) {
	const (
		prefix = ""
		indent = " "
	)
	return json.MarshalIndent(value, prefix, indent)
}

View File

@ -0,0 +1,127 @@
package plan
import (
"reflect"
"strings"
"testing"
"metis/pkg/config"
"metis/pkg/secrets"
)
// TestNodeSecretHelpers covers the small secret helpers: nil handling and
// SSH-to-atlas fallback in effectiveAtlasPassword/effectiveAtlasPasswordHash,
// trimming in firstNonEmptyString, hasNodePasswords detection, and sorted
// extra_keys output from redactedSecretsForImage.
func TestNodeSecretHelpers(t *testing.T) {
if got := effectiveAtlasPassword(nil); got != "" {
t.Fatalf("effectiveAtlasPassword(nil) = %q", got)
}
if got := effectiveAtlasPasswordHash(nil); got != "" {
t.Fatalf("effectiveAtlasPasswordHash(nil) = %q", got)
}
// With only SSH credentials set, the atlas helpers fall back to them.
sec := &secrets.NodeSecrets{SSHPassword: "ssh-pass", SSHPasswordHash: "$ssh$hash"}
if got := effectiveAtlasPassword(sec); got != "ssh-pass" {
t.Fatalf("effectiveAtlasPassword fallback = %q", got)
}
if got := effectiveAtlasPasswordHash(sec); got != "$ssh$hash" {
t.Fatalf("effectiveAtlasPasswordHash fallback = %q", got)
}
// Explicit atlas credentials take precedence over the SSH ones.
sec.AtlasPassword = "atlas-pass"
sec.AtlasPasswordHash = "$atlas$hash"
if got := effectiveAtlasPassword(sec); got != "atlas-pass" {
t.Fatalf("effectiveAtlasPassword explicit = %q", got)
}
if got := effectiveAtlasPasswordHash(sec); got != "$atlas$hash" {
t.Fatalf("effectiveAtlasPasswordHash explicit = %q", got)
}
if got := firstNonEmptyString("", " value ", "ignored"); got != "value" {
t.Fatalf("firstNonEmptyString = %q", got)
}
if !hasNodePasswords(&secrets.NodeSecrets{RootPasswordHash: "$root$hash"}) {
t.Fatal("expected root password hash to count as password material")
}
if hasNodePasswords(&secrets.NodeSecrets{}) {
t.Fatal("empty node secrets should not count as password material")
}
// Extra keys must come back sorted regardless of map iteration order.
debug := redactedSecretsForImage(&secrets.NodeSecrets{Extra: map[string]string{"b": "2", "a": "1"}})
if !reflect.DeepEqual(debug["extra_keys"], []string{"a", "b"}) {
t.Fatalf("redactedSecretsForImage extra_keys = %#v", debug)
}
}
// TestNodeSecretsFromEnvAndMergeNodeSecrets checks that nodeSecretsFromEnv
// picks up the METIS_NODE_* variables (and returns nil once they are all
// empty), and that mergeNodeSecrets prefers override values, combines Extra
// maps, and passes through a lone non-nil argument.
func TestNodeSecretsFromEnvAndMergeNodeSecrets(t *testing.T) {
t.Setenv("METIS_NODE_SSH_PASSWORD", "ssh-pass")
t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "$ssh$hash")
t.Setenv("METIS_NODE_ATLAS_PASSWORD", "atlas-pass")
t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "$atlas$hash")
t.Setenv("METIS_NODE_ROOT_PASSWORD", "root-pass")
t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "$root$hash")
envSecrets := nodeSecretsFromEnv()
if envSecrets == nil || envSecrets.RootPassword != "root-pass" || envSecrets.AtlasPasswordHash != "$atlas$hash" {
t.Fatalf("nodeSecretsFromEnv = %#v", envSecrets)
}
// Non-empty override fields win; Extra maps from both sides are kept.
merged := mergeNodeSecrets(&secrets.NodeSecrets{
SSHPassword: "base-ssh",
K3sToken: "base-token",
CloudInit: "base-cloud",
Extra: map[string]string{"base": "1"},
}, &secrets.NodeSecrets{
AtlasPassword: "override-atlas",
RootPassword: "override-root",
K3sToken: "override-token",
CloudInit: "override-cloud",
Extra: map[string]string{"override": "2"},
})
if merged.K3sToken != "override-token" || merged.CloudInit != "override-cloud" || merged.AtlasPassword != "override-atlas" || merged.RootPassword != "override-root" {
t.Fatalf("mergeNodeSecrets = %#v", merged)
}
if merged.Extra["base"] != "1" || merged.Extra["override"] != "2" {
t.Fatalf("mergeNodeSecrets extras = %#v", merged.Extra)
}
// A nil argument on either side returns the other argument's data.
if got := mergeNodeSecrets(nil, envSecrets); got.RootPasswordHash != "$root$hash" {
t.Fatalf("mergeNodeSecrets nil base = %#v", got)
}
if got := mergeNodeSecrets(envSecrets, nil); got.SSHPassword != "ssh-pass" {
t.Fatalf("mergeNodeSecrets nil override = %#v", got)
}
// Clearing every variable must make nodeSecretsFromEnv return nil.
t.Setenv("METIS_NODE_SSH_PASSWORD", "")
t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "")
t.Setenv("METIS_NODE_ATLAS_PASSWORD", "")
t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "")
t.Setenv("METIS_NODE_ROOT_PASSWORD", "")
t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "")
if got := nodeSecretsFromEnv(); got != nil {
t.Fatalf("expected empty env secrets to collapse to nil, got %#v", got)
}
}
// TestFirstbootEnvContentIncludesHashes checks that firstbootEnvContent emits
// exactly the base variables plus the two password-hash variables when only
// hashes are supplied, with each dynamic value single-quoted.
func TestFirstbootEnvContentIncludesHashes(t *testing.T) {
cfg := &config.NodeConfig{
Hostname: "titan-15",
SSHUser: "atlas",
K3s: config.K3sConfig{Version: "v1.31.5+k3s1"},
}
content := firstbootEnvContent(cfg, &secrets.NodeSecrets{
AtlasPasswordHash: "$atlas$hash",
RootPasswordHash: "$root$hash",
})
// DeepEqual on the parsed map also proves no plaintext password variables
// were written.
if !reflect.DeepEqual(parseEnvLines(content), map[string]string{
"METIS_HOSTNAME": "'titan-15'",
"METIS_SSH_USER": "'atlas'",
"METIS_ATLAS_USER": "'atlas'",
"METIS_K3S_VERSION": "'v1.31.5+k3s1'",
"METIS_ATLAS_PASSWORD_HASH": "'$atlas$hash'",
"METIS_ROOT_PASSWORD_HASH": "'$root$hash'",
}) {
t.Fatalf("firstbootEnvContent = %q", content)
}
}
// parseEnvLines splits raw into KEY=VALUE lines and returns them as a map.
// Each line is cut at its first "="; lines without one are skipped. Values
// are kept verbatim, including any surrounding quotes.
func parseEnvLines(raw string) map[string]string {
	parsed := make(map[string]string)
	lines := strings.Split(strings.TrimSpace(raw), "\n")
	for i := range lines {
		name, value, found := strings.Cut(lines[i], "=")
		if !found {
			continue
		}
		parsed[name] = value
	}
	return parsed
}

View File

@ -16,6 +16,11 @@ import (
// These should live in Vault at secret/data/nodes/<hostname>. // These should live in Vault at secret/data/nodes/<hostname>.
type NodeSecrets struct { type NodeSecrets struct {
SSHPassword string `json:"ssh_password,omitempty"` SSHPassword string `json:"ssh_password,omitempty"`
SSHPasswordHash string `json:"ssh_password_hash,omitempty"`
AtlasPassword string `json:"atlas_password,omitempty"`
AtlasPasswordHash string `json:"atlas_password_hash,omitempty"`
RootPassword string `json:"root_password,omitempty"`
RootPasswordHash string `json:"root_password_hash,omitempty"`
K3sToken string `json:"k3s_token,omitempty"` K3sToken string `json:"k3s_token,omitempty"`
CloudInit string `json:"cloud_init,omitempty"` CloudInit string `json:"cloud_init,omitempty"`
Extra map[string]string `json:"extra,omitempty"` Extra map[string]string `json:"extra,omitempty"`

View File

@ -17,6 +17,8 @@ func TestFetchNodeReturnsData(t *testing.T) {
"data": map[string]any{ "data": map[string]any{
"data": map[string]any{ "data": map[string]any{
"ssh_password": "p1", "ssh_password": "p1",
"atlas_password_hash": "$atlas$hash",
"root_password": "root-pw",
"k3s_token": "t1", "k3s_token": "t1",
"cloud_init": "ci", "cloud_init": "ci",
}, },
@ -33,7 +35,7 @@ func TestFetchNodeReturnsData(t *testing.T) {
if err != nil { if err != nil {
t.Fatalf("fetch: %v", err) t.Fatalf("fetch: %v", err)
} }
if sec.SSHPassword != "p1" || sec.K3sToken != "t1" || sec.CloudInit != "ci" { if sec.SSHPassword != "p1" || sec.AtlasPasswordHash != "$atlas$hash" || sec.RootPassword != "root-pw" || sec.K3sToken != "t1" || sec.CloudInit != "ci" {
t.Fatalf("unexpected secrets: %+v", sec) t.Fatalf("unexpected secrets: %+v", sec)
} }
} }

View File

@ -123,10 +123,18 @@ type App struct {
targets map[string]facts.Targets targets map[string]facts.Targets
artifactStore map[string]ArtifactSummary artifactStore map[string]ArtifactSummary
deviceStore map[string]deviceSnapshot deviceStore map[string]deviceSnapshot
desiredMetadata map[string]DesiredNodeMetadata
} }
// NewApp creates a Metis service app instance. // NewApp creates a Metis service app instance.
func NewApp(settings Settings) (*App, error) { func NewApp(settings Settings) (*App, error) {
if strings.TrimSpace(settings.DesiredMetadataPath) == "" {
baseDir := filepath.Dir(settings.SnapshotsPath)
if strings.TrimSpace(baseDir) == "" || baseDir == "." {
baseDir = filepath.Dir(settings.HistoryPath)
}
settings.DesiredMetadataPath = filepath.Join(baseDir, "desired-node-metadata.json")
}
if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil { if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil {
return nil, err return nil, err
} }
@ -149,10 +157,12 @@ func NewApp(settings Settings) (*App, error) {
targets: map[string]facts.Targets{}, targets: map[string]facts.Targets{},
artifactStore: map[string]ArtifactSummary{}, artifactStore: map[string]ArtifactSummary{},
deviceStore: map[string]deviceSnapshot{}, deviceStore: map[string]deviceSnapshot{},
desiredMetadata: map[string]DesiredNodeMetadata{},
} }
_ = app.loadSnapshots() _ = app.loadSnapshots()
_ = app.loadTargets() _ = app.loadTargets()
_ = app.loadArtifacts() _ = app.loadArtifacts()
_ = app.loadDesiredNodeMetadata()
return app, nil return app, nil
} }
@ -289,6 +299,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error {
if err := a.syncScratchAnnotations(record); err != nil { if err := a.syncScratchAnnotations(record); err != nil {
a.appendEvent(annotationSyncEvent(record.Node, err)) a.appendEvent(annotationSyncEvent(record.Node, err))
} }
if err := a.syncDesiredNodeMetadata(record); err != nil {
a.appendEvent(desiredNodeMetadataSyncEvent(record.Node, err))
}
a.appendEvent(Event{ a.appendEvent(Event{
Time: record.CollectedAt, Time: record.CollectedAt,
Kind: "sentinel.snapshot", Kind: "sentinel.snapshot",

View File

@ -22,6 +22,9 @@ type clusterNode struct {
Worker bool Worker bool
ControlPlane bool ControlPlane bool
Unschedulable bool Unschedulable bool
Labels map[string]string
Annotations map[string]string
Taints []string
USBScratchStatus string USBScratchStatus string
USBScratchManagedPaths string USBScratchManagedPaths string
} }
@ -179,6 +182,11 @@ func clusterNodes() []clusterNode {
} `json:"metadata"` } `json:"metadata"`
Spec struct { Spec struct {
Unschedulable bool `json:"unschedulable"` Unschedulable bool `json:"unschedulable"`
Taints []struct {
Key string `json:"key"`
Value string `json:"value"`
Effect string `json:"effect"`
} `json:"taints"`
} `json:"spec"` } `json:"spec"`
} `json:"items"` } `json:"items"`
} }
@ -189,6 +197,28 @@ func clusterNodes() []clusterNode {
for _, item := range payload.Items { for _, item := range payload.Items {
labels := item.Metadata.Labels labels := item.Metadata.Labels
annotations := item.Metadata.Annotations annotations := item.Metadata.Annotations
if labels == nil {
labels = map[string]string{}
}
if annotations == nil {
annotations = map[string]string{}
}
taints := make([]string, 0, len(item.Spec.Taints))
for _, taint := range item.Spec.Taints {
key := strings.TrimSpace(taint.Key)
if key == "" {
continue
}
raw := key
if value := strings.TrimSpace(taint.Value); value != "" {
raw += "=" + value
}
if effect := strings.TrimSpace(taint.Effect); effect != "" {
raw += ":" + effect
}
taints = append(taints, raw)
}
sort.Strings(taints)
nodes = append(nodes, clusterNode{ nodes = append(nodes, clusterNode{
Name: strings.TrimSpace(item.Metadata.Name), Name: strings.TrimSpace(item.Metadata.Name),
Arch: strings.TrimSpace(labels["kubernetes.io/arch"]), Arch: strings.TrimSpace(labels["kubernetes.io/arch"]),
@ -196,6 +226,9 @@ func clusterNodes() []clusterNode {
Worker: labels["node-role.kubernetes.io/worker"] == "true", Worker: labels["node-role.kubernetes.io/worker"] == "true",
ControlPlane: labels["node-role.kubernetes.io/control-plane"] != "" || labels["node-role.kubernetes.io/master"] != "", ControlPlane: labels["node-role.kubernetes.io/control-plane"] != "" || labels["node-role.kubernetes.io/master"] != "",
Unschedulable: item.Spec.Unschedulable, Unschedulable: item.Spec.Unschedulable,
Labels: labels,
Annotations: annotations,
Taints: taints,
USBScratchStatus: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-status"]), USBScratchStatus: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-status"]),
USBScratchManagedPaths: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-managed-paths"]), USBScratchManagedPaths: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-managed-paths"]),
}) })

View File

@ -37,6 +37,11 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) {
t.Fatal("expected persistTargets to fail when parent is a file") t.Fatal("expected persistTargets to fail when parent is a file")
} }
app.settings.DesiredMetadataPath = filepath.Join(fileParent, "desired-node-metadata.json")
if err := app.persistDesiredNodeMetadata(); err == nil {
t.Fatal("expected persistDesiredNodeMetadata to fail when parent is a file")
}
invalidArtifactState := filepath.Join(t.TempDir(), "artifacts.json") invalidArtifactState := filepath.Join(t.TempDir(), "artifacts.json")
if err := os.WriteFile(invalidArtifactState, []byte("{bad-json"), 0o644); err != nil { if err := os.WriteFile(invalidArtifactState, []byte("{bad-json"), 0o644); err != nil {
t.Fatal(err) t.Fatal(err)
@ -45,6 +50,15 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) {
if err := app.loadArtifacts(); err == nil { if err := app.loadArtifacts(); err == nil {
t.Fatal("expected loadArtifacts to reject invalid json") t.Fatal("expected loadArtifacts to reject invalid json")
} }
invalidDesiredState := filepath.Join(t.TempDir(), "desired-node-metadata.json")
if err := os.WriteFile(invalidDesiredState, []byte("{bad-json"), 0o644); err != nil {
t.Fatal(err)
}
app.settings.DesiredMetadataPath = invalidDesiredState
if err := app.loadDesiredNodeMetadata(); err == nil {
t.Fatal("expected loadDesiredNodeMetadata to reject invalid json")
}
} }
func TestServiceReplacementAndDeviceBranches(t *testing.T) { func TestServiceReplacementAndDeviceBranches(t *testing.T) {

View File

@ -162,6 +162,7 @@ nodes:
snapshotsPath := filepath.Join(dir, "snapshots.json") snapshotsPath := filepath.Join(dir, "snapshots.json")
targetsPath := filepath.Join(dir, "targets.json") targetsPath := filepath.Join(dir, "targets.json")
artifactStatePath := filepath.Join(dir, "artifacts.json") artifactStatePath := filepath.Join(dir, "artifacts.json")
desiredMetadataPath := filepath.Join(dir, "desired-node-metadata.json")
seedSnapshots := map[string]SnapshotRecord{ seedSnapshots := map[string]SnapshotRecord{
"titan-15": { "titan-15": {
@ -190,6 +191,19 @@ nodes:
if err := os.WriteFile(artifactStatePath, data, 0o644); err != nil { if err := os.WriteFile(artifactStatePath, data, 0o644); err != nil {
t.Fatal(err) t.Fatal(err)
} }
seedDesiredMetadata := map[string]DesiredNodeMetadata{
"titan-15": {
Node: "titan-15",
Hostname: "titan-15",
CapturedAt: testTime(t),
Labels: map[string]string{"hardware": "rpi5"},
Taints: []string{"dedicated=recovery:NoSchedule"},
},
}
data, _ = json.MarshalIndent(seedDesiredMetadata, "", " ")
if err := os.WriteFile(desiredMetadataPath, data, 0o644); err != nil {
t.Fatal(err)
}
app, err := NewApp(Settings{ app, err := NewApp(Settings{
InventoryPath: invPath, InventoryPath: invPath,
@ -199,6 +213,7 @@ nodes:
HistoryPath: filepath.Join(dir, "history.jsonl"), HistoryPath: filepath.Join(dir, "history.jsonl"),
SnapshotsPath: snapshotsPath, SnapshotsPath: snapshotsPath,
TargetsPath: targetsPath, TargetsPath: targetsPath,
DesiredMetadataPath: desiredMetadataPath,
DefaultFlashHost: "titan-22", DefaultFlashHost: "titan-22",
FlashHosts: []string{"titan-22"}, FlashHosts: []string{"titan-22"},
LocalHost: "titan-22", LocalHost: "titan-22",
@ -211,6 +226,9 @@ nodes:
if got := app.artifacts()["titan-15"].Ref; got != "reg/proj/titan-15:latest" { if got := app.artifacts()["titan-15"].Ref; got != "reg/proj/titan-15:latest" {
t.Fatalf("artifacts() = %q", got) t.Fatalf("artifacts() = %q", got)
} }
if desired, ok := app.desiredMetadataForNode("titan-15"); !ok || desired.Labels["hardware"] != "rpi5" {
t.Fatalf("desiredMetadataForNode() = %#v ok=%v", desired, ok)
}
if err := app.recordArtifact(ArtifactSummary{Node: "titan-15", Ref: "reg/proj/titan-15:v2"}); err != nil { if err := app.recordArtifact(ArtifactSummary{Node: "titan-15", Ref: "reg/proj/titan-15:v2"}); err != nil {
t.Fatalf("recordArtifact: %v", err) t.Fatalf("recordArtifact: %v", err)
} }

View File

@ -0,0 +1,483 @@
package service
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
"metis/pkg/config"
)
// DesiredNodeMetadata captures the node identity Metis should preserve through
// recovery builds and re-assert after the node rejoins the cluster.
//
// Records are stored in a JSON document keyed by node name (see
// loadDesiredNodeMetadata / persistDesiredNodeMetadata).
type DesiredNodeMetadata struct {
	// Node is the node name the record belongs to.
	Node string `json:"node"`
	// Hostname is the node's hostname; staging fills it from the inventory
	// node spec.
	Hostname string `json:"hostname,omitempty"`
	// CapturedAt is the UTC time at which the record was staged.
	// NOTE: encoding/json's omitempty has no effect on struct values such as
	// time.Time, so this field is always written, even when zero.
	CapturedAt time.Time `json:"captured_at,omitempty"`
	// Labels holds the node labels to re-assert.
	Labels map[string]string `json:"labels,omitempty"`
	// Annotations holds the node annotations to re-assert.
	Annotations map[string]string `json:"annotations,omitempty"`
	// Taints holds taints encoded as "key[=value][:effect]" strings.
	Taints []string `json:"taints,omitempty"`
	// Unschedulable is the desired value of the node's spec.unschedulable.
	Unschedulable bool `json:"unschedulable,omitempty"`
}
// loadDesiredNodeMetadata reads the JSON document at
// settings.DesiredMetadataPath and installs it as the in-memory desired
// metadata map. Read and decode errors are returned unchanged, and the
// in-memory state is left untouched when either step fails.
func (a *App) loadDesiredNodeMetadata() error {
	raw, readErr := os.ReadFile(a.settings.DesiredMetadataPath)
	if readErr != nil {
		return readErr
	}

	var loaded map[string]DesiredNodeMetadata
	if decodeErr := json.Unmarshal(raw, &loaded); decodeErr != nil {
		return decodeErr
	}

	// Swap the whole map in one step under the write lock.
	a.mu.Lock()
	defer a.mu.Unlock()
	a.desiredMetadata = loaded
	return nil
}
// persistDesiredNodeMetadata serializes the in-memory desired metadata map as
// indented JSON and writes it to settings.DesiredMetadataPath, creating the
// parent directory when needed. The map is only read while the read lock is
// held; file-system work happens after the lock is released.
func (a *App) persistDesiredNodeMetadata() error {
	encoded, encodeErr := func() ([]byte, error) {
		a.mu.RLock()
		defer a.mu.RUnlock()
		return json.MarshalIndent(a.desiredMetadata, "", " ")
	}()
	if encodeErr != nil {
		return encodeErr
	}

	target := a.settings.DesiredMetadataPath
	if mkdirErr := os.MkdirAll(filepath.Dir(target), 0o755); mkdirErr != nil {
		return mkdirErr
	}
	return os.WriteFile(target, encoded, 0o644)
}
// desiredMetadataForNode returns a copy of the stored desired metadata for
// node. The name is whitespace-trimmed before the lookup; a blank name or an
// unknown node yields the zero value and false.
func (a *App) desiredMetadataForNode(node string) (DesiredNodeMetadata, bool) {
	name := strings.TrimSpace(node)
	if name == "" {
		return DesiredNodeMetadata{}, false
	}

	a.mu.RLock()
	defer a.mu.RUnlock()
	if stored, found := a.desiredMetadata[name]; found {
		// Hand out a clone so callers cannot mutate the stored maps/slices.
		return cloneDesiredNodeMetadata(stored), true
	}
	return DesiredNodeMetadata{}, false
}
// stageDesiredNodeMetadata rebuilds, stores, and persists the desired metadata
// record for nodeName, and returns a copy of the stored record.
//
// The record is layered from three sources, later ones overriding earlier ones
// (see mergeDesiredNodeMetadata):
//  1. the inventory/config view of the node (hostname, labels, taints),
//  2. the previously stored record, if any,
//  3. the live cluster node, if it is currently visible.
//
// CapturedAt always reflects this staging call. It returns an error when the
// node name is blank, the node cannot be resolved from the inventory, the
// config cannot be built, or the state file cannot be written.
func (a *App) stageDesiredNodeMetadata(nodeName string) (DesiredNodeMetadata, error) {
	nodeName = strings.TrimSpace(nodeName)
	if nodeName == "" {
		return DesiredNodeMetadata{}, fmt.Errorf("node metadata requires a node name")
	}
	nodeSpec, _, err := a.inventory.FindNode(nodeName)
	if err != nil {
		return DesiredNodeMetadata{}, err
	}
	cfg, err := config.Build(a.inventory, nodeName)
	if err != nil {
		return DesiredNodeMetadata{}, err
	}

	capturedAt := time.Now().UTC()
	desired := DesiredNodeMetadata{
		Node:       nodeName,
		Hostname:   strings.TrimSpace(nodeSpec.Hostname),
		CapturedAt: capturedAt,
		Labels:     filteredRestorableLabels(cfg.Labels),
		Taints:     restorableTaints(cfg.Taints),
	}
	if existing, ok := a.desiredMetadataForNode(nodeName); ok {
		desired = mergeDesiredNodeMetadata(desired, existing)
	}
	if live, ok := liveClusterNode(nodeName); ok {
		desired = mergeDesiredNodeMetadata(desired, desiredMetadataFromCluster(*live))
	}
	// Merging the previously stored record copies its (older) CapturedAt over
	// the fresh one; re-stamp so the record reports when it was last staged.
	desired.CapturedAt = capturedAt

	desired.Labels = normalizeStringMap(desired.Labels)
	desired.Annotations = normalizeStringMap(desired.Annotations)
	desired.Taints = normalizeTaints(desired.Taints)

	a.mu.Lock()
	if a.desiredMetadata == nil {
		a.desiredMetadata = map[string]DesiredNodeMetadata{}
	}
	a.desiredMetadata[nodeName] = desired
	a.mu.Unlock()

	if err := a.persistDesiredNodeMetadata(); err != nil {
		return DesiredNodeMetadata{}, err
	}
	return cloneDesiredNodeMetadata(desired), nil
}
// syncDesiredNodeMetadata re-asserts the stored desired metadata for
// record.Node on the live cluster node. It is a no-op (nil error) when no
// record is stored for the node or the node is not found among the live
// cluster nodes.
func (a *App) syncDesiredNodeMetadata(record SnapshotRecord) error {
	desired, haveDesired := a.desiredMetadataForNode(record.Node)
	if haveDesired {
		if live, haveLive := liveClusterNode(record.Node); haveLive {
			return patchDesiredNodeMetadata(*live, desired)
		}
	}
	return nil
}
// desiredMetadataFromCluster converts a live cluster node into a desired
// metadata record, keeping only restorable labels, annotations, and taints and
// copying the node's unschedulable flag. Hostname and CapturedAt are left at
// their zero values.
func desiredMetadataFromCluster(node clusterNode) DesiredNodeMetadata {
	var captured DesiredNodeMetadata
	captured.Node = strings.TrimSpace(node.Name)
	captured.Labels = filteredRestorableLabels(node.Labels)
	captured.Annotations = filteredRestorableAnnotations(node.Annotations)
	captured.Taints = restorableTaints(node.Taints)
	captured.Unschedulable = node.Unschedulable
	return captured
}
// mergeDesiredNodeMetadata returns a copy of base with overlay applied on top.
//
// Hostname and CapturedAt are taken from overlay only when set; label and
// annotation entries from overlay are added to (or replace) those of base;
// a non-empty overlay taint list replaces the base list entirely; and
// Unschedulable is always taken from overlay. The returned Labels and
// Annotations maps are never nil.
func mergeDesiredNodeMetadata(base, overlay DesiredNodeMetadata) DesiredNodeMetadata {
	result := cloneDesiredNodeMetadata(base)

	if name := strings.TrimSpace(overlay.Hostname); name != "" {
		result.Hostname = name
	}
	if stamp := overlay.CapturedAt; !stamp.IsZero() {
		result.CapturedAt = stamp
	}

	// copyTrimmed writes every non-blank key of src into dst, trimming
	// surrounding whitespace from both key and value.
	copyTrimmed := func(dst, src map[string]string) {
		for k, v := range src {
			k = strings.TrimSpace(k)
			if k == "" {
				continue
			}
			dst[k] = strings.TrimSpace(v)
		}
	}

	if result.Labels == nil {
		result.Labels = map[string]string{}
	}
	copyTrimmed(result.Labels, overlay.Labels)

	if result.Annotations == nil {
		result.Annotations = map[string]string{}
	}
	copyTrimmed(result.Annotations, overlay.Annotations)

	if len(overlay.Taints) != 0 {
		result.Taints = normalizeTaints(overlay.Taints)
	}
	result.Unschedulable = overlay.Unschedulable
	return result
}
// patchDesiredNodeMetadata computes the difference between the live node and
// the desired record and, when anything differs, sends it as a single merge
// patch to /api/v1/nodes/<node>.
//
// The node name comes from desired.Node, falling back to live.Name; with no
// name, or with nothing to change, it returns nil without creating a client.
// NOTE(review): the taint list in the patch is built from the live snapshot
// passed in, so it presumably can race with taint changes made after that
// snapshot was taken — confirm whether that matters for callers.
func patchDesiredNodeMetadata(live clusterNode, desired DesiredNodeMetadata) error {
	target := strings.TrimSpace(desired.Node)
	if target == "" {
		target = strings.TrimSpace(live.Name)
	}
	if target == "" {
		return nil
	}

	metadataPatch := map[string]any{}
	if labels := metadataStringPatch(live.Labels, desired.Labels, isRestorableLabel); len(labels) > 0 {
		metadataPatch["labels"] = labels
	}
	if annotations := metadataStringPatch(live.Annotations, desired.Annotations, isRestorableAnnotation); len(annotations) > 0 {
		metadataPatch["annotations"] = annotations
	}

	specPatch := map[string]any{}
	if desired.Unschedulable != live.Unschedulable {
		specPatch["unschedulable"] = desired.Unschedulable
	}
	// Non-restorable live taints are carried over; restorable ones are
	// replaced by the desired set.
	if wanted := mergeLiveAndDesiredTaints(live.Taints, desired.Taints); !sameTaints(live.Taints, wanted) {
		specPatch["taints"] = taintPatchPayload(wanted)
	}

	patch := map[string]any{}
	if len(metadataPatch) > 0 {
		patch["metadata"] = metadataPatch
	}
	if len(specPatch) > 0 {
		patch["spec"] = specPatch
	}
	if len(patch) == 0 {
		return nil
	}

	client, err := kubeClientFactory()
	if err != nil {
		return err
	}
	return client.mergePatch("/api/v1/nodes/"+target, patch)
}
// metadataStringPatch builds the merge-patch fragment that turns the live
// string map into the desired one, restricted to keys accepted by allow.
//
// For every allowed desired key that is missing from live, or whose trimmed
// value differs, the patch carries the desired value. For every allowed live
// key that desired does not mention, the patch carries nil (which a JSON merge
// patch interprets as "delete"). Keys rejected by allow and blank keys are
// ignored on both sides. The result is empty when nothing needs to change.
func metadataStringPatch(live, desired map[string]string, allow func(string) bool) map[string]any {
	// normalize trims keys and values and drops entries the caller does not
	// manage, so both sides are compared on the same key space.
	normalize := func(values map[string]string) map[string]string {
		out := make(map[string]string, len(values))
		for key, value := range values {
			key = strings.TrimSpace(key)
			if key == "" || !allow(key) {
				continue
			}
			out[key] = strings.TrimSpace(value)
		}
		return out
	}
	wanted := normalize(desired)
	current := normalize(live)

	patch := map[string]any{}
	for key, value := range wanted {
		// Compare on presence, not only on value: a desired key with an empty
		// value (for example a bare node-role label) must still be written
		// when the live object does not have the key at all.
		if have, ok := current[key]; !ok || have != value {
			patch[key] = value
		}
	}
	for key := range current {
		if _, ok := wanted[key]; !ok {
			patch[key] = nil
		}
	}
	return patch
}
// liveClusterNode looks up node by (trimmed) name among the nodes returned by
// clusterNodes and returns a pointer to a private copy of the first match.
// A blank name or no match yields (nil, false).
func liveClusterNode(node string) (*clusterNode, bool) {
	wanted := strings.TrimSpace(node)
	if wanted == "" {
		return nil, false
	}
	nodes := clusterNodes()
	for i := range nodes {
		if strings.TrimSpace(nodes[i].Name) != wanted {
			continue
		}
		match := nodes[i]
		return &match, true
	}
	return nil, false
}
// filteredRestorableLabels returns a new, never-nil map holding only the
// entries of values whose trimmed key passes isRestorableLabel. Keys and
// values are whitespace-trimmed.
func filteredRestorableLabels(values map[string]string) map[string]string {
	kept := map[string]string{}
	for rawKey, rawValue := range values {
		if name := strings.TrimSpace(rawKey); name != "" && isRestorableLabel(name) {
			kept[name] = strings.TrimSpace(rawValue)
		}
	}
	return kept
}
// filteredRestorableAnnotations returns a new, never-nil map holding only the
// entries of values whose trimmed key passes isRestorableAnnotation. Keys and
// values are whitespace-trimmed.
func filteredRestorableAnnotations(values map[string]string) map[string]string {
	kept := map[string]string{}
	for rawKey, rawValue := range values {
		if name := strings.TrimSpace(rawKey); name != "" && isRestorableAnnotation(name) {
			kept[name] = strings.TrimSpace(rawValue)
		}
	}
	return kept
}
// normalizeStringMap returns a fresh copy of values with keys and values
// whitespace-trimmed and blank keys dropped. It returns nil when the input is
// empty or when no entry survives.
func normalizeStringMap(values map[string]string) map[string]string {
	var cleaned map[string]string
	for rawKey, rawValue := range values {
		name := strings.TrimSpace(rawKey)
		if name == "" {
			continue
		}
		// Allocate lazily so an input without usable keys stays nil.
		if cleaned == nil {
			cleaned = make(map[string]string, len(values))
		}
		cleaned[name] = strings.TrimSpace(rawValue)
	}
	return cleaned
}
// restorableTaints keeps the taints from values that pass isRestorableTaint
// and returns them trimmed, de-duplicated, and sorted (nil when none remain).
func restorableTaints(values []string) []string {
	var kept []string
	for _, raw := range values {
		if taint := normalizeTaint(raw); taint != "" && isRestorableTaint(taint) {
			kept = append(kept, taint)
		}
	}
	return normalizeTaints(kept)
}
// normalizeTaints returns the distinct, non-blank, normalized taints from
// values in ascending string order. It returns nil when values is empty or
// contains only blank entries.
func normalizeTaints(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	unique := make(map[string]struct{}, len(values))
	for _, raw := range values {
		if taint := normalizeTaint(raw); taint != "" {
			unique[taint] = struct{}{}
		}
	}
	if len(unique) == 0 {
		return nil
	}
	result := make([]string, 0, len(unique))
	for taint := range unique {
		result = append(result, taint)
	}
	// Map iteration order is random; sorting makes the output deterministic.
	sort.Strings(result)
	return result
}
// normalizeTaint returns value with leading and trailing whitespace removed.
func normalizeTaint(value string) string {
	trimmed := strings.TrimSpace(value)
	return trimmed
}
// sameTaints reports whether left and right describe the same set of taints
// once both have been normalized (trimmed, de-duplicated, sorted).
func sameTaints(left, right []string) bool {
	a, b := normalizeTaints(left), normalizeTaints(right)
	if len(a) != len(b) {
		return false
	}
	for i, taint := range a {
		if b[i] != taint {
			return false
		}
	}
	return true
}
// mergeLiveAndDesiredTaints computes the taint list a node should end up with:
// the live taints that are NOT restorable are carried over unchanged, the live
// restorable taints are dropped, and the restorable desired taints are added.
// The result is normalized (distinct and sorted; nil when empty).
func mergeLiveAndDesiredTaints(live, desired []string) []string {
	var combined []string
	for _, raw := range live {
		taint := normalizeTaint(raw)
		if taint != "" && !isRestorableTaint(taint) {
			combined = append(combined, taint)
		}
	}
	combined = append(combined, restorableTaints(desired)...)
	return normalizeTaints(combined)
}
// taintPatchPayload converts "key[=value][:effect]" taint strings into the
// list-of-objects form used in the node patch body. Each object always has
// "key" and carries "value" / "effect" only when they are non-empty. The
// returned slice is never nil, so an empty input encodes as a JSON array.
func taintPatchPayload(values []string) []map[string]string {
	taints := normalizeTaints(values)
	entries := make([]map[string]string, 0, len(taints))
	for _, raw := range taints {
		key, val, effect := splitTaint(raw)
		if key == "" {
			continue
		}
		item := map[string]string{"key": key}
		if val != "" {
			item["value"] = val
		}
		if effect != "" {
			item["effect"] = effect
		}
		entries = append(entries, item)
	}
	return entries
}
// splitTaint parses a taint written as "key[=value][:effect]" and returns its
// key, value, and effect, each whitespace-trimmed. The effect is whatever
// follows the last ':'; the value is whatever follows the first '=' in the
// remaining text. Missing parts come back as empty strings.
func splitTaint(raw string) (string, string, string) {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return "", "", ""
	}

	body, effect := trimmed, ""
	if colon := strings.LastIndex(trimmed, ":"); colon >= 0 {
		body = strings.TrimSpace(trimmed[:colon])
		effect = strings.TrimSpace(trimmed[colon+1:])
	}

	// Cut leaves value empty when there is no '='.
	key, value, _ := strings.Cut(body, "=")
	return strings.TrimSpace(key), strings.TrimSpace(value), effect
}
// isRestorableTaint reports whether the taint written as raw is one Metis
// should capture and re-apply. Taints without a key, and taints whose key
// starts with one of the excluded prefixes below, are not restorable.
func isRestorableTaint(raw string) bool {
	key, _, _ := splitTaint(raw)
	switch {
	case key == "":
		return false
	case strings.HasPrefix(key, "node.kubernetes.io/"),
		strings.HasPrefix(key, "node.cloudprovider.kubernetes.io/"),
		strings.HasPrefix(key, "ToBeDeletedByClusterAutoscaler"):
		return false
	default:
		return true
	}
}
// isRestorableLabel reports whether the label key is one Metis should capture
// and re-apply. Blank keys are rejected, node-role.kubernetes.io/ keys are
// always accepted, and keys starting with any of the excluded prefixes below
// are rejected; everything else is accepted.
func isRestorableLabel(key string) bool {
	name := strings.TrimSpace(key)
	switch {
	case name == "":
		return false
	case strings.HasPrefix(name, "node-role.kubernetes.io/"):
		return true
	}

	excluded := [...]string{
		"kubernetes.io/",
		"beta.kubernetes.io/",
		"node.kubernetes.io/",
		"topology.kubernetes.io/",
		"feature.node.kubernetes.io/",
		"failure-domain.beta.kubernetes.io/",
		"nvidia.com/",
		"k3s.io/",
		"rke2.io/",
		"volumes.kubernetes.io/",
		"node.cloudprovider.kubernetes.io/",
	}
	restorable := true
	for i := range excluded {
		if strings.HasPrefix(name, excluded[i]) {
			restorable = false
			break
		}
	}
	return restorable
}
// isRestorableAnnotation reports whether the annotation key is one Metis
// should capture and re-apply. Blank keys and keys starting with any of the
// excluded prefixes below are rejected; everything else is accepted.
func isRestorableAnnotation(key string) bool {
	name := strings.TrimSpace(key)
	if name == "" {
		return false
	}

	excluded := [...]string{
		"kubectl.kubernetes.io/",
		"kubeadm.alpha.kubernetes.io/",
		"kubernetes.io/",
		"node.alpha.kubernetes.io/",
		"node.kubernetes.io/",
		"volumes.kubernetes.io/",
		"csi.volume.kubernetes.io/",
		"csi.storage.k8s.io/",
		"flannel.alpha.coreos.com/",
		"projectcalico.org/",
		"rke2.io/",
		"k3s.io/",
		"nvidia.com/",
	}
	restorable := true
	for i := range excluded {
		if strings.HasPrefix(name, excluded[i]) {
			restorable = false
			break
		}
	}
	return restorable
}
// cloneDesiredNodeMetadata returns a copy of value whose Labels, Annotations,
// and Taints are freshly allocated (and normalized), so the copy shares no
// map or slice storage with the original.
func cloneDesiredNodeMetadata(value DesiredNodeMetadata) DesiredNodeMetadata {
	// value is already a by-value copy of the struct; only the reference
	// fields need to be replaced with independent storage.
	value.Labels = normalizeStringMap(value.Labels)
	value.Annotations = normalizeStringMap(value.Annotations)
	value.Taints = normalizeTaints(value.Taints)
	return value
}
// desiredNodeMetadataSyncEvent builds the "sentinel.node-metadata" event that
// reports a failed attempt to restore the desired metadata for node. The
// event is stamped with the current UTC time and carries the node name and
// the error text in its details. err must be non-nil.
func desiredNodeMetadataSyncEvent(node string, err error) Event {
	occurred := time.Now().UTC()
	summary := fmt.Sprintf("Could not restore desired node metadata for %s", node)
	details := map[string]any{
		"node":  node,
		"error": err.Error(),
	}
	return Event{
		Time:    occurred,
		Kind:    "sentinel.node-metadata",
		Summary: summary,
		Details: details,
	}
}

View File

@ -0,0 +1,254 @@
package service
import (
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"reflect"
"strings"
"testing"
"time"
"metis/pkg/sentinel"
)
// TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster checks that
// stageDesiredNodeMetadata layers inventory, previously stored, and live
// cluster metadata: live label values win over inventory values, labels and
// annotations with system prefixes are not persisted, a previously stored
// annotation survives, live taints replace inventory taints, and the result is
// written to the desired-metadata file.
func TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster(t *testing.T) {
	// Fake API server that serves a single live node, titan-15.
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": []any{
					map[string]any{
						"metadata": map[string]any{
							"name": "titan-15",
							"labels": map[string]string{
								"hardware": "rpi5",
								"rack": "a1",
								"maintenance.bstein.dev/color": "blue",
								"kubernetes.io/arch": "arm64",
								"node-role.kubernetes.io/worker": "true",
							},
							"annotations": map[string]string{
								"maintenance.bstein.dev/owner": "atlas",
								"volumes.kubernetes.io/controller-managed-attach-detach": "true",
							},
						},
						"spec": map[string]any{
							"unschedulable": true,
							"taints": []any{
								map[string]any{"key": "dedicated", "value": "recovery", "effect": "NoSchedule"},
								map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"},
							},
						},
					},
				},
			})
		default:
			http.NotFound(w, r)
		}
	}))
	defer kube.Close()
	installKubeFactory(t, kube)
	app := newTestApp(t)
	// Inventory disagrees with the live node on "hardware" and on taints.
	app.inventory.Nodes[0].Labels = map[string]string{"hardware": "rpi4", "rack": "a1"}
	app.inventory.Nodes[0].Taints = []string{"flash=true:NoSchedule"}
	// Previously stored record contributes an annotation the live node lacks.
	app.desiredMetadata["titan-15"] = DesiredNodeMetadata{
		Node: "titan-15",
		Annotations: map[string]string{"maintenance.bstein.dev/legacy": "keep"},
	}
	desired, err := app.stageDesiredNodeMetadata("titan-15")
	if err != nil {
		t.Fatalf("stageDesiredNodeMetadata: %v", err)
	}
	if desired.Hostname != "titan-15" || !desired.Unschedulable {
		t.Fatalf("unexpected desired metadata header: %#v", desired)
	}
	if desired.Labels["hardware"] != "rpi5" || desired.Labels["rack"] != "a1" || desired.Labels["maintenance.bstein.dev/color"] != "blue" {
		t.Fatalf("unexpected desired labels: %#v", desired.Labels)
	}
	if _, ok := desired.Labels["kubernetes.io/arch"]; ok {
		t.Fatalf("system labels should not be persisted: %#v", desired.Labels)
	}
	if desired.Annotations["maintenance.bstein.dev/owner"] != "atlas" || desired.Annotations["maintenance.bstein.dev/legacy"] != "keep" {
		t.Fatalf("unexpected desired annotations: %#v", desired.Annotations)
	}
	if _, ok := desired.Annotations["volumes.kubernetes.io/controller-managed-attach-detach"]; ok {
		t.Fatalf("controller annotations should not be persisted: %#v", desired.Annotations)
	}
	// Only the live, restorable taint remains; the inventory taint is replaced.
	if !reflect.DeepEqual(desired.Taints, []string{"dedicated=recovery:NoSchedule"}) {
		t.Fatalf("unexpected desired taints: %#v", desired.Taints)
	}
	// Staging must also persist the record to disk.
	data, err := os.ReadFile(app.settings.DesiredMetadataPath)
	if err != nil {
		t.Fatalf("read desired metadata file: %v", err)
	}
	if !strings.Contains(string(data), "titan-15") {
		t.Fatalf("desired metadata file missing titan-15: %s", string(data))
	}
}
// TestStoreSnapshotRestoresDesiredNodeMetadata checks that storing a snapshot
// for a node with a desired-metadata record sends a node patch that sets the
// desired label/annotation values, deletes restorable labels that are no
// longer desired, clears the cordon, replaces the restorable taint, and keeps
// the system taint that is present on the live node.
func TestStoreSnapshotRestoresDesiredNodeMetadata(t *testing.T) {
	var patchBody map[string]any
	// Fake API server: serves one drifted live node and records the patch.
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": []any{
					map[string]any{
						"metadata": map[string]any{
							"name": "titan-15",
							"labels": map[string]string{
								"hardware":                   "rpi4",
								"maintenance.bstein.dev/old": "1",
							},
							"annotations": map[string]string{
								"maintenance.bstein.dev/mode": "old",
							},
						},
						"spec": map[string]any{
							"unschedulable": true,
							"taints": []any{
								map[string]any{"key": "dedicated", "value": "old", "effect": "NoSchedule"},
								map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"},
							},
						},
					},
				},
			})
		case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/nodes/titan-15":
			if err := json.NewDecoder(r.Body).Decode(&patchBody); err != nil {
				// The handler runs on a server goroutine, where t.Fatalf
				// (FailNow) must not be called; record the failure and
				// answer with an error instead.
				t.Errorf("decode patch: %v", err)
				http.Error(w, "bad patch body", http.StatusBadRequest)
				return
			}
			_ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"})
		default:
			http.NotFound(w, r)
		}
	}))
	defer kube.Close()
	installKubeFactory(t, kube)
	app := newTestApp(t)
	app.desiredMetadata["titan-15"] = DesiredNodeMetadata{
		Node:          "titan-15",
		Hostname:      "titan-15",
		Labels:        map[string]string{"hardware": "rpi5"},
		Annotations:   map[string]string{"maintenance.bstein.dev/mode": "recovery"},
		Taints:        []string{"dedicated=recovery:NoSchedule"},
		Unschedulable: false,
	}
	if err := app.StoreSnapshot(SnapshotRecord{
		Node:        "titan-15",
		CollectedAt: time.Date(2026, 4, 24, 6, 0, 0, 0, time.UTC),
		Snapshot:    sentinel.Snapshot{Hostname: "titan-15"},
	}); err != nil {
		t.Fatalf("StoreSnapshot: %v", err)
	}
	if patchBody == nil {
		t.Fatal("expected desired metadata patch")
	}
	metadata := patchBody["metadata"].(map[string]any)
	labels := metadata["labels"].(map[string]any)
	// A nil entry in the patch means "delete this key".
	if labels["hardware"] != "rpi5" || labels["maintenance.bstein.dev/old"] != nil {
		t.Fatalf("unexpected label patch: %#v", labels)
	}
	annotations := metadata["annotations"].(map[string]any)
	if annotations["maintenance.bstein.dev/mode"] != "recovery" {
		t.Fatalf("unexpected annotation patch: %#v", annotations)
	}
	spec := patchBody["spec"].(map[string]any)
	if spec["unschedulable"] != false {
		t.Fatalf("unexpected spec patch: %#v", spec)
	}
	taints := spec["taints"].([]any)
	if len(taints) != 2 {
		t.Fatalf("unexpected taint payload: %#v", taints)
	}
	entries := map[string]map[string]any{}
	for _, raw := range taints {
		entry := raw.(map[string]any)
		key := entry["key"].(string)
		entries[key] = entry
	}
	if entries["dedicated"]["value"] != "recovery" || entries["dedicated"]["effect"] != "NoSchedule" {
		t.Fatalf("missing desired taint replacement: %#v", entries)
	}
	if entries["node.kubernetes.io/unreachable"]["effect"] != "NoExecute" {
		t.Fatalf("system taint should be preserved: %#v", entries)
	}
}
// TestDesiredNodeMetadataHelpers exercises the small desired-metadata helpers
// directly: lookups for unknown nodes, the restorable label/annotation/taint
// filters, taint parsing, patch construction, taint merging, deep-copy
// behavior of cloneDesiredNodeMetadata, the in-sync no-op path of
// patchDesiredNodeMetadata, and the sync-failure event.
func TestDesiredNodeMetadataHelpers(t *testing.T) {
	app := newTestApp(t)
	// Unknown or blank nodes are treated as "nothing to do", not as errors.
	if _, ok := app.desiredMetadataForNode("missing"); ok {
		t.Fatal("expected no desired metadata for missing node")
	}
	if err := app.syncDesiredNodeMetadata(SnapshotRecord{Node: "missing"}); err != nil {
		t.Fatalf("syncDesiredNodeMetadata missing should noop: %v", err)
	}
	if _, ok := liveClusterNode(""); ok {
		t.Fatal("empty liveClusterNode lookup should fail")
	}
	// Filters: custom keys are restorable, system-prefixed keys are not.
	if !isRestorableLabel("maintenance.bstein.dev/role") || isRestorableLabel("kubernetes.io/arch") {
		t.Fatal("unexpected label restoration filter")
	}
	if !isRestorableAnnotation("maintenance.bstein.dev/state") || isRestorableAnnotation("volumes.kubernetes.io/foo") {
		t.Fatal("unexpected annotation restoration filter")
	}
	if !isRestorableTaint("dedicated=recovery:NoSchedule") || isRestorableTaint("node.kubernetes.io/not-ready:NoExecute") {
		t.Fatal("unexpected taint restoration filter")
	}
	// Taint parsing, with and without value/effect.
	key, value, effect := splitTaint("dedicated=recovery:NoSchedule")
	if key != "dedicated" || value != "recovery" || effect != "NoSchedule" {
		t.Fatalf("splitTaint mismatch: %q %q %q", key, value, effect)
	}
	if key, value, effect := splitTaint("just-a-key"); key != "just-a-key" || value != "" || effect != "" {
		t.Fatalf("splitTaint key-only mismatch: %q %q %q", key, value, effect)
	}
	labels := filteredRestorableLabels(map[string]string{"hardware": "rpi5", "kubernetes.io/arch": "arm64"})
	if !reflect.DeepEqual(labels, map[string]string{"hardware": "rpi5"}) {
		t.Fatalf("filteredRestorableLabels = %#v", labels)
	}
	annotations := filteredRestorableAnnotations(map[string]string{"maintenance.bstein.dev/state": "ok", "volumes.kubernetes.io/foo": "bar"})
	if !reflect.DeepEqual(annotations, map[string]string{"maintenance.bstein.dev/state": "ok"}) {
		t.Fatalf("filteredRestorableAnnotations = %#v", annotations)
	}
	// Patch: changed value is set, a live key absent from desired maps to nil.
	patch := metadataStringPatch(
		map[string]string{"hardware": "rpi4", "maintenance.bstein.dev/old": "1"},
		map[string]string{"hardware": "rpi5"},
		isRestorableLabel,
	)
	if patch["hardware"] != "rpi5" || patch["maintenance.bstein.dev/old"] != nil {
		t.Fatalf("metadataStringPatch = %#v", patch)
	}
	// Taint merge: system taint kept, restorable taint replaced, duplicates removed.
	mergedTaints := mergeLiveAndDesiredTaints(
		[]string{"node.kubernetes.io/unreachable:NoExecute", "dedicated=old:NoSchedule"},
		[]string{"dedicated=new:NoSchedule", "dedicated=new:NoSchedule"},
	)
	if !reflect.DeepEqual(mergedTaints, []string{"dedicated=new:NoSchedule", "node.kubernetes.io/unreachable:NoExecute"}) {
		t.Fatalf("mergeLiveAndDesiredTaints = %#v", mergedTaints)
	}
	payload := taintPatchPayload([]string{"dedicated=new:NoSchedule"})
	if len(payload) != 1 || payload[0]["key"] != "dedicated" || payload[0]["value"] != "new" || payload[0]["effect"] != "NoSchedule" {
		t.Fatalf("taintPatchPayload = %#v", payload)
	}
	// Mutating the clone must not leak back into the original.
	original := DesiredNodeMetadata{Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}}
	cloned := cloneDesiredNodeMetadata(original)
	cloned.Labels["hardware"] = "mutated"
	cloned.Taints[0] = "changed"
	if original.Labels["hardware"] != "rpi5" || original.Taints[0] != "dedicated=new:NoSchedule" {
		t.Fatalf("cloneDesiredNodeMetadata should deep-copy slices/maps: %#v %#v", original, cloned)
	}
	// Live and desired already agree, so no patch should be attempted.
	if err := patchDesiredNodeMetadata(
		clusterNode{Name: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}},
		DesiredNodeMetadata{Node: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}},
	); err != nil {
		t.Fatalf("patchDesiredNodeMetadata should noop when already in sync: %v", err)
	}
	if event := desiredNodeMetadataSyncEvent("titan-15", os.ErrPermission); event.Kind != "sentinel.node-metadata" || event.Details["node"] != "titan-15" {
		t.Fatalf("desiredNodeMetadataSyncEvent = %#v", event)
	}
}

View File

@ -74,12 +74,17 @@ func (a *App) RefreshDevices(host string) ([]Device, error) {
} }
func (a *App) runBuild(job *Job, flash bool) { func (a *App) runBuild(job *Job, flash bool) {
_, class, err := a.inventory.FindNode(job.Node) nodeSpec, class, err := a.inventory.FindNode(job.Node)
if err != nil { if err != nil {
a.failJob(job.ID, err) a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error") a.metrics.RecordBuild(job.Node, "error")
return return
} }
if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil {
a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error")
return
}
if err := a.ensureHarborProject(); err != nil { if err := a.ensureHarborProject(); err != nil {
a.failJob(job.ID, err) a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error") a.metrics.RecordBuild(job.Node, "error")
@ -112,7 +117,8 @@ func (a *App) runBuild(job *Job, flash bool) {
return return
} }
buildPod := fmt.Sprintf("metis-build-%d", time.Now().UTC().UnixNano()) buildPod := fmt.Sprintf("metis-build-%d", time.Now().UTC().UnixNano())
logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, artifactRef, buildTag)) job.Builder = builder.Name
logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, strings.TrimSpace(nodeSpec.Hostname), artifactRef, buildTag))
if err != nil { if err != nil {
a.failJob(job.ID, err) a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error") a.metrics.RecordBuild(job.Node, "error")
@ -183,6 +189,9 @@ func (a *App) runFlash(job *Job) {
} }
func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) { func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) {
if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil {
return RemoteFlashResult{}, err
}
a.setJob(job.ID, func(j *Job) { a.setJob(job.ID, func(j *Job) {
j.Status = JobRunning j.Status = JobRunning
j.Stage = "preflight" j.Stage = "preflight"

View File

@ -1,6 +1,7 @@
package service package service
import ( import (
"encoding/json"
"fmt" "fmt"
"math" "math"
"path/filepath" "path/filepath"
@ -246,8 +247,9 @@ func (a *App) remoteDevicePodSpec(name, host, image string) map[string]any {
} }
} }
func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag string) map[string]any { func (a *App) remoteBuildPodSpec(name, host, image, node, nodeHostname, artifactRef, buildTag string) map[string]any {
workspaceHostPath := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name) workspaceHostPath := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name)
desiredEnv := remoteDesiredMetadataEnv(a, node)
return map[string]any{ return map[string]any{
"apiVersion": "v1", "apiVersion": "v1",
"kind": "Pod", "kind": "Pod",
@ -255,7 +257,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag
"name": name, "name": name,
"namespace": a.settings.Namespace, "namespace": a.settings.Namespace,
"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}, "labels": map[string]string{"app": "metis-remote", "metis-run": "build"},
"annotations": vaultRuntimeAnnotations(true), "annotations": vaultRuntimeAnnotations(true, nodeHostname),
}, },
"spec": map[string]any{ "spec": map[string]any{
"restartPolicy": "Never", "restartPolicy": "Never",
@ -283,6 +285,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag
), ),
}, },
"securityContext": map[string]any{"runAsUser": 0, "runAsGroup": 0}, "securityContext": map[string]any{"runAsUser": 0, "runAsGroup": 0},
"env": desiredEnv,
"envFrom": []map[string]any{ "envFrom": []map[string]any{
{"configMapRef": map[string]any{"name": "metis"}}, {"configMapRef": map[string]any{"name": "metis"}},
}, },
@ -309,7 +312,7 @@ func (a *App) remoteFlashPodSpec(name, host, image, node, device, artifactRef st
"name": name, "name": name,
"namespace": a.settings.Namespace, "namespace": a.settings.Namespace,
"labels": map[string]string{"app": "metis-remote", "metis-run": "flash"}, "labels": map[string]string{"app": "metis-remote", "metis-run": "flash"},
"annotations": vaultRuntimeAnnotations(false), "annotations": vaultRuntimeAnnotations(false, ""),
}, },
"spec": map[string]any{ "spec": map[string]any{
"restartPolicy": "Never", "restartPolicy": "Never",
@ -378,7 +381,46 @@ func mountedHostTmpDir(path string) string {
return "/host-tmp" return "/host-tmp"
} }
func vaultRuntimeAnnotations(includeSSHKeys bool) map[string]string { func remoteDesiredMetadataEnv(a *App, node string) []map[string]any {
desired, ok := a.desiredMetadataForNode(node)
if !ok {
return nil
}
labelsJSON, _ := jsonMarshalStringMap(desired.Labels)
taintsJSON, _ := jsonMarshalStringSlice(desired.Taints)
env := []map[string]any{}
if labelsJSON != "" {
env = append(env, map[string]any{"name": "METIS_NODE_LABELS_JSON", "value": labelsJSON})
}
if taintsJSON != "" {
env = append(env, map[string]any{"name": "METIS_NODE_TAINTS_JSON", "value": taintsJSON})
}
return env
}
// jsonMarshalStringMap encodes values as a JSON object string. An empty or
// nil map yields the empty string (not "{}") with a nil error.
func jsonMarshalStringMap(values map[string]string) (string, error) {
	if len(values) < 1 {
		return "", nil
	}
	encoded, err := json.Marshal(values)
	if err == nil {
		return string(encoded), nil
	}
	return "", err
}
// jsonMarshalStringSlice encodes values as a JSON array string. An empty or
// nil slice yields the empty string (not "[]") with a nil error.
func jsonMarshalStringSlice(values []string) (string, error) {
	if len(values) < 1 {
		return "", nil
	}
	encoded, err := json.Marshal(values)
	if err == nil {
		return string(encoded), nil
	}
	return "", err
}
func vaultRuntimeAnnotations(includeSSHKeys bool, nodeHostname string) map[string]string {
annotations := map[string]string{ annotations := map[string]string{
"vault.hashicorp.com/agent-inject": "true", "vault.hashicorp.com/agent-inject": "true",
"vault.hashicorp.com/agent-pre-populate-only": "true", "vault.hashicorp.com/agent-pre-populate-only": "true",
@ -399,6 +441,19 @@ export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}" export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}" export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}" export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
{{ end }}`
}
nodeHostname = strings.TrimSpace(nodeHostname)
if nodeHostname != "" {
secretPath := fmt.Sprintf("secret/data/nodes/%s", nodeHostname)
annotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] = secretPath
annotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"] = `{{ with secret "` + secretPath + `" }}
export METIS_NODE_SSH_PASSWORD="{{ .Data.data.ssh_password }}"
export METIS_NODE_SSH_PASSWORD_HASH="{{ .Data.data.ssh_password_hash }}"
export METIS_NODE_ATLAS_PASSWORD="{{ .Data.data.atlas_password }}"
export METIS_NODE_ATLAS_PASSWORD_HASH="{{ .Data.data.atlas_password_hash }}"
export METIS_NODE_ROOT_PASSWORD="{{ .Data.data.root_password }}"
export METIS_NODE_ROOT_PASSWORD_HASH="{{ .Data.data.root_password_hash }}"
{{ end }}` {{ end }}`
} }
return annotations return annotations
@ -413,6 +468,7 @@ func remoteWorkerEntrypoint(includeSSHKeys bool, args ...string) string {
if includeSSHKeys { if includeSSHKeys {
lines = append(lines, ". /vault/secrets/metis-ssh-env.sh") lines = append(lines, ". /vault/secrets/metis-ssh-env.sh")
} }
lines = append(lines, "if [ -f /vault/secrets/metis-node-secrets-env.sh ]; then . /vault/secrets/metis-node-secrets-env.sh; fi")
lines = append(lines, "exec "+shellJoin(append([]string{"metis"}, args...)...)) lines = append(lines, "exec "+shellJoin(append([]string{"metis"}, args...)...))
return strings.Join(lines, "\n") return strings.Join(lines, "\n")
} }

View File

@ -251,8 +251,13 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) {
app := newTestApp(t) app := newTestApp(t)
app.settings.RemoteWorkspaceDir = "/var/tmp/metis-workspace" app.settings.RemoteWorkspaceDir = "/var/tmp/metis-workspace"
app.settings.HostTmpDir = "/var/tmp/metis-flash-test" app.settings.HostTmpDir = "/var/tmp/metis-flash-test"
app.desiredMetadata["titan-10"] = DesiredNodeMetadata{
Node: "titan-10",
Labels: map[string]string{"hardware": "rpi5"},
Taints: []string{"dedicated=recovery:NoSchedule"},
}
buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "registry.example/metis/titan-10", "build-1") buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "titan-10", "registry.example/metis/titan-10", "build-1")
buildBody := buildSpec["spec"].(map[string]any) buildBody := buildSpec["spec"].(map[string]any)
buildVolumes := buildBody["volumes"].([]map[string]any) buildVolumes := buildBody["volumes"].([]map[string]any)
workspaceVolume := buildVolumes[0]["hostPath"].(map[string]any) workspaceVolume := buildVolumes[0]["hostPath"].(map[string]any)
@ -260,6 +265,17 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) {
t.Fatalf("build workspace hostPath = %v", got) t.Fatalf("build workspace hostPath = %v", got)
} }
buildContainer := buildBody["containers"].([]map[string]any)[0] buildContainer := buildBody["containers"].([]map[string]any)[0]
buildEnv := buildContainer["env"].([]map[string]any)
if len(buildEnv) != 2 {
t.Fatalf("expected desired metadata env, got %#v", buildEnv)
}
metadataAnnotations := buildSpec["metadata"].(map[string]any)["annotations"].(map[string]string)
if metadataAnnotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] != "secret/data/nodes/titan-10" {
t.Fatalf("unexpected node secret annotation: %#v", metadataAnnotations)
}
if !strings.Contains(metadataAnnotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"], "METIS_NODE_ROOT_PASSWORD") {
t.Fatalf("expected node password exports in vault template: %#v", metadataAnnotations)
}
buildSecurity := buildContainer["securityContext"].(map[string]any) buildSecurity := buildContainer["securityContext"].(map[string]any)
if got := buildSecurity["runAsUser"]; got != 0 { if got := buildSecurity["runAsUser"]; got != 0 {
t.Fatalf("build runAsUser = %v", got) t.Fatalf("build runAsUser = %v", got)

View File

@ -15,7 +15,7 @@ func TestMountedHostTmpDirMapsConfiguredTmpPathIntoMount(t *testing.T) {
} }
func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) { func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) {
withKeys := vaultRuntimeAnnotations(true) withKeys := vaultRuntimeAnnotations(true, "titan-15")
template := withKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"] template := withKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]
if !strings.Contains(template, "METIS_SSH_KEY_HECATE_TETHYS") { if !strings.Contains(template, "METIS_SSH_KEY_HECATE_TETHYS") {
t.Fatalf("expected tethys hecate key export in vault template: %q", template) t.Fatalf("expected tethys hecate key export in vault template: %q", template)
@ -24,7 +24,7 @@ func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) {
t.Fatalf("expected db hecate key export in vault template: %q", template) t.Fatalf("expected db hecate key export in vault template: %q", template)
} }
withoutKeys := vaultRuntimeAnnotations(false) withoutKeys := vaultRuntimeAnnotations(false, "")
if _, ok := withoutKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]; ok { if _, ok := withoutKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]; ok {
t.Fatalf("did not expect ssh key template when includeSSHKeys=false") t.Fatalf("did not expect ssh key template when includeSSHKeys=false")
} }

View File

@ -19,6 +19,7 @@ type Settings struct {
HistoryPath string HistoryPath string
SnapshotsPath string SnapshotsPath string
TargetsPath string TargetsPath string
DesiredMetadataPath string
DefaultFlashHost string DefaultFlashHost string
FlashHosts []string FlashHosts []string
LocalHost string LocalHost string
@ -52,6 +53,7 @@ func FromEnv() Settings {
HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")), HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")),
SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")), SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")),
TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")), TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")),
DesiredMetadataPath: getenvDefault("METIS_DESIRED_METADATA_PATH", filepath.Join(dataDir, "desired-node-metadata.json")),
DefaultFlashHost: defaultFlashHost, DefaultFlashHost: defaultFlashHost,
FlashHosts: flashHosts, FlashHosts: flashHosts,
LocalHost: localHost, LocalHost: localHost,