recovery(metis): restore node identity on rebuilt images

This commit is contained in:
codex 2026-04-24 16:57:34 -03:00
parent ebaa367efd
commit 17069e4677
19 changed files with 1612 additions and 144 deletions

View File

@ -154,6 +154,10 @@ if [ -s "${sudoers_file}" ]; then
fi fi
fi fi
if [ -x /usr/local/sbin/metis-apply-node-identity.sh ]; then
/usr/local/sbin/metis-apply-node-identity.sh || true
fi
rm -f /root/.not_logged_in_yet rm -f /root/.not_logged_in_yet
if ! command -v k3s >/dev/null 2>&1; then if ! command -v k3s >/dev/null 2>&1; then

View File

@ -2,8 +2,6 @@ package plan
import ( import (
"bytes" "bytes"
"context"
"encoding/json"
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
@ -64,6 +62,7 @@ func Files(inv *inventory.Inventory, nodeName string) ([]inject.FileSpec, error)
cfg.Secrets = sec.Extra cfg.Secrets = sec.Extra
} }
} }
applyNodeMetadataEnv(cfg)
files, err := buildFiles(cfg, sec) files, err := buildFiles(cfg, sec)
if err != nil { if err != nil {
return nil, err return nil, err
@ -111,7 +110,9 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
{Path: "etc/hostname", Content: []byte(cfg.Hostname + "\n"), Mode: 0o644, RootFS: true}, {Path: "etc/hostname", Content: []byte(cfg.Hostname + "\n"), Mode: 0o644, RootFS: true},
{Path: "etc/hosts", Content: []byte(hostsContent(cfg.Hostname)), Mode: 0o644, RootFS: true}, {Path: "etc/hosts", Content: []byte(hostsContent(cfg.Hostname)), Mode: 0o644, RootFS: true},
{Path: "etc/rancher/k3s/config.yaml", Content: []byte(k3sConfigContent(cfg)), Mode: 0o644, RootFS: true}, {Path: "etc/rancher/k3s/config.yaml", Content: []byte(k3sConfigContent(cfg)), Mode: 0o644, RootFS: true},
{Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg)), Mode: 0o600, RootFS: true}, {Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg, sec)), Mode: 0o600, RootFS: true},
{Path: "usr/local/sbin/metis-apply-node-identity.sh", Content: []byte(nodeIdentityScriptContent()), Mode: 0o755, RootFS: true},
{Path: "etc/cloud/cloud.cfg.d/90-metis-recovery.cfg", Content: []byte(cloudInitRootFSContent(sec)), Mode: 0o644, RootFS: true},
} }
if cfg.IP != "" { if cfg.IP != "" {
files = append(files, inject.FileSpec{ files = append(files, inject.FileSpec{
@ -148,6 +149,14 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
RootFS: true, RootFS: true,
}) })
} }
if passwordAuth := sshPasswordConfigContent(sec); passwordAuth != "" {
files = append(files, inject.FileSpec{
Path: "etc/ssh/sshd_config.d/90-metis-password-auth.conf",
Content: []byte(passwordAuth),
Mode: 0o644,
RootFS: true,
})
}
if cfg.SSHUser == "atlas" { if cfg.SSHUser == "atlas" {
sudoers := hecateSudoersContent(cfg.SSHUser) sudoers := hecateSudoersContent(cfg.SSHUser)
files = append(files, inject.FileSpec{ files = append(files, inject.FileSpec{
@ -172,8 +181,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
}) })
} }
// Store the raw config for debugging/ops. raw, err := jsonMarshalIndent(cfg)
raw, err := json.MarshalIndent(cfg, "", " ")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -184,7 +192,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
RootFS: true, RootFS: true,
}) })
if sec != nil { if sec != nil {
secRaw, err := json.MarshalIndent(sec, "", " ") secRaw, err := jsonMarshalIndent(redactedSecretsForImage(sec))
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -196,7 +204,6 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
}) })
} }
// Optional cloud-init for images that honor NoCloud.
userData := cloudInitUserData(cfg, sec) userData := cloudInitUserData(cfg, sec)
if userData != "" { if userData != "" {
files = append(files, inject.FileSpec{ files = append(files, inject.FileSpec{
@ -267,33 +274,6 @@ func allowK3sNodeLabel(role, key string) bool {
return !strings.HasPrefix(key, "node-role.kubernetes.io/") return !strings.HasPrefix(key, "node-role.kubernetes.io/")
} }
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
if cfg == nil {
return ""
}
if sec != nil && sec.CloudInit != "" {
return sec.CloudInit
}
var b bytes.Buffer
b.WriteString("#cloud-config\n")
b.WriteString(fmt.Sprintf("hostname: %s\n", cfg.Hostname))
if len(cfg.SSHKeys) > 0 {
b.WriteString("ssh_authorized_keys:\n")
for _, k := range cfg.SSHKeys {
b.WriteString(fmt.Sprintf(" - %s\n", k))
}
}
return b.String()
}
func firstbootEnvContent(cfg *config.NodeConfig) string {
var b bytes.Buffer
b.WriteString(fmt.Sprintf("METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname)))
b.WriteString(fmt.Sprintf("METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser)))
b.WriteString(fmt.Sprintf("METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version)))
return b.String()
}
func networkManagerConnectionContent(id, iface, ip string) string { func networkManagerConnectionContent(id, iface, ip string) string {
gateway := ip gateway := ip
if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 { if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 {
@ -347,7 +327,6 @@ func fstabAppendContent(cfg *config.NodeConfig) string {
source := entry.Source source := entry.Source
switch { switch {
case source != "": case source != "":
// Use the explicit source path for bind mounts.
case entry.UUID != "": case entry.UUID != "":
source = "UUID=" + entry.UUID source = "UUID=" + entry.UUID
case entry.Label != "": case entry.Label != "":
@ -374,25 +353,6 @@ func hecateSudoersContent(user string) string {
) )
} }
func shellQuote(value string) string {
if value == "" {
return "''"
}
return "'" + strings.ReplaceAll(value, "'", `'"'"'`) + "'"
}
func fetchSecrets(hostname string) *secrets.NodeSecrets {
if os.Getenv("VAULT_ADDR") == "" {
return nil
}
cli := secrets.NewFromEnv()
sec, err := cli.FetchNode(context.Background(), hostname)
if err != nil {
return nil
}
return sec
}
func collectOverlays(class *inventory.NodeClass) ([]inject.FileSpec, error) { func collectOverlays(class *inventory.NodeClass) ([]inject.FileSpec, error) {
var files []inject.FileSpec var files []inject.FileSpec
if class == nil { if class == nil {

View File

@ -174,3 +174,80 @@ func TestBuildFilesAddsHecateSudoersForAtlas(t *testing.T) {
t.Fatalf("metis sudoers backup missing/incorrect: %s", backup) t.Fatalf("metis sudoers backup missing/incorrect: %s", backup)
} }
} }
// TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets verifies that when node
// secrets carry password material, buildFiles emits the firstboot env file, the
// sshd password-auth drop-in, the node-identity script, and the cloud-init
// recovery hooks — while the on-image secrets.json keeps only redacted
// metadata and never the raw secret values.
func TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets(t *testing.T) {
	cfg := &config.NodeConfig{
		Hostname: "titan-15",
		IP:       "192.168.22.43",
		SSHUser:  "atlas",
		SSHKeys:  []string{"ssh-ed25519 AAA test"},
		K3s: config.K3sConfig{
			Role:    "agent",
			Version: "v1.31.5+k3s1",
		},
	}
	sec := &secrets.NodeSecrets{
		SSHPassword:  "atlas-pass",
		RootPassword: "root-pass",
		K3sToken:     "super-secret-token",
		Extra:        map[string]string{"api_key": "secret"},
	}
	files, err := buildFiles(cfg, sec)
	if err != nil {
		t.Fatalf("buildFiles: %v", err)
	}
	// Index generated files by path for the lookups below.
	pathMap := map[string]string{}
	for _, file := range files {
		pathMap[file.Path] = string(file.Content)
	}
	firstboot := pathMap["etc/metis/firstboot.env"]
	if !strings.Contains(firstboot, "METIS_ATLAS_PASSWORD='atlas-pass'") || !strings.Contains(firstboot, "METIS_ROOT_PASSWORD='root-pass'") {
		t.Fatalf("firstboot env missing password material: %s", firstboot)
	}
	if sshd := pathMap["etc/ssh/sshd_config.d/90-metis-password-auth.conf"]; !strings.Contains(sshd, "PasswordAuthentication yes") || !strings.Contains(sshd, "PermitRootLogin yes") {
		t.Fatalf("password auth config missing: %s", sshd)
	}
	if script := pathMap["usr/local/sbin/metis-apply-node-identity.sh"]; !strings.Contains(script, "apply_password root") || !strings.Contains(script, "METIS_ATLAS_PASSWORD") {
		t.Fatalf("node identity script missing password application: %s", script)
	}
	if cloudCfg := pathMap["etc/cloud/cloud.cfg.d/90-metis-recovery.cfg"]; !strings.Contains(cloudCfg, "ssh_pwauth: true") {
		t.Fatalf("cloud recovery config missing ssh_pwauth: %s", cloudCfg)
	}
	if userData := pathMap["user-data"]; !strings.Contains(userData, "ssh_pwauth: true") || !strings.Contains(userData, "metis-apply-node-identity.sh") {
		t.Fatalf("cloud-init user-data missing recovery hooks: %s", userData)
	}
	// Raw secret values must never land on the image.
	secretsJSON := pathMap["etc/metis/secrets.json"]
	if strings.Contains(secretsJSON, "atlas-pass") || strings.Contains(secretsJSON, "root-pass") || strings.Contains(secretsJSON, "super-secret-token") {
		t.Fatalf("secrets.json should be redacted: %s", secretsJSON)
	}
	// Redacted metadata (booleans + sorted extra key names) must survive.
	if !strings.Contains(secretsJSON, `"has_ssh_password": true`) || !strings.Contains(secretsJSON, `"extra_keys": [`) {
		t.Fatalf("secrets.json should keep redacted debug metadata: %s", secretsJSON)
	}
}
// TestApplyNodeMetadataEnv checks that env-provided label/taint JSON overlays
// the node config (env values win on key conflicts, taints are merged and
// deduplicated) and that malformed JSON payloads are silently ignored.
func TestApplyNodeMetadataEnv(t *testing.T) {
	cfg := &config.NodeConfig{
		Labels: map[string]string{"hardware": "rpi4"},
		Taints: []string{"flash=true:NoSchedule"},
		K3s: config.K3sConfig{
			Labels: map[string]string{"hardware": "rpi4"},
			Taints: []string{"flash=true:NoSchedule"},
		},
	}
	t.Setenv("METIS_NODE_LABELS_JSON", `{"hardware":"rpi5","maintenance.bstein.dev/role":"recovery"}`)
	t.Setenv("METIS_NODE_TAINTS_JSON", `["dedicated=recovery:NoSchedule","flash=true:NoSchedule"]`)
	applyNodeMetadataEnv(cfg)
	// Env label "rpi5" must override the config's "rpi4".
	if cfg.Labels["hardware"] != "rpi5" || cfg.Labels["maintenance.bstein.dev/role"] != "recovery" {
		t.Fatalf("applyNodeMetadataEnv labels = %#v", cfg.Labels)
	}
	if !strings.Contains(strings.Join(cfg.Taints, ","), "dedicated=recovery:NoSchedule") {
		t.Fatalf("applyNodeMetadataEnv taints = %#v", cfg.Taints)
	}
	// Invalid JSON must leave a fresh config completely untouched.
	cfg = &config.NodeConfig{}
	t.Setenv("METIS_NODE_LABELS_JSON", `{bad-json`)
	t.Setenv("METIS_NODE_TAINTS_JSON", `{bad-json`)
	applyNodeMetadataEnv(cfg)
	if cfg.Labels != nil || cfg.Taints != nil {
		t.Fatalf("invalid env JSON should be ignored: %#v", cfg)
	}
}

262
pkg/plan/node_identity.go Normal file
View File

@ -0,0 +1,262 @@
package plan
import (
"bytes"
"fmt"
"sort"
"strings"
"metis/pkg/config"
"metis/pkg/secrets"
)
// cloudInitUserData renders the NoCloud user-data document for a node.
// A non-empty sec.CloudInit overrides the generated document entirely; a nil
// cfg yields an empty string. The generated document always appends a runcmd
// hook that runs the node-identity script on first boot.
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
	if cfg == nil {
		return ""
	}
	// An operator-supplied cloud-init payload wins over the generated one.
	if sec != nil && sec.CloudInit != "" {
		return sec.CloudInit
	}
	var sb strings.Builder
	sb.WriteString("#cloud-config\n")
	fmt.Fprintf(&sb, "hostname: %s\n", cfg.Hostname)
	if len(cfg.SSHKeys) > 0 {
		sb.WriteString("ssh_authorized_keys:\n")
		for _, key := range cfg.SSHKeys {
			fmt.Fprintf(&sb, " - %s\n", key)
		}
	}
	// Password material means interactive recovery logins must be possible.
	if hasNodePasswords(sec) {
		sb.WriteString("ssh_pwauth: true\n")
		sb.WriteString("disable_root: false\n")
	}
	sb.WriteString("runcmd:\n")
	sb.WriteString(" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n")
	return sb.String()
}
// firstbootEnvContent renders /etc/metis/firstboot.env: shell-quoted
// KEY=VALUE pairs consumed by the node-identity script. Password entries are
// only emitted when the corresponding secret material is present.
func firstbootEnvContent(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
	var sb strings.Builder
	emit := func(name, value string) {
		sb.WriteString(name)
		sb.WriteByte('=')
		sb.WriteString(shellQuote(value))
		sb.WriteByte('\n')
	}
	emit("METIS_HOSTNAME", cfg.Hostname)
	emit("METIS_SSH_USER", cfg.SSHUser)
	emit("METIS_ATLAS_USER", "atlas")
	emit("METIS_K3S_VERSION", cfg.K3s.Version)
	if sec != nil {
		if pw := effectiveAtlasPassword(sec); pw != "" {
			emit("METIS_ATLAS_PASSWORD", pw)
		}
		if hash := effectiveAtlasPasswordHash(sec); hash != "" {
			emit("METIS_ATLAS_PASSWORD_HASH", hash)
		}
		if pw := strings.TrimSpace(sec.RootPassword); pw != "" {
			emit("METIS_ROOT_PASSWORD", pw)
		}
		if hash := strings.TrimSpace(sec.RootPasswordHash); hash != "" {
			emit("METIS_ROOT_PASSWORD_HASH", hash)
		}
	}
	return sb.String()
}
// cloudInitRootFSContent renders the cloud.cfg.d recovery drop-in baked into
// the root filesystem. It enables password SSH only when secret material
// exists, and always schedules the node-identity script via runcmd.
func cloudInitRootFSContent(sec *secrets.NodeSecrets) string {
	lines := []string{"#cloud-config"}
	if hasNodePasswords(sec) {
		lines = append(lines, "ssh_pwauth: true", "disable_root: false")
	}
	lines = append(lines,
		"runcmd:",
		" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]",
	)
	return strings.Join(lines, "\n") + "\n"
}
// nodeIdentityScriptContent returns the bash script installed at
// /usr/local/sbin/metis-apply-node-identity.sh. On first boot it sources
// /etc/metis/firstboot.env, (re)creates the atlas/SSH users, applies plain or
// hashed passwords, installs authorized keys and the hecate sudoers drop-in,
// then restarts sshd and writes a marker so it runs only once.
// NOTE(review): the script body is a raw string literal — do not reflow or
// indent it, every byte is emitted verbatim onto the image.
func nodeIdentityScriptContent() string {
	return `#!/usr/bin/env bash
set -euo pipefail
marker="/var/lib/metis/node-identity-applied.done"
env_file="/etc/metis/firstboot.env"
key_file="/etc/metis/authorized_keys"
sudoers_file="/etc/metis/sudoers-hecate"
default_groups=(adm sudo tty disk dialout audio video plugdev games users systemd-journal input render netdev)
if [ -f "${marker}" ]; then
exit 0
fi
mkdir -p /var/lib/metis
if [ -f "${env_file}" ]; then
# shellcheck disable=SC1090
. "${env_file}"
fi
atlas_user="${METIS_ATLAS_USER:-atlas}"
ssh_user="${METIS_SSH_USER:-${atlas_user}}"
atlas_password="${METIS_ATLAS_PASSWORD:-}"
atlas_password_hash="${METIS_ATLAS_PASSWORD_HASH:-}"
root_password="${METIS_ROOT_PASSWORD:-}"
root_password_hash="${METIS_ROOT_PASSWORD_HASH:-}"
group_list=()
for group_name in "${default_groups[@]}"; do
if getent group "${group_name}" >/dev/null 2>&1; then
group_list+=("${group_name}")
fi
done
if [ "${#group_list[@]}" -gt 0 ]; then
group_csv="$(IFS=,; printf '%s' "${group_list[*]}")"
else
group_csv=""
fi
ensure_user() {
local user_name="$1"
[ -n "${user_name}" ] || return 0
if ! id "${user_name}" >/dev/null 2>&1; then
if [ -n "${group_csv}" ]; then
useradd -m -s /bin/bash -G "${group_csv}" "${user_name}"
else
useradd -m -s /bin/bash "${user_name}"
fi
elif [ -n "${group_csv}" ]; then
usermod -a -G "${group_csv}" "${user_name}" || true
fi
}
apply_password() {
local user_name="$1"
local plain_password="$2"
local hash_password="$3"
if ! id "${user_name}" >/dev/null 2>&1; then
return 0
fi
if [ -n "${hash_password}" ]; then
usermod -p "${hash_password}" "${user_name}"
passwd -u "${user_name}" >/dev/null 2>&1 || true
return 0
fi
if [ -n "${plain_password}" ]; then
printf '%s:%s\n' "${user_name}" "${plain_password}" | chpasswd
passwd -u "${user_name}" >/dev/null 2>&1 || true
fi
}
install_keys() {
local user_name="$1"
[ -n "${user_name}" ] || return 0
[ -s "${key_file}" ] || return 0
local home_dir
home_dir="$(getent passwd "${user_name}" | cut -d: -f6)"
if [ -z "${home_dir}" ]; then
if [ "${user_name}" = "root" ]; then
home_dir="/root"
else
home_dir="/home/${user_name}"
fi
fi
install -d -m 700 "${home_dir}/.ssh"
install -m 600 "${key_file}" "${home_dir}/.ssh/authorized_keys"
chown -R "${user_name}:${user_name}" "${home_dir}/.ssh" 2>/dev/null || true
}
ensure_user "${atlas_user}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
ensure_user "${ssh_user}"
fi
apply_password root "${root_password}" "${root_password_hash}"
apply_password "${atlas_user}" "${atlas_password}" "${atlas_password_hash}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
apply_password "${ssh_user}" "${atlas_password}" "${atlas_password_hash}"
fi
if [ -s "${key_file}" ]; then
install_keys root
install_keys "${atlas_user}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
install_keys "${ssh_user}"
fi
fi
if [ -s "${sudoers_file}" ]; then
install -d -m 755 /etc/sudoers.d
install -m 440 "${sudoers_file}" /etc/sudoers.d/90-hecate-atlas
if command -v visudo >/dev/null 2>&1; then
visudo -cf /etc/sudoers.d/90-hecate-atlas >/dev/null 2>&1 || rm -f /etc/sudoers.d/90-hecate-atlas
fi
fi
systemctl restart ssh.service >/dev/null 2>&1 || systemctl restart sshd.service >/dev/null 2>&1 || systemctl restart ssh.socket >/dev/null 2>&1 || true
touch "${marker}"
`
}
// sshPasswordConfigContent renders the sshd_config.d drop-in enabling
// password logins, or "" when no password material exists so the file is
// omitted entirely.
// NOTE(review): PermitRootLogin yes is deliberate for bare-metal recovery —
// confirm this stays acceptable for the fleet's threat model.
func sshPasswordConfigContent(sec *secrets.NodeSecrets) string {
	if !hasNodePasswords(sec) {
		return ""
	}
	directives := []string{
		"PasswordAuthentication yes",
		"KbdInteractiveAuthentication no",
		"ChallengeResponseAuthentication no",
		"PermitRootLogin yes",
		"UsePAM yes",
	}
	return strings.Join(directives, "\n") + "\n"
}
// hasNodePasswords reports whether the secrets carry any usable password
// material (atlas plain/hash or root plain/hash). A nil receiver is false.
func hasNodePasswords(sec *secrets.NodeSecrets) bool {
	if sec == nil {
		return false
	}
	switch {
	case effectiveAtlasPassword(sec) != "":
		return true
	case effectiveAtlasPasswordHash(sec) != "":
		return true
	default:
		return firstNonEmptyString(sec.RootPassword, sec.RootPasswordHash) != ""
	}
}
// effectiveAtlasPassword resolves the plaintext atlas password, preferring
// the dedicated atlas secret and falling back to the legacy SSH password.
func effectiveAtlasPassword(sec *secrets.NodeSecrets) string {
	if sec != nil {
		return firstNonEmptyString(sec.AtlasPassword, sec.SSHPassword)
	}
	return ""
}
// effectiveAtlasPasswordHash resolves the hashed atlas password, preferring
// the dedicated atlas secret and falling back to the legacy SSH hash.
func effectiveAtlasPasswordHash(sec *secrets.NodeSecrets) string {
	if sec != nil {
		return firstNonEmptyString(sec.AtlasPasswordHash, sec.SSHPasswordHash)
	}
	return ""
}
// firstNonEmptyString returns the first argument that is non-empty after
// trimming whitespace (trimmed), or "" when every argument is blank.
func firstNonEmptyString(values ...string) string {
	for _, candidate := range values {
		candidate = strings.TrimSpace(candidate)
		if candidate != "" {
			return candidate
		}
	}
	return ""
}
// redactedSecretsForImage produces the debug metadata written to the image's
// secrets.json: boolean presence flags per secret plus the sorted list of
// non-blank Extra key names. No secret values are included. Nil in, nil out.
func redactedSecretsForImage(sec *secrets.NodeSecrets) map[string]any {
	if sec == nil {
		return nil
	}
	out := map[string]any{
		"has_ssh_password":        firstNonEmptyString(sec.SSHPassword, sec.SSHPasswordHash) != "",
		"has_atlas_password":      firstNonEmptyString(sec.AtlasPassword, sec.AtlasPasswordHash) != "",
		"has_root_password":       firstNonEmptyString(sec.RootPassword, sec.RootPasswordHash) != "",
		"has_k3s_token":           strings.TrimSpace(sec.K3sToken) != "",
		"has_cloud_init_override": strings.TrimSpace(sec.CloudInit) != "",
	}
	if len(sec.Extra) > 0 {
		// Non-nil even when every key is blank, so JSON encodes [] not null.
		names := make([]string, 0, len(sec.Extra))
		for name := range sec.Extra {
			if trimmed := strings.TrimSpace(name); trimmed != "" {
				names = append(names, trimmed)
			}
		}
		sort.Strings(names)
		out["extra_keys"] = names
	}
	return out
}
// shellQuote wraps value in single quotes for safe use in shell env files,
// escaping embedded single quotes with the '"'"' idiom. Empty input yields ''.
func shellQuote(value string) string {
	if len(value) == 0 {
		return "''"
	}
	const escapedQuote = `'"'"'`
	return "'" + strings.ReplaceAll(value, "'", escapedQuote) + "'"
}

133
pkg/plan/node_metadata.go Normal file
View File

@ -0,0 +1,133 @@
package plan
import (
"context"
"encoding/json"
"os"
"sort"
"strings"
"metis/pkg/config"
"metis/pkg/secrets"
)
// fetchSecrets resolves node secrets: Vault material (when VAULT_ADDR is set
// and the fetch succeeds) merged with METIS_NODE_* environment overrides; the
// env overrides win field-by-field. Returns nil when neither source has data.
func fetchSecrets(hostname string) *secrets.NodeSecrets {
	fromEnv := nodeSecretsFromEnv()
	// Without a Vault address the environment is the only possible source.
	if os.Getenv("VAULT_ADDR") == "" {
		return fromEnv
	}
	client := secrets.NewFromEnv()
	fetched, err := client.FetchNode(context.Background(), hostname)
	if err != nil {
		// Best effort: fall back to env material when Vault is unreachable.
		return fromEnv
	}
	return mergeNodeSecrets(fetched, fromEnv)
}
// nodeSecretsFromEnv builds password material from METIS_NODE_* environment
// variables (each value whitespace-trimmed). Returns nil when every variable
// is empty so callers can treat "no env secrets" as absence.
func nodeSecretsFromEnv() *secrets.NodeSecrets {
	read := func(name string) string {
		return strings.TrimSpace(os.Getenv(name))
	}
	sec := &secrets.NodeSecrets{
		SSHPassword:       read("METIS_NODE_SSH_PASSWORD"),
		SSHPasswordHash:   read("METIS_NODE_SSH_PASSWORD_HASH"),
		AtlasPassword:     read("METIS_NODE_ATLAS_PASSWORD"),
		AtlasPasswordHash: read("METIS_NODE_ATLAS_PASSWORD_HASH"),
		RootPassword:      read("METIS_NODE_ROOT_PASSWORD"),
		RootPasswordHash:  read("METIS_NODE_ROOT_PASSWORD_HASH"),
	}
	// Collapse to nil when no variable carried a value.
	if firstNonEmptyString(sec.SSHPassword, sec.SSHPasswordHash, sec.AtlasPassword, sec.AtlasPasswordHash, sec.RootPassword, sec.RootPasswordHash) == "" {
		return nil
	}
	return sec
}
// mergeNodeSecrets combines two secret sets field-by-field, with override
// winning wherever it has a non-blank value. Extra maps are merged into a
// fresh map (override keys win). Either argument may be nil.
func mergeNodeSecrets(base, override *secrets.NodeSecrets) *secrets.NodeSecrets {
	switch {
	case base == nil:
		return override
	case override == nil:
		return base
	}
	out := *base
	out.SSHPassword = firstNonEmptyString(override.SSHPassword, base.SSHPassword)
	out.SSHPasswordHash = firstNonEmptyString(override.SSHPasswordHash, base.SSHPasswordHash)
	out.AtlasPassword = firstNonEmptyString(override.AtlasPassword, base.AtlasPassword)
	out.AtlasPasswordHash = firstNonEmptyString(override.AtlasPasswordHash, base.AtlasPasswordHash)
	out.RootPassword = firstNonEmptyString(override.RootPassword, base.RootPassword)
	out.RootPasswordHash = firstNonEmptyString(override.RootPasswordHash, base.RootPasswordHash)
	out.K3sToken = firstNonEmptyString(override.K3sToken, base.K3sToken)
	out.CloudInit = firstNonEmptyString(override.CloudInit, base.CloudInit)
	if len(base.Extra) > 0 || len(override.Extra) > 0 {
		// Fresh map so neither input's Extra is aliased by the result.
		extra := make(map[string]string, len(base.Extra)+len(override.Extra))
		for key, value := range base.Extra {
			extra[key] = value
		}
		for key, value := range override.Extra {
			extra[key] = value
		}
		out.Extra = extra
	}
	return &out
}
// applyNodeMetadataEnv overlays desired node labels/taints from the
// METIS_NODE_LABELS_JSON and METIS_NODE_TAINTS_JSON environment variables onto
// cfg. Env labels win on key conflicts; taints are merged, trimmed,
// deduplicated, and sorted. Malformed JSON is ignored (parse helpers return
// nil). A nil cfg is a no-op.
//
// Fix: the K3s mirror of the labels is now a COPY of cfg.Labels rather than an
// alias — the original assigned the same map to both fields, so a later
// mutation of one would silently change the other, while the taint branch
// already copied defensively. This makes the two branches consistent.
func applyNodeMetadataEnv(cfg *config.NodeConfig) {
	if cfg == nil {
		return
	}
	if labels := parseEnvJSONMap(os.Getenv("METIS_NODE_LABELS_JSON")); len(labels) > 0 {
		if cfg.Labels == nil {
			cfg.Labels = make(map[string]string, len(labels))
		}
		for key, value := range labels {
			cfg.Labels[key] = value
		}
		// Clone into K3s.Labels instead of aliasing cfg.Labels (mirrors the
		// defensive copy done for taints below).
		k3sLabels := make(map[string]string, len(cfg.Labels))
		for key, value := range cfg.Labels {
			k3sLabels[key] = value
		}
		cfg.K3s.Labels = k3sLabels
	}
	if taints := parseEnvJSONList(os.Getenv("METIS_NODE_TAINTS_JSON")); len(taints) > 0 {
		cfg.Taints = uniqueStrings(append(cfg.Taints, taints...))
		cfg.K3s.Taints = append([]string{}, cfg.Taints...)
	}
}
// parseEnvJSONMap decodes a JSON object of string pairs from an environment
// value. Blank input or any decode error yields nil (callers treat nil as
// "no metadata supplied").
func parseEnvJSONMap(raw string) map[string]string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	var out map[string]string
	if unmarshalErr := json.Unmarshal([]byte(trimmed), &out); unmarshalErr != nil {
		return nil
	}
	return out
}
// parseEnvJSONList decodes a JSON array of strings from an environment value.
// Blank input or any decode error yields nil.
func parseEnvJSONList(raw string) []string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	var out []string
	if unmarshalErr := json.Unmarshal([]byte(trimmed), &out); unmarshalErr != nil {
		return nil
	}
	return out
}
// uniqueStrings trims, deduplicates, and sorts the input, dropping blank
// entries. The result is always non-nil (possibly empty), sorted ascending.
func uniqueStrings(values []string) []string {
	result := make([]string, 0, len(values))
	seen := make(map[string]struct{}, len(values))
	for _, raw := range values {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		if _, dup := seen[trimmed]; dup {
			continue
		}
		seen[trimmed] = struct{}{}
		result = append(result, trimmed)
	}
	sort.Strings(result)
	return result
}
// jsonMarshalIndent marshals value as indented JSON. It exists so every
// on-image debug artifact (config.json, secrets.json) shares one formatting
// convention in a single place.
func jsonMarshalIndent(value any) ([]byte, error) {
	return json.MarshalIndent(value, "", " ")
}

View File

@ -0,0 +1,127 @@
package plan
import (
"reflect"
"strings"
"testing"
"metis/pkg/config"
"metis/pkg/secrets"
)
// TestNodeSecretHelpers covers the small secret-resolution helpers: nil
// safety, SSH→atlas fallback, explicit-atlas precedence, trimming in
// firstNonEmptyString, hash-only password detection, and sorted extra_keys in
// the redacted debug map.
func TestNodeSecretHelpers(t *testing.T) {
	if got := effectiveAtlasPassword(nil); got != "" {
		t.Fatalf("effectiveAtlasPassword(nil) = %q", got)
	}
	if got := effectiveAtlasPasswordHash(nil); got != "" {
		t.Fatalf("effectiveAtlasPasswordHash(nil) = %q", got)
	}
	// Legacy SSH fields act as fallbacks when atlas fields are unset.
	sec := &secrets.NodeSecrets{SSHPassword: "ssh-pass", SSHPasswordHash: "$ssh$hash"}
	if got := effectiveAtlasPassword(sec); got != "ssh-pass" {
		t.Fatalf("effectiveAtlasPassword fallback = %q", got)
	}
	if got := effectiveAtlasPasswordHash(sec); got != "$ssh$hash" {
		t.Fatalf("effectiveAtlasPasswordHash fallback = %q", got)
	}
	// Dedicated atlas fields take precedence once present.
	sec.AtlasPassword = "atlas-pass"
	sec.AtlasPasswordHash = "$atlas$hash"
	if got := effectiveAtlasPassword(sec); got != "atlas-pass" {
		t.Fatalf("effectiveAtlasPassword explicit = %q", got)
	}
	if got := effectiveAtlasPasswordHash(sec); got != "$atlas$hash" {
		t.Fatalf("effectiveAtlasPasswordHash explicit = %q", got)
	}
	if got := firstNonEmptyString("", " value ", "ignored"); got != "value" {
		t.Fatalf("firstNonEmptyString = %q", got)
	}
	if !hasNodePasswords(&secrets.NodeSecrets{RootPasswordHash: "$root$hash"}) {
		t.Fatal("expected root password hash to count as password material")
	}
	if hasNodePasswords(&secrets.NodeSecrets{}) {
		t.Fatal("empty node secrets should not count as password material")
	}
	// extra_keys must come out sorted regardless of map iteration order.
	debug := redactedSecretsForImage(&secrets.NodeSecrets{Extra: map[string]string{"b": "2", "a": "1"}})
	if !reflect.DeepEqual(debug["extra_keys"], []string{"a", "b"}) {
		t.Fatalf("redactedSecretsForImage extra_keys = %#v", debug)
	}
}
// TestNodeSecretsFromEnvAndMergeNodeSecrets exercises env-sourced secrets
// (populated and all-empty collapse-to-nil), field-wise merge with override
// precedence, Extra map union, and nil handling on either merge side.
func TestNodeSecretsFromEnvAndMergeNodeSecrets(t *testing.T) {
	t.Setenv("METIS_NODE_SSH_PASSWORD", "ssh-pass")
	t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "$ssh$hash")
	t.Setenv("METIS_NODE_ATLAS_PASSWORD", "atlas-pass")
	t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "$atlas$hash")
	t.Setenv("METIS_NODE_ROOT_PASSWORD", "root-pass")
	t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "$root$hash")
	envSecrets := nodeSecretsFromEnv()
	if envSecrets == nil || envSecrets.RootPassword != "root-pass" || envSecrets.AtlasPasswordHash != "$atlas$hash" {
		t.Fatalf("nodeSecretsFromEnv = %#v", envSecrets)
	}
	// Override fields must win; Extra maps must union.
	merged := mergeNodeSecrets(&secrets.NodeSecrets{
		SSHPassword: "base-ssh",
		K3sToken:    "base-token",
		CloudInit:   "base-cloud",
		Extra:       map[string]string{"base": "1"},
	}, &secrets.NodeSecrets{
		AtlasPassword: "override-atlas",
		RootPassword:  "override-root",
		K3sToken:      "override-token",
		CloudInit:     "override-cloud",
		Extra:         map[string]string{"override": "2"},
	})
	if merged.K3sToken != "override-token" || merged.CloudInit != "override-cloud" || merged.AtlasPassword != "override-atlas" || merged.RootPassword != "override-root" {
		t.Fatalf("mergeNodeSecrets = %#v", merged)
	}
	if merged.Extra["base"] != "1" || merged.Extra["override"] != "2" {
		t.Fatalf("mergeNodeSecrets extras = %#v", merged.Extra)
	}
	if got := mergeNodeSecrets(nil, envSecrets); got.RootPasswordHash != "$root$hash" {
		t.Fatalf("mergeNodeSecrets nil base = %#v", got)
	}
	if got := mergeNodeSecrets(envSecrets, nil); got.SSHPassword != "ssh-pass" {
		t.Fatalf("mergeNodeSecrets nil override = %#v", got)
	}
	// With every variable blanked, the env source must collapse to nil.
	t.Setenv("METIS_NODE_SSH_PASSWORD", "")
	t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "")
	t.Setenv("METIS_NODE_ATLAS_PASSWORD", "")
	t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "")
	t.Setenv("METIS_NODE_ROOT_PASSWORD", "")
	t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "")
	if got := nodeSecretsFromEnv(); got != nil {
		t.Fatalf("expected empty env secrets to collapse to nil, got %#v", got)
	}
}
// TestFirstbootEnvContentIncludesHashes verifies hash-only secrets are
// rendered as METIS_*_PASSWORD_HASH entries (and that plaintext entries are
// absent), comparing the full parsed KEY=VALUE set of the env file.
func TestFirstbootEnvContentIncludesHashes(t *testing.T) {
	cfg := &config.NodeConfig{
		Hostname: "titan-15",
		SSHUser:  "atlas",
		K3s:      config.K3sConfig{Version: "v1.31.5+k3s1"},
	}
	content := firstbootEnvContent(cfg, &secrets.NodeSecrets{
		AtlasPasswordHash: "$atlas$hash",
		RootPasswordHash:  "$root$hash",
	})
	// Values keep their shell single-quoting from shellQuote.
	if !reflect.DeepEqual(parseEnvLines(content), map[string]string{
		"METIS_HOSTNAME":            "'titan-15'",
		"METIS_SSH_USER":            "'atlas'",
		"METIS_ATLAS_USER":          "'atlas'",
		"METIS_K3S_VERSION":         "'v1.31.5+k3s1'",
		"METIS_ATLAS_PASSWORD_HASH": "'$atlas$hash'",
		"METIS_ROOT_PASSWORD_HASH":  "'$root$hash'",
	}) {
		t.Fatalf("firstbootEnvContent = %q", content)
	}
}
// parseEnvLines splits KEY=VALUE lines into a map, skipping lines without an
// equals sign. Values keep everything after the first '='.
func parseEnvLines(raw string) map[string]string {
	out := map[string]string{}
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		if key, value, found := strings.Cut(line, "="); found {
			out[key] = value
		}
	}
	return out
}

View File

@ -15,10 +15,15 @@ import (
// NodeSecrets holds per-node secret material to inject at burn time. // NodeSecrets holds per-node secret material to inject at burn time.
// These should live in Vault at secret/data/nodes/<hostname>. // These should live in Vault at secret/data/nodes/<hostname>.
type NodeSecrets struct { type NodeSecrets struct {
SSHPassword string `json:"ssh_password,omitempty"` SSHPassword string `json:"ssh_password,omitempty"`
K3sToken string `json:"k3s_token,omitempty"` SSHPasswordHash string `json:"ssh_password_hash,omitempty"`
CloudInit string `json:"cloud_init,omitempty"` AtlasPassword string `json:"atlas_password,omitempty"`
Extra map[string]string `json:"extra,omitempty"` AtlasPasswordHash string `json:"atlas_password_hash,omitempty"`
RootPassword string `json:"root_password,omitempty"`
RootPasswordHash string `json:"root_password_hash,omitempty"`
K3sToken string `json:"k3s_token,omitempty"`
CloudInit string `json:"cloud_init,omitempty"`
Extra map[string]string `json:"extra,omitempty"`
} }
// Client fetches node secrets from Vault using either a token or AppRole. // Client fetches node secrets from Vault using either a token or AppRole.

View File

@ -16,9 +16,11 @@ func TestFetchNodeReturnsData(t *testing.T) {
_ = json.NewEncoder(w).Encode(map[string]any{ _ = json.NewEncoder(w).Encode(map[string]any{
"data": map[string]any{ "data": map[string]any{
"data": map[string]any{ "data": map[string]any{
"ssh_password": "p1", "ssh_password": "p1",
"k3s_token": "t1", "atlas_password_hash": "$atlas$hash",
"cloud_init": "ci", "root_password": "root-pw",
"k3s_token": "t1",
"cloud_init": "ci",
}, },
}, },
}) })
@ -33,7 +35,7 @@ func TestFetchNodeReturnsData(t *testing.T) {
if err != nil { if err != nil {
t.Fatalf("fetch: %v", err) t.Fatalf("fetch: %v", err)
} }
if sec.SSHPassword != "p1" || sec.K3sToken != "t1" || sec.CloudInit != "ci" { if sec.SSHPassword != "p1" || sec.AtlasPasswordHash != "$atlas$hash" || sec.RootPassword != "root-pw" || sec.K3sToken != "t1" || sec.CloudInit != "ci" {
t.Fatalf("unexpected secrets: %+v", sec) t.Fatalf("unexpected secrets: %+v", sec)
} }
} }

View File

@ -117,16 +117,24 @@ type App struct {
inventory *inventory.Inventory inventory *inventory.Inventory
metrics *Metrics metrics *Metrics
mu sync.RWMutex mu sync.RWMutex
jobs map[string]*Job jobs map[string]*Job
snapshots map[string]SnapshotRecord snapshots map[string]SnapshotRecord
targets map[string]facts.Targets targets map[string]facts.Targets
artifactStore map[string]ArtifactSummary artifactStore map[string]ArtifactSummary
deviceStore map[string]deviceSnapshot deviceStore map[string]deviceSnapshot
desiredMetadata map[string]DesiredNodeMetadata
} }
// NewApp creates a Metis service app instance. // NewApp creates a Metis service app instance.
func NewApp(settings Settings) (*App, error) { func NewApp(settings Settings) (*App, error) {
if strings.TrimSpace(settings.DesiredMetadataPath) == "" {
baseDir := filepath.Dir(settings.SnapshotsPath)
if strings.TrimSpace(baseDir) == "" || baseDir == "." {
baseDir = filepath.Dir(settings.HistoryPath)
}
settings.DesiredMetadataPath = filepath.Join(baseDir, "desired-node-metadata.json")
}
if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil { if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil {
return nil, err return nil, err
} }
@ -141,18 +149,20 @@ func NewApp(settings Settings) (*App, error) {
return nil, err return nil, err
} }
app := &App{ app := &App{
settings: settings, settings: settings,
inventory: inv, inventory: inv,
metrics: NewMetrics(), metrics: NewMetrics(),
jobs: map[string]*Job{}, jobs: map[string]*Job{},
snapshots: map[string]SnapshotRecord{}, snapshots: map[string]SnapshotRecord{},
targets: map[string]facts.Targets{}, targets: map[string]facts.Targets{},
artifactStore: map[string]ArtifactSummary{}, artifactStore: map[string]ArtifactSummary{},
deviceStore: map[string]deviceSnapshot{}, deviceStore: map[string]deviceSnapshot{},
desiredMetadata: map[string]DesiredNodeMetadata{},
} }
_ = app.loadSnapshots() _ = app.loadSnapshots()
_ = app.loadTargets() _ = app.loadTargets()
_ = app.loadArtifacts() _ = app.loadArtifacts()
_ = app.loadDesiredNodeMetadata()
return app, nil return app, nil
} }
@ -289,6 +299,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error {
if err := a.syncScratchAnnotations(record); err != nil { if err := a.syncScratchAnnotations(record); err != nil {
a.appendEvent(annotationSyncEvent(record.Node, err)) a.appendEvent(annotationSyncEvent(record.Node, err))
} }
if err := a.syncDesiredNodeMetadata(record); err != nil {
a.appendEvent(desiredNodeMetadataSyncEvent(record.Node, err))
}
a.appendEvent(Event{ a.appendEvent(Event{
Time: record.CollectedAt, Time: record.CollectedAt,
Kind: "sentinel.snapshot", Kind: "sentinel.snapshot",

View File

@ -22,6 +22,9 @@ type clusterNode struct {
Worker bool Worker bool
ControlPlane bool ControlPlane bool
Unschedulable bool Unschedulable bool
Labels map[string]string
Annotations map[string]string
Taints []string
USBScratchStatus string USBScratchStatus string
USBScratchManagedPaths string USBScratchManagedPaths string
} }
@ -179,6 +182,11 @@ func clusterNodes() []clusterNode {
} `json:"metadata"` } `json:"metadata"`
Spec struct { Spec struct {
Unschedulable bool `json:"unschedulable"` Unschedulable bool `json:"unschedulable"`
Taints []struct {
Key string `json:"key"`
Value string `json:"value"`
Effect string `json:"effect"`
} `json:"taints"`
} `json:"spec"` } `json:"spec"`
} `json:"items"` } `json:"items"`
} }
@ -189,6 +197,28 @@ func clusterNodes() []clusterNode {
for _, item := range payload.Items { for _, item := range payload.Items {
labels := item.Metadata.Labels labels := item.Metadata.Labels
annotations := item.Metadata.Annotations annotations := item.Metadata.Annotations
if labels == nil {
labels = map[string]string{}
}
if annotations == nil {
annotations = map[string]string{}
}
taints := make([]string, 0, len(item.Spec.Taints))
for _, taint := range item.Spec.Taints {
key := strings.TrimSpace(taint.Key)
if key == "" {
continue
}
raw := key
if value := strings.TrimSpace(taint.Value); value != "" {
raw += "=" + value
}
if effect := strings.TrimSpace(taint.Effect); effect != "" {
raw += ":" + effect
}
taints = append(taints, raw)
}
sort.Strings(taints)
nodes = append(nodes, clusterNode{ nodes = append(nodes, clusterNode{
Name: strings.TrimSpace(item.Metadata.Name), Name: strings.TrimSpace(item.Metadata.Name),
Arch: strings.TrimSpace(labels["kubernetes.io/arch"]), Arch: strings.TrimSpace(labels["kubernetes.io/arch"]),
@ -196,6 +226,9 @@ func clusterNodes() []clusterNode {
Worker: labels["node-role.kubernetes.io/worker"] == "true", Worker: labels["node-role.kubernetes.io/worker"] == "true",
ControlPlane: labels["node-role.kubernetes.io/control-plane"] != "" || labels["node-role.kubernetes.io/master"] != "", ControlPlane: labels["node-role.kubernetes.io/control-plane"] != "" || labels["node-role.kubernetes.io/master"] != "",
Unschedulable: item.Spec.Unschedulable, Unschedulable: item.Spec.Unschedulable,
Labels: labels,
Annotations: annotations,
Taints: taints,
USBScratchStatus: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-status"]), USBScratchStatus: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-status"]),
USBScratchManagedPaths: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-managed-paths"]), USBScratchManagedPaths: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-managed-paths"]),
}) })

View File

@ -37,6 +37,11 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) {
t.Fatal("expected persistTargets to fail when parent is a file") t.Fatal("expected persistTargets to fail when parent is a file")
} }
app.settings.DesiredMetadataPath = filepath.Join(fileParent, "desired-node-metadata.json")
if err := app.persistDesiredNodeMetadata(); err == nil {
t.Fatal("expected persistDesiredNodeMetadata to fail when parent is a file")
}
invalidArtifactState := filepath.Join(t.TempDir(), "artifacts.json") invalidArtifactState := filepath.Join(t.TempDir(), "artifacts.json")
if err := os.WriteFile(invalidArtifactState, []byte("{bad-json"), 0o644); err != nil { if err := os.WriteFile(invalidArtifactState, []byte("{bad-json"), 0o644); err != nil {
t.Fatal(err) t.Fatal(err)
@ -45,6 +50,15 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) {
if err := app.loadArtifacts(); err == nil { if err := app.loadArtifacts(); err == nil {
t.Fatal("expected loadArtifacts to reject invalid json") t.Fatal("expected loadArtifacts to reject invalid json")
} }
invalidDesiredState := filepath.Join(t.TempDir(), "desired-node-metadata.json")
if err := os.WriteFile(invalidDesiredState, []byte("{bad-json"), 0o644); err != nil {
t.Fatal(err)
}
app.settings.DesiredMetadataPath = invalidDesiredState
if err := app.loadDesiredNodeMetadata(); err == nil {
t.Fatal("expected loadDesiredNodeMetadata to reject invalid json")
}
} }
func TestServiceReplacementAndDeviceBranches(t *testing.T) { func TestServiceReplacementAndDeviceBranches(t *testing.T) {

View File

@ -162,6 +162,7 @@ nodes:
snapshotsPath := filepath.Join(dir, "snapshots.json") snapshotsPath := filepath.Join(dir, "snapshots.json")
targetsPath := filepath.Join(dir, "targets.json") targetsPath := filepath.Join(dir, "targets.json")
artifactStatePath := filepath.Join(dir, "artifacts.json") artifactStatePath := filepath.Join(dir, "artifacts.json")
desiredMetadataPath := filepath.Join(dir, "desired-node-metadata.json")
seedSnapshots := map[string]SnapshotRecord{ seedSnapshots := map[string]SnapshotRecord{
"titan-15": { "titan-15": {
@ -190,19 +191,33 @@ nodes:
if err := os.WriteFile(artifactStatePath, data, 0o644); err != nil { if err := os.WriteFile(artifactStatePath, data, 0o644); err != nil {
t.Fatal(err) t.Fatal(err)
} }
seedDesiredMetadata := map[string]DesiredNodeMetadata{
"titan-15": {
Node: "titan-15",
Hostname: "titan-15",
CapturedAt: testTime(t),
Labels: map[string]string{"hardware": "rpi5"},
Taints: []string{"dedicated=recovery:NoSchedule"},
},
}
data, _ = json.MarshalIndent(seedDesiredMetadata, "", " ")
if err := os.WriteFile(desiredMetadataPath, data, 0o644); err != nil {
t.Fatal(err)
}
app, err := NewApp(Settings{ app, err := NewApp(Settings{
InventoryPath: invPath, InventoryPath: invPath,
CacheDir: filepath.Join(dir, "cache"), CacheDir: filepath.Join(dir, "cache"),
ArtifactDir: filepath.Join(dir, "artifacts"), ArtifactDir: filepath.Join(dir, "artifacts"),
ArtifactStatePath: artifactStatePath, ArtifactStatePath: artifactStatePath,
HistoryPath: filepath.Join(dir, "history.jsonl"), HistoryPath: filepath.Join(dir, "history.jsonl"),
SnapshotsPath: snapshotsPath, SnapshotsPath: snapshotsPath,
TargetsPath: targetsPath, TargetsPath: targetsPath,
DefaultFlashHost: "titan-22", DesiredMetadataPath: desiredMetadataPath,
FlashHosts: []string{"titan-22"}, DefaultFlashHost: "titan-22",
LocalHost: "titan-22", FlashHosts: []string{"titan-22"},
AllowedGroups: []string{"admin"}, LocalHost: "titan-22",
AllowedGroups: []string{"admin"},
}) })
if err != nil { if err != nil {
t.Fatalf("NewApp: %v", err) t.Fatalf("NewApp: %v", err)
@ -211,6 +226,9 @@ nodes:
if got := app.artifacts()["titan-15"].Ref; got != "reg/proj/titan-15:latest" { if got := app.artifacts()["titan-15"].Ref; got != "reg/proj/titan-15:latest" {
t.Fatalf("artifacts() = %q", got) t.Fatalf("artifacts() = %q", got)
} }
if desired, ok := app.desiredMetadataForNode("titan-15"); !ok || desired.Labels["hardware"] != "rpi5" {
t.Fatalf("desiredMetadataForNode() = %#v ok=%v", desired, ok)
}
if err := app.recordArtifact(ArtifactSummary{Node: "titan-15", Ref: "reg/proj/titan-15:v2"}); err != nil { if err := app.recordArtifact(ArtifactSummary{Node: "titan-15", Ref: "reg/proj/titan-15:v2"}); err != nil {
t.Fatalf("recordArtifact: %v", err) t.Fatalf("recordArtifact: %v", err)
} }

View File

@ -0,0 +1,483 @@
package service
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
"metis/pkg/config"
)
// DesiredNodeMetadata captures the node identity Metis should preserve through
// recovery builds and re-assert after the node rejoins the cluster.
type DesiredNodeMetadata struct {
	// Node is the inventory node name this record belongs to.
	Node string `json:"node"`
	// Hostname is the desired hostname for the rebuilt node.
	Hostname string `json:"hostname,omitempty"`
	// CapturedAt records when this identity was last staged.
	CapturedAt time.Time `json:"captured_at,omitempty"`
	// Labels holds the restorable (non-system-managed) node labels.
	Labels map[string]string `json:"labels,omitempty"`
	// Annotations holds the restorable (non-system-managed) node annotations.
	Annotations map[string]string `json:"annotations,omitempty"`
	// Taints holds restorable taints in key[=value][:effect] string form.
	Taints []string `json:"taints,omitempty"`
	// Unschedulable mirrors the node's desired cordon state.
	Unschedulable bool `json:"unschedulable,omitempty"`
}
// loadDesiredNodeMetadata reads the persisted desired-metadata map from
// settings.DesiredMetadataPath and swaps it into the app state. Read and
// parse errors (including a missing file) are returned to the caller.
func (a *App) loadDesiredNodeMetadata() error {
	raw, err := os.ReadFile(a.settings.DesiredMetadataPath)
	if err != nil {
		return err
	}
	var parsed map[string]DesiredNodeMetadata
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return err
	}
	a.mu.Lock()
	defer a.mu.Unlock()
	a.desiredMetadata = parsed
	return nil
}
// persistDesiredNodeMetadata writes the current desired-metadata map to
// disk as indented JSON, creating the parent directory when needed.
func (a *App) persistDesiredNodeMetadata() error {
	a.mu.RLock()
	payload, marshalErr := json.MarshalIndent(a.desiredMetadata, "", " ")
	a.mu.RUnlock()
	if marshalErr != nil {
		return marshalErr
	}
	target := a.settings.DesiredMetadataPath
	if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
		return err
	}
	return os.WriteFile(target, payload, 0o644)
}
// desiredMetadataForNode returns a defensive copy of the stored desired
// metadata for node, or false for blank names or nodes with no record.
func (a *App) desiredMetadataForNode(node string) (DesiredNodeMetadata, bool) {
	name := strings.TrimSpace(node)
	if name == "" {
		return DesiredNodeMetadata{}, false
	}
	a.mu.RLock()
	defer a.mu.RUnlock()
	if desired, found := a.desiredMetadata[name]; found {
		return cloneDesiredNodeMetadata(desired), true
	}
	return DesiredNodeMetadata{}, false
}
// stageDesiredNodeMetadata captures and persists the identity Metis should
// restore for nodeName after a recovery rebuild. Sources are layered in
// ascending priority: inventory-declared labels/taints, any previously
// captured record, then the live cluster object (most authoritative). The
// merged record is persisted to disk before a copy is returned.
func (a *App) stageDesiredNodeMetadata(nodeName string) (DesiredNodeMetadata, error) {
	nodeName = strings.TrimSpace(nodeName)
	if nodeName == "" {
		return DesiredNodeMetadata{}, fmt.Errorf("node metadata requires a node name")
	}
	nodeSpec, _, err := a.inventory.FindNode(nodeName)
	if err != nil {
		return DesiredNodeMetadata{}, err
	}
	cfg, err := config.Build(a.inventory, nodeName)
	if err != nil {
		return DesiredNodeMetadata{}, err
	}
	// Base record from inventory: only restorable (non-system) labels/taints.
	desired := DesiredNodeMetadata{
		Node:       nodeName,
		Hostname:   strings.TrimSpace(nodeSpec.Hostname),
		CapturedAt: time.Now().UTC(),
		Labels:     filteredRestorableLabels(cfg.Labels),
		Taints:     restorableTaints(cfg.Taints),
	}
	// A prior capture (e.g. annotations not expressible in inventory) is
	// layered over the inventory base...
	if existing, ok := a.desiredMetadataForNode(nodeName); ok {
		desired = mergeDesiredNodeMetadata(desired, existing)
	}
	// ...and live cluster state overrides both when the node is reachable.
	if live, ok := liveClusterNode(nodeName); ok {
		desired = mergeDesiredNodeMetadata(desired, desiredMetadataFromCluster(*live))
	}
	desired.Labels = normalizeStringMap(desired.Labels)
	desired.Annotations = normalizeStringMap(desired.Annotations)
	desired.Taints = normalizeTaints(desired.Taints)
	// Store under the write lock; persistDesiredNodeMetadata re-acquires a
	// read lock itself, so it must be called after Unlock.
	a.mu.Lock()
	if a.desiredMetadata == nil {
		a.desiredMetadata = map[string]DesiredNodeMetadata{}
	}
	a.desiredMetadata[nodeName] = desired
	a.mu.Unlock()
	if err := a.persistDesiredNodeMetadata(); err != nil {
		return DesiredNodeMetadata{}, err
	}
	return cloneDesiredNodeMetadata(desired), nil
}
// syncDesiredNodeMetadata re-asserts the stored desired metadata for the
// node in record against its live cluster object. Nodes with no stored
// record, or not currently present in the cluster, are a no-op.
func (a *App) syncDesiredNodeMetadata(record SnapshotRecord) error {
	desired, haveDesired := a.desiredMetadataForNode(record.Node)
	if !haveDesired {
		return nil
	}
	live, haveLive := liveClusterNode(record.Node)
	if !haveLive {
		return nil
	}
	return patchDesiredNodeMetadata(*live, desired)
}
// desiredMetadataFromCluster derives a restorable-identity snapshot from a
// live cluster node, filtering out system-managed labels, annotations, and
// taints.
func desiredMetadataFromCluster(node clusterNode) DesiredNodeMetadata {
	desired := DesiredNodeMetadata{
		Node:          strings.TrimSpace(node.Name),
		Unschedulable: node.Unschedulable,
	}
	desired.Labels = filteredRestorableLabels(node.Labels)
	desired.Annotations = filteredRestorableAnnotations(node.Annotations)
	desired.Taints = restorableTaints(node.Taints)
	return desired
}
// overlayTrimmedKeys copies overlay's entries into dst (allocating dst when
// nil), trimming keys and values; entries with blank keys are skipped.
func overlayTrimmedKeys(dst, overlay map[string]string) map[string]string {
	if dst == nil {
		dst = map[string]string{}
	}
	for key, value := range overlay {
		if key = strings.TrimSpace(key); key == "" {
			continue
		}
		dst[key] = strings.TrimSpace(value)
	}
	return dst
}

// mergeDesiredNodeMetadata layers overlay on top of base: non-empty overlay
// scalars win, overlay map entries are merged key-by-key, and a non-empty
// overlay taint list replaces the base list entirely.
func mergeDesiredNodeMetadata(base, overlay DesiredNodeMetadata) DesiredNodeMetadata {
	merged := cloneDesiredNodeMetadata(base)
	if hostname := strings.TrimSpace(overlay.Hostname); hostname != "" {
		merged.Hostname = hostname
	}
	if !overlay.CapturedAt.IsZero() {
		merged.CapturedAt = overlay.CapturedAt
	}
	// The duplicated trim-merge loops for labels and annotations are folded
	// into one helper; behavior is unchanged.
	merged.Labels = overlayTrimmedKeys(merged.Labels, overlay.Labels)
	merged.Annotations = overlayTrimmedKeys(merged.Annotations, overlay.Annotations)
	if len(overlay.Taints) > 0 {
		merged.Taints = normalizeTaints(overlay.Taints)
	}
	// Unschedulable always takes the overlay's value: bool has no "unset"
	// state, and later merge sources (live cluster state) are authoritative.
	merged.Unschedulable = overlay.Unschedulable
	return merged
}
// patchDesiredNodeMetadata diffs the live node against the desired identity
// and applies a single merge-patch when anything diverges. System-managed
// labels/annotations/taints are never touched, and a fully in-sync node
// results in no API call at all.
func patchDesiredNodeMetadata(live clusterNode, desired DesiredNodeMetadata) error {
	node := strings.TrimSpace(desired.Node)
	if node == "" {
		node = strings.TrimSpace(live.Name)
	}
	if node == "" {
		return nil
	}
	// nil values in these maps delete the key server-side (merge-patch
	// semantics), so removed restorable keys are tombstoned.
	labelPatch := metadataStringPatch(live.Labels, desired.Labels, isRestorableLabel)
	annotationPatch := metadataStringPatch(live.Annotations, desired.Annotations, isRestorableAnnotation)
	// Keep live system taints, replace restorable ones with the desired set.
	mergedTaints := mergeLiveAndDesiredTaints(live.Taints, desired.Taints)
	body := map[string]any{}
	metadata := map[string]any{}
	if len(labelPatch) > 0 {
		metadata["labels"] = labelPatch
	}
	if len(annotationPatch) > 0 {
		metadata["annotations"] = annotationPatch
	}
	if len(metadata) > 0 {
		body["metadata"] = metadata
	}
	spec := map[string]any{}
	if live.Unschedulable != desired.Unschedulable {
		spec["unschedulable"] = desired.Unschedulable
	}
	if !sameTaints(live.Taints, mergedTaints) {
		// Taint lists are replaced wholesale: merge-patch cannot edit
		// individual list items.
		spec["taints"] = taintPatchPayload(mergedTaints)
	}
	if len(spec) > 0 {
		body["spec"] = spec
	}
	// Nothing diverged: skip the kube round-trip entirely.
	if len(body) == 0 {
		return nil
	}
	kube, err := kubeClientFactory()
	if err != nil {
		return err
	}
	return kube.mergePatch("/api/v1/nodes/"+node, body)
}
// metadataStringPatch builds a merge-patch fragment for one string map:
// desired entries (passing allow) whose live value differs are set, and
// live keys (passing allow) absent from desired are tombstoned with nil so
// the server deletes them.
func metadataStringPatch(live, desired map[string]string, allow func(string) bool) map[string]any {
	patch := map[string]any{}
	for rawKey, rawValue := range desired {
		key := strings.TrimSpace(rawKey)
		if key == "" || !allow(key) {
			continue
		}
		want := strings.TrimSpace(rawValue)
		if have := strings.TrimSpace(live[key]); have != want {
			patch[key] = want
		}
	}
	for rawKey := range live {
		key := strings.TrimSpace(rawKey)
		if key == "" || !allow(key) {
			continue
		}
		if _, keep := desired[key]; !keep {
			patch[key] = nil
		}
	}
	return patch
}
// liveClusterNode finds the current cluster node with the given (trimmed)
// name; blank names and unknown nodes return false. The returned pointer
// addresses a private copy, never the slice element from clusterNodes().
func liveClusterNode(node string) (*clusterNode, bool) {
	name := strings.TrimSpace(node)
	if name == "" {
		return nil, false
	}
	for _, candidate := range clusterNodes() {
		if strings.TrimSpace(candidate.Name) != name {
			continue
		}
		found := candidate
		return &found, true
	}
	return nil, false
}
// filteredRestorableLabels keeps only labels whose trimmed keys pass
// isRestorableLabel; values are trimmed as well.
func filteredRestorableLabels(values map[string]string) map[string]string {
	kept := map[string]string{}
	for rawKey, rawValue := range values {
		if key := strings.TrimSpace(rawKey); key != "" && isRestorableLabel(key) {
			kept[key] = strings.TrimSpace(rawValue)
		}
	}
	return kept
}
// filteredRestorableAnnotations keeps only annotations whose trimmed keys
// pass isRestorableAnnotation; values are trimmed as well.
func filteredRestorableAnnotations(values map[string]string) map[string]string {
	kept := map[string]string{}
	for rawKey, rawValue := range values {
		if key := strings.TrimSpace(rawKey); key != "" && isRestorableAnnotation(key) {
			kept[key] = strings.TrimSpace(rawValue)
		}
	}
	return kept
}
// normalizeStringMap trims keys and values and drops blank keys; empty
// inputs and empty results collapse to nil so they serialize as absent.
func normalizeStringMap(values map[string]string) map[string]string {
	if len(values) == 0 {
		return nil
	}
	out := make(map[string]string, len(values))
	for rawKey, rawValue := range values {
		key := strings.TrimSpace(rawKey)
		if key == "" {
			continue
		}
		out[key] = strings.TrimSpace(rawValue)
	}
	if len(out) == 0 {
		return nil
	}
	return out
}
// restorableTaints drops blank and system-managed taints, then dedupes and
// sorts the remainder via normalizeTaints.
func restorableTaints(values []string) []string {
	kept := make([]string, 0, len(values))
	for _, raw := range values {
		taint := normalizeTaint(raw)
		if taint != "" && isRestorableTaint(taint) {
			kept = append(kept, taint)
		}
	}
	return normalizeTaints(kept)
}
// normalizeTaints trims, dedupes, and sorts taint strings; empty inputs and
// empty results collapse to nil.
func normalizeTaints(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	var out []string
	seen := make(map[string]struct{}, len(values))
	for _, raw := range values {
		taint := normalizeTaint(raw)
		if taint == "" {
			continue
		}
		if _, dup := seen[taint]; dup {
			continue
		}
		seen[taint] = struct{}{}
		out = append(out, taint)
	}
	sort.Strings(out)
	return out
}
// normalizeTaint canonicalizes a raw taint string by trimming whitespace.
func normalizeTaint(value string) string {
	trimmed := strings.TrimSpace(value)
	return trimmed
}
// sameTaints reports whether two taint lists are equal after normalization,
// i.e. ignoring order, duplicates, and surrounding whitespace.
func sameTaints(left, right []string) bool {
	l := normalizeTaints(left)
	r := normalizeTaints(right)
	if len(l) != len(r) {
		return false
	}
	for i, taint := range l {
		if r[i] != taint {
			return false
		}
	}
	return true
}
// mergeLiveAndDesiredTaints keeps the live system-managed taints (which
// Metis must not fight the control plane over) and replaces all restorable
// taints with the desired set, returning a normalized list.
func mergeLiveAndDesiredTaints(live, desired []string) []string {
	combined := make([]string, 0, len(live)+len(desired))
	for _, raw := range live {
		if taint := normalizeTaint(raw); taint != "" && !isRestorableTaint(taint) {
			combined = append(combined, taint)
		}
	}
	combined = append(combined, restorableTaints(desired)...)
	return normalizeTaints(combined)
}
// taintPatchPayload converts taint strings into the object form the
// Kubernetes node spec expects; entries without a key are dropped.
func taintPatchPayload(values []string) []map[string]string {
	out := make([]map[string]string, 0, len(values))
	for _, raw := range normalizeTaints(values) {
		key, value, effect := splitTaint(raw)
		if key == "" {
			continue
		}
		item := map[string]string{"key": key}
		if value != "" {
			item["value"] = value
		}
		if effect != "" {
			item["effect"] = effect
		}
		out = append(out, item)
	}
	return out
}
// splitTaint parses "key[=value][:effect]" into its parts, trimming each.
// The effect separator is the LAST colon; the value separator is the first
// "=" before it. Blank input yields three empty strings.
func splitTaint(raw string) (string, string, string) {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return "", "", ""
	}
	body, effect := raw, ""
	if idx := strings.LastIndex(raw, ":"); idx >= 0 {
		body = strings.TrimSpace(raw[:idx])
		effect = strings.TrimSpace(raw[idx+1:])
	}
	key, value, found := strings.Cut(body, "=")
	if !found {
		return strings.TrimSpace(body), "", effect
	}
	return strings.TrimSpace(key), strings.TrimSpace(value), effect
}
// isRestorableTaint reports whether a taint is safe for Metis to re-assert.
// Taints owned by the node lifecycle controller, cloud provider, or cluster
// autoscaler are excluded so Metis never fights the control plane.
func isRestorableTaint(raw string) bool {
	key, _, _ := splitTaint(raw)
	if key == "" {
		return false
	}
	systemPrefixes := []string{
		"node.kubernetes.io/",
		"node.cloudprovider.kubernetes.io/",
		"ToBeDeletedByClusterAutoscaler",
	}
	for _, prefix := range systemPrefixes {
		if strings.HasPrefix(key, prefix) {
			return false
		}
	}
	return true
}
// isRestorableLabel reports whether a label key should be captured and
// re-asserted. Role labels are always restorable; kubelet/system-owned
// prefixes never are.
func isRestorableLabel(key string) bool {
	key = strings.TrimSpace(key)
	switch {
	case key == "":
		return false
	case strings.HasPrefix(key, "node-role.kubernetes.io/"):
		return true
	}
	systemPrefixes := []string{
		"kubernetes.io/",
		"beta.kubernetes.io/",
		"node.kubernetes.io/",
		"topology.kubernetes.io/",
		"feature.node.kubernetes.io/",
		"failure-domain.beta.kubernetes.io/",
		"nvidia.com/",
		"k3s.io/",
		"rke2.io/",
		"volumes.kubernetes.io/",
		"node.cloudprovider.kubernetes.io/",
	}
	for _, prefix := range systemPrefixes {
		if strings.HasPrefix(key, prefix) {
			return false
		}
	}
	return true
}
// isRestorableAnnotation reports whether an annotation key should be
// captured and re-asserted; controller- and CNI-owned prefixes are excluded.
func isRestorableAnnotation(key string) bool {
	key = strings.TrimSpace(key)
	if key == "" {
		return false
	}
	systemPrefixes := []string{
		"kubectl.kubernetes.io/",
		"kubeadm.alpha.kubernetes.io/",
		"kubernetes.io/",
		"node.alpha.kubernetes.io/",
		"node.kubernetes.io/",
		"volumes.kubernetes.io/",
		"csi.volume.kubernetes.io/",
		"csi.storage.k8s.io/",
		"flannel.alpha.coreos.com/",
		"projectcalico.org/",
		"rke2.io/",
		"k3s.io/",
		"nvidia.com/",
	}
	for _, prefix := range systemPrefixes {
		if strings.HasPrefix(key, prefix) {
			return false
		}
	}
	return true
}
// cloneDesiredNodeMetadata returns a deep copy with normalized maps and
// slices so callers can mutate the result without aliasing stored state.
func cloneDesiredNodeMetadata(value DesiredNodeMetadata) DesiredNodeMetadata {
	copied := value
	copied.Labels = normalizeStringMap(value.Labels)
	copied.Annotations = normalizeStringMap(value.Annotations)
	copied.Taints = normalizeTaints(value.Taints)
	return copied
}
// desiredNodeMetadataSyncEvent builds the event recorded when re-asserting
// desired node metadata fails for a node.
func desiredNodeMetadataSyncEvent(node string, err error) Event {
	details := map[string]any{
		"node":  node,
		"error": err.Error(),
	}
	return Event{
		Time:    time.Now().UTC(),
		Kind:    "sentinel.node-metadata",
		Summary: fmt.Sprintf("Could not restore desired node metadata for %s", node),
		Details: details,
	}
}

View File

@ -0,0 +1,254 @@
package service
import (
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"reflect"
"strings"
"testing"
"time"
"metis/pkg/sentinel"
)
// TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster verifies that
// staging layers inventory labels/taints, a previously captured record, and
// live cluster state (highest priority); filters out system-managed keys;
// and persists the merged record to DesiredMetadataPath.
func TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster(t *testing.T) {
	// Fake kube API: one node whose labels/annotations/taints mix restorable
	// and system-managed entries.
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": []any{
					map[string]any{
						"metadata": map[string]any{
							"name": "titan-15",
							"labels": map[string]string{
								"hardware":                       "rpi5",
								"rack":                           "a1",
								"maintenance.bstein.dev/color":   "blue",
								"kubernetes.io/arch":             "arm64",
								"node-role.kubernetes.io/worker": "true",
							},
							"annotations": map[string]string{
								"maintenance.bstein.dev/owner": "atlas",
								"volumes.kubernetes.io/controller-managed-attach-detach": "true",
							},
						},
						"spec": map[string]any{
							"unschedulable": true,
							"taints": []any{
								map[string]any{"key": "dedicated", "value": "recovery", "effect": "NoSchedule"},
								map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"},
							},
						},
					},
				},
			})
		default:
			http.NotFound(w, r)
		}
	}))
	defer kube.Close()
	installKubeFactory(t, kube)
	app := newTestApp(t)
	// Inventory declares older hardware plus a flash taint; live cluster
	// state should supersede both.
	app.inventory.Nodes[0].Labels = map[string]string{"hardware": "rpi4", "rack": "a1"}
	app.inventory.Nodes[0].Taints = []string{"flash=true:NoSchedule"}
	// A prior capture carries an annotation not available anywhere else.
	app.desiredMetadata["titan-15"] = DesiredNodeMetadata{
		Node:        "titan-15",
		Annotations: map[string]string{"maintenance.bstein.dev/legacy": "keep"},
	}
	desired, err := app.stageDesiredNodeMetadata("titan-15")
	if err != nil {
		t.Fatalf("stageDesiredNodeMetadata: %v", err)
	}
	if desired.Hostname != "titan-15" || !desired.Unschedulable {
		t.Fatalf("unexpected desired metadata header: %#v", desired)
	}
	// Live labels win ("rpi5"); restorable custom labels are kept.
	if desired.Labels["hardware"] != "rpi5" || desired.Labels["rack"] != "a1" || desired.Labels["maintenance.bstein.dev/color"] != "blue" {
		t.Fatalf("unexpected desired labels: %#v", desired.Labels)
	}
	if _, ok := desired.Labels["kubernetes.io/arch"]; ok {
		t.Fatalf("system labels should not be persisted: %#v", desired.Labels)
	}
	// Live annotation and previously captured annotation both survive.
	if desired.Annotations["maintenance.bstein.dev/owner"] != "atlas" || desired.Annotations["maintenance.bstein.dev/legacy"] != "keep" {
		t.Fatalf("unexpected desired annotations: %#v", desired.Annotations)
	}
	if _, ok := desired.Annotations["volumes.kubernetes.io/controller-managed-attach-detach"]; ok {
		t.Fatalf("controller annotations should not be persisted: %#v", desired.Annotations)
	}
	// Only the restorable live taint remains; system taint filtered out.
	if !reflect.DeepEqual(desired.Taints, []string{"dedicated=recovery:NoSchedule"}) {
		t.Fatalf("unexpected desired taints: %#v", desired.Taints)
	}
	// Staging persists the record to disk.
	data, err := os.ReadFile(app.settings.DesiredMetadataPath)
	if err != nil {
		t.Fatalf("read desired metadata file: %v", err)
	}
	if !strings.Contains(string(data), "titan-15") {
		t.Fatalf("desired metadata file missing titan-15: %s", string(data))
	}
}
// TestStoreSnapshotRestoresDesiredNodeMetadata verifies that storing a
// sentinel snapshot triggers a node merge-patch restoring desired labels,
// annotations, taints, and schedulability — while preserving live
// system-managed taints.
func TestStoreSnapshotRestoresDesiredNodeMetadata(t *testing.T) {
	var patchBody map[string]any
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": []any{
					map[string]any{
						"metadata": map[string]any{
							"name": "titan-15",
							"labels": map[string]string{
								"hardware":                   "rpi4",
								"maintenance.bstein.dev/old": "1",
							},
							"annotations": map[string]string{
								"maintenance.bstein.dev/mode": "old",
							},
						},
						"spec": map[string]any{
							"unschedulable": true,
							"taints": []any{
								map[string]any{"key": "dedicated", "value": "old", "effect": "NoSchedule"},
								map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"},
							},
						},
					},
				},
			})
		case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/nodes/titan-15":
			if err := json.NewDecoder(r.Body).Decode(&patchBody); err != nil {
				// Fix: t.Fatalf calls FailNow, which must only run on the
				// goroutine running the test; httptest handlers run on
				// server goroutines. Record the failure with t.Errorf and
				// answer with an error response instead.
				t.Errorf("decode patch: %v", err)
				http.Error(w, err.Error(), http.StatusBadRequest)
				return
			}
			_ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"})
		default:
			http.NotFound(w, r)
		}
	}))
	defer kube.Close()
	installKubeFactory(t, kube)
	app := newTestApp(t)
	// Desired identity diverges from live state in every dimension.
	app.desiredMetadata["titan-15"] = DesiredNodeMetadata{
		Node:          "titan-15",
		Hostname:      "titan-15",
		Labels:        map[string]string{"hardware": "rpi5"},
		Annotations:   map[string]string{"maintenance.bstein.dev/mode": "recovery"},
		Taints:        []string{"dedicated=recovery:NoSchedule"},
		Unschedulable: false,
	}
	if err := app.StoreSnapshot(SnapshotRecord{
		Node:        "titan-15",
		CollectedAt: time.Date(2026, 4, 24, 6, 0, 0, 0, time.UTC),
		Snapshot:    sentinel.Snapshot{Hostname: "titan-15"},
	}); err != nil {
		t.Fatalf("StoreSnapshot: %v", err)
	}
	if patchBody == nil {
		t.Fatal("expected desired metadata patch")
	}
	metadata := patchBody["metadata"].(map[string]any)
	// Changed label set; removed restorable label tombstoned with nil.
	labels := metadata["labels"].(map[string]any)
	if labels["hardware"] != "rpi5" || labels["maintenance.bstein.dev/old"] != nil {
		t.Fatalf("unexpected label patch: %#v", labels)
	}
	annotations := metadata["annotations"].(map[string]any)
	if annotations["maintenance.bstein.dev/mode"] != "recovery" {
		t.Fatalf("unexpected annotation patch: %#v", annotations)
	}
	spec := patchBody["spec"].(map[string]any)
	if spec["unschedulable"] != false {
		t.Fatalf("unexpected spec patch: %#v", spec)
	}
	// Taints are replaced wholesale: desired restorable taint plus the
	// preserved system taint.
	taints := spec["taints"].([]any)
	if len(taints) != 2 {
		t.Fatalf("unexpected taint payload: %#v", taints)
	}
	entries := map[string]map[string]any{}
	for _, raw := range taints {
		entry := raw.(map[string]any)
		key := entry["key"].(string)
		entries[key] = entry
	}
	if entries["dedicated"]["value"] != "recovery" || entries["dedicated"]["effect"] != "NoSchedule" {
		t.Fatalf("missing desired taint replacement: %#v", entries)
	}
	if entries["node.kubernetes.io/unreachable"]["effect"] != "NoExecute" {
		t.Fatalf("system taint should be preserved: %#v", entries)
	}
}
// TestDesiredNodeMetadataHelpers exercises the small pure helpers behind
// desired-node-metadata staging and restoration, plus a few App-level
// no-op branches.
func TestDesiredNodeMetadataHelpers(t *testing.T) {
	app := newTestApp(t)
	// Lookups and sync are no-ops for unknown or blank node names.
	if _, ok := app.desiredMetadataForNode("missing"); ok {
		t.Fatal("expected no desired metadata for missing node")
	}
	if err := app.syncDesiredNodeMetadata(SnapshotRecord{Node: "missing"}); err != nil {
		t.Fatalf("syncDesiredNodeMetadata missing should noop: %v", err)
	}
	if _, ok := liveClusterNode(""); ok {
		t.Fatal("empty liveClusterNode lookup should fail")
	}
	// Restoration filters: system-managed keys/taints are never restorable.
	if !isRestorableLabel("maintenance.bstein.dev/role") || isRestorableLabel("kubernetes.io/arch") {
		t.Fatal("unexpected label restoration filter")
	}
	if !isRestorableAnnotation("maintenance.bstein.dev/state") || isRestorableAnnotation("volumes.kubernetes.io/foo") {
		t.Fatal("unexpected annotation restoration filter")
	}
	if !isRestorableTaint("dedicated=recovery:NoSchedule") || isRestorableTaint("node.kubernetes.io/not-ready:NoExecute") {
		t.Fatal("unexpected taint restoration filter")
	}
	// splitTaint handles both key=value:effect and bare-key forms.
	key, value, effect := splitTaint("dedicated=recovery:NoSchedule")
	if key != "dedicated" || value != "recovery" || effect != "NoSchedule" {
		t.Fatalf("splitTaint mismatch: %q %q %q", key, value, effect)
	}
	if key, value, effect := splitTaint("just-a-key"); key != "just-a-key" || value != "" || effect != "" {
		t.Fatalf("splitTaint key-only mismatch: %q %q %q", key, value, effect)
	}
	labels := filteredRestorableLabels(map[string]string{"hardware": "rpi5", "kubernetes.io/arch": "arm64"})
	if !reflect.DeepEqual(labels, map[string]string{"hardware": "rpi5"}) {
		t.Fatalf("filteredRestorableLabels = %#v", labels)
	}
	annotations := filteredRestorableAnnotations(map[string]string{"maintenance.bstein.dev/state": "ok", "volumes.kubernetes.io/foo": "bar"})
	if !reflect.DeepEqual(annotations, map[string]string{"maintenance.bstein.dev/state": "ok"}) {
		t.Fatalf("filteredRestorableAnnotations = %#v", annotations)
	}
	// Patch computation: changed keys set, removed restorable keys tombstoned.
	patch := metadataStringPatch(
		map[string]string{"hardware": "rpi4", "maintenance.bstein.dev/old": "1"},
		map[string]string{"hardware": "rpi5"},
		isRestorableLabel,
	)
	if patch["hardware"] != "rpi5" || patch["maintenance.bstein.dev/old"] != nil {
		t.Fatalf("metadataStringPatch = %#v", patch)
	}
	// Merging keeps live system taints, replaces restorable ones, dedupes.
	mergedTaints := mergeLiveAndDesiredTaints(
		[]string{"node.kubernetes.io/unreachable:NoExecute", "dedicated=old:NoSchedule"},
		[]string{"dedicated=new:NoSchedule", "dedicated=new:NoSchedule"},
	)
	if !reflect.DeepEqual(mergedTaints, []string{"dedicated=new:NoSchedule", "node.kubernetes.io/unreachable:NoExecute"}) {
		t.Fatalf("mergeLiveAndDesiredTaints = %#v", mergedTaints)
	}
	payload := taintPatchPayload([]string{"dedicated=new:NoSchedule"})
	if len(payload) != 1 || payload[0]["key"] != "dedicated" || payload[0]["value"] != "new" || payload[0]["effect"] != "NoSchedule" {
		t.Fatalf("taintPatchPayload = %#v", payload)
	}
	// Clones must not alias the original's maps/slices.
	original := DesiredNodeMetadata{Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}}
	cloned := cloneDesiredNodeMetadata(original)
	cloned.Labels["hardware"] = "mutated"
	cloned.Taints[0] = "changed"
	if original.Labels["hardware"] != "rpi5" || original.Taints[0] != "dedicated=new:NoSchedule" {
		t.Fatalf("cloneDesiredNodeMetadata should deep-copy slices/maps: %#v %#v", original, cloned)
	}
	// An in-sync node produces an empty patch body and thus no kube call.
	if err := patchDesiredNodeMetadata(
		clusterNode{Name: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}},
		DesiredNodeMetadata{Node: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}},
	); err != nil {
		t.Fatalf("patchDesiredNodeMetadata should noop when already in sync: %v", err)
	}
	if event := desiredNodeMetadataSyncEvent("titan-15", os.ErrPermission); event.Kind != "sentinel.node-metadata" || event.Details["node"] != "titan-15" {
		t.Fatalf("desiredNodeMetadataSyncEvent = %#v", event)
	}
}

View File

@ -74,12 +74,17 @@ func (a *App) RefreshDevices(host string) ([]Device, error) {
} }
func (a *App) runBuild(job *Job, flash bool) { func (a *App) runBuild(job *Job, flash bool) {
_, class, err := a.inventory.FindNode(job.Node) nodeSpec, class, err := a.inventory.FindNode(job.Node)
if err != nil { if err != nil {
a.failJob(job.ID, err) a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error") a.metrics.RecordBuild(job.Node, "error")
return return
} }
if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil {
a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error")
return
}
if err := a.ensureHarborProject(); err != nil { if err := a.ensureHarborProject(); err != nil {
a.failJob(job.ID, err) a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error") a.metrics.RecordBuild(job.Node, "error")
@ -112,7 +117,8 @@ func (a *App) runBuild(job *Job, flash bool) {
return return
} }
buildPod := fmt.Sprintf("metis-build-%d", time.Now().UTC().UnixNano()) buildPod := fmt.Sprintf("metis-build-%d", time.Now().UTC().UnixNano())
logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, artifactRef, buildTag)) job.Builder = builder.Name
logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, strings.TrimSpace(nodeSpec.Hostname), artifactRef, buildTag))
if err != nil { if err != nil {
a.failJob(job.ID, err) a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error") a.metrics.RecordBuild(job.Node, "error")
@ -183,6 +189,9 @@ func (a *App) runFlash(job *Job) {
} }
func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) { func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) {
if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil {
return RemoteFlashResult{}, err
}
a.setJob(job.ID, func(j *Job) { a.setJob(job.ID, func(j *Job) {
j.Status = JobRunning j.Status = JobRunning
j.Stage = "preflight" j.Stage = "preflight"

View File

@ -1,6 +1,7 @@
package service package service
import ( import (
"encoding/json"
"fmt" "fmt"
"math" "math"
"path/filepath" "path/filepath"
@ -246,8 +247,9 @@ func (a *App) remoteDevicePodSpec(name, host, image string) map[string]any {
} }
} }
func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag string) map[string]any { func (a *App) remoteBuildPodSpec(name, host, image, node, nodeHostname, artifactRef, buildTag string) map[string]any {
workspaceHostPath := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name) workspaceHostPath := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name)
desiredEnv := remoteDesiredMetadataEnv(a, node)
return map[string]any{ return map[string]any{
"apiVersion": "v1", "apiVersion": "v1",
"kind": "Pod", "kind": "Pod",
@ -255,7 +257,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag
"name": name, "name": name,
"namespace": a.settings.Namespace, "namespace": a.settings.Namespace,
"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}, "labels": map[string]string{"app": "metis-remote", "metis-run": "build"},
"annotations": vaultRuntimeAnnotations(true), "annotations": vaultRuntimeAnnotations(true, nodeHostname),
}, },
"spec": map[string]any{ "spec": map[string]any{
"restartPolicy": "Never", "restartPolicy": "Never",
@ -283,6 +285,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag
), ),
}, },
"securityContext": map[string]any{"runAsUser": 0, "runAsGroup": 0}, "securityContext": map[string]any{"runAsUser": 0, "runAsGroup": 0},
"env": desiredEnv,
"envFrom": []map[string]any{ "envFrom": []map[string]any{
{"configMapRef": map[string]any{"name": "metis"}}, {"configMapRef": map[string]any{"name": "metis"}},
}, },
@ -309,7 +312,7 @@ func (a *App) remoteFlashPodSpec(name, host, image, node, device, artifactRef st
"name": name, "name": name,
"namespace": a.settings.Namespace, "namespace": a.settings.Namespace,
"labels": map[string]string{"app": "metis-remote", "metis-run": "flash"}, "labels": map[string]string{"app": "metis-remote", "metis-run": "flash"},
"annotations": vaultRuntimeAnnotations(false), "annotations": vaultRuntimeAnnotations(false, ""),
}, },
"spec": map[string]any{ "spec": map[string]any{
"restartPolicy": "Never", "restartPolicy": "Never",
@ -378,7 +381,46 @@ func mountedHostTmpDir(path string) string {
return "/host-tmp" return "/host-tmp"
} }
func vaultRuntimeAnnotations(includeSSHKeys bool) map[string]string { func remoteDesiredMetadataEnv(a *App, node string) []map[string]any {
desired, ok := a.desiredMetadataForNode(node)
if !ok {
return nil
}
labelsJSON, _ := jsonMarshalStringMap(desired.Labels)
taintsJSON, _ := jsonMarshalStringSlice(desired.Taints)
env := []map[string]any{}
if labelsJSON != "" {
env = append(env, map[string]any{"name": "METIS_NODE_LABELS_JSON", "value": labelsJSON})
}
if taintsJSON != "" {
env = append(env, map[string]any{"name": "METIS_NODE_TAINTS_JSON", "value": taintsJSON})
}
return env
}
// jsonMarshalStringMap renders a string map as compact JSON, or "" for an
// empty/nil map so callers can skip emitting the value entirely.
func jsonMarshalStringMap(values map[string]string) (string, error) {
	if len(values) == 0 {
		return "", nil
	}
	encoded, err := json.Marshal(values)
	if err != nil {
		return "", err
	}
	return string(encoded), nil
}
// jsonMarshalStringSlice renders a string slice as compact JSON, or "" for
// an empty/nil slice so callers can skip emitting the value entirely.
func jsonMarshalStringSlice(values []string) (string, error) {
	if len(values) == 0 {
		return "", nil
	}
	encoded, err := json.Marshal(values)
	if err != nil {
		return "", err
	}
	return string(encoded), nil
}
func vaultRuntimeAnnotations(includeSSHKeys bool, nodeHostname string) map[string]string {
annotations := map[string]string{ annotations := map[string]string{
"vault.hashicorp.com/agent-inject": "true", "vault.hashicorp.com/agent-inject": "true",
"vault.hashicorp.com/agent-pre-populate-only": "true", "vault.hashicorp.com/agent-pre-populate-only": "true",
@ -399,6 +441,19 @@ export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}" export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}" export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}" export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
{{ end }}`
}
nodeHostname = strings.TrimSpace(nodeHostname)
if nodeHostname != "" {
secretPath := fmt.Sprintf("secret/data/nodes/%s", nodeHostname)
annotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] = secretPath
annotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"] = `{{ with secret "` + secretPath + `" }}
export METIS_NODE_SSH_PASSWORD="{{ .Data.data.ssh_password }}"
export METIS_NODE_SSH_PASSWORD_HASH="{{ .Data.data.ssh_password_hash }}"
export METIS_NODE_ATLAS_PASSWORD="{{ .Data.data.atlas_password }}"
export METIS_NODE_ATLAS_PASSWORD_HASH="{{ .Data.data.atlas_password_hash }}"
export METIS_NODE_ROOT_PASSWORD="{{ .Data.data.root_password }}"
export METIS_NODE_ROOT_PASSWORD_HASH="{{ .Data.data.root_password_hash }}"
{{ end }}` {{ end }}`
} }
return annotations return annotations
@ -413,6 +468,7 @@ func remoteWorkerEntrypoint(includeSSHKeys bool, args ...string) string {
if includeSSHKeys { if includeSSHKeys {
lines = append(lines, ". /vault/secrets/metis-ssh-env.sh") lines = append(lines, ". /vault/secrets/metis-ssh-env.sh")
} }
lines = append(lines, "if [ -f /vault/secrets/metis-node-secrets-env.sh ]; then . /vault/secrets/metis-node-secrets-env.sh; fi")
lines = append(lines, "exec "+shellJoin(append([]string{"metis"}, args...)...)) lines = append(lines, "exec "+shellJoin(append([]string{"metis"}, args...)...))
return strings.Join(lines, "\n") return strings.Join(lines, "\n")
} }

View File

@ -251,8 +251,13 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) {
app := newTestApp(t) app := newTestApp(t)
app.settings.RemoteWorkspaceDir = "/var/tmp/metis-workspace" app.settings.RemoteWorkspaceDir = "/var/tmp/metis-workspace"
app.settings.HostTmpDir = "/var/tmp/metis-flash-test" app.settings.HostTmpDir = "/var/tmp/metis-flash-test"
app.desiredMetadata["titan-10"] = DesiredNodeMetadata{
Node: "titan-10",
Labels: map[string]string{"hardware": "rpi5"},
Taints: []string{"dedicated=recovery:NoSchedule"},
}
buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "registry.example/metis/titan-10", "build-1") buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "titan-10", "registry.example/metis/titan-10", "build-1")
buildBody := buildSpec["spec"].(map[string]any) buildBody := buildSpec["spec"].(map[string]any)
buildVolumes := buildBody["volumes"].([]map[string]any) buildVolumes := buildBody["volumes"].([]map[string]any)
workspaceVolume := buildVolumes[0]["hostPath"].(map[string]any) workspaceVolume := buildVolumes[0]["hostPath"].(map[string]any)
@ -260,6 +265,17 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) {
t.Fatalf("build workspace hostPath = %v", got) t.Fatalf("build workspace hostPath = %v", got)
} }
buildContainer := buildBody["containers"].([]map[string]any)[0] buildContainer := buildBody["containers"].([]map[string]any)[0]
buildEnv := buildContainer["env"].([]map[string]any)
if len(buildEnv) != 2 {
t.Fatalf("expected desired metadata env, got %#v", buildEnv)
}
metadataAnnotations := buildSpec["metadata"].(map[string]any)["annotations"].(map[string]string)
if metadataAnnotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] != "secret/data/nodes/titan-10" {
t.Fatalf("unexpected node secret annotation: %#v", metadataAnnotations)
}
if !strings.Contains(metadataAnnotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"], "METIS_NODE_ROOT_PASSWORD") {
t.Fatalf("expected node password exports in vault template: %#v", metadataAnnotations)
}
buildSecurity := buildContainer["securityContext"].(map[string]any) buildSecurity := buildContainer["securityContext"].(map[string]any)
if got := buildSecurity["runAsUser"]; got != 0 { if got := buildSecurity["runAsUser"]; got != 0 {
t.Fatalf("build runAsUser = %v", got) t.Fatalf("build runAsUser = %v", got)

View File

@ -15,7 +15,7 @@ func TestMountedHostTmpDirMapsConfiguredTmpPathIntoMount(t *testing.T) {
} }
func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) { func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) {
withKeys := vaultRuntimeAnnotations(true) withKeys := vaultRuntimeAnnotations(true, "titan-15")
template := withKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"] template := withKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]
if !strings.Contains(template, "METIS_SSH_KEY_HECATE_TETHYS") { if !strings.Contains(template, "METIS_SSH_KEY_HECATE_TETHYS") {
t.Fatalf("expected tethys hecate key export in vault template: %q", template) t.Fatalf("expected tethys hecate key export in vault template: %q", template)
@ -24,7 +24,7 @@ func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) {
t.Fatalf("expected db hecate key export in vault template: %q", template) t.Fatalf("expected db hecate key export in vault template: %q", template)
} }
withoutKeys := vaultRuntimeAnnotations(false) withoutKeys := vaultRuntimeAnnotations(false, "")
if _, ok := withoutKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]; ok { if _, ok := withoutKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]; ok {
t.Fatalf("did not expect ssh key template when includeSSHKeys=false") t.Fatalf("did not expect ssh key template when includeSSHKeys=false")
} }

View File

@ -11,30 +11,31 @@ var hostNameLookup = os.Hostname
// Settings configures the Metis service runtime. // Settings configures the Metis service runtime.
type Settings struct { type Settings struct {
BindAddr string BindAddr string
InventoryPath string InventoryPath string
CacheDir string CacheDir string
ArtifactDir string ArtifactDir string
ArtifactStatePath string ArtifactStatePath string
HistoryPath string HistoryPath string
SnapshotsPath string SnapshotsPath string
TargetsPath string TargetsPath string
DefaultFlashHost string DesiredMetadataPath string
FlashHosts []string DefaultFlashHost string
LocalHost string FlashHosts []string
AllowedGroups []string LocalHost string
MaxDeviceBytes int64 AllowedGroups []string
Namespace string MaxDeviceBytes int64
RunnerImageAMD64 string Namespace string
RunnerImageARM64 string RunnerImageAMD64 string
HarborRegistry string RunnerImageARM64 string
HarborProject string HarborRegistry string
HarborAPIBase string HarborProject string
HarborUsername string HarborAPIBase string
HarborPassword string HarborUsername string
HostTmpDir string HarborPassword string
RemoteWorkspaceDir string HostTmpDir string
RemotePodTimeout int64 RemoteWorkspaceDir string
RemotePodTimeout int64
} }
// FromEnv builds service settings with sensible defaults for local dev and in-cluster use. // FromEnv builds service settings with sensible defaults for local dev and in-cluster use.
@ -44,30 +45,31 @@ func FromEnv() Settings {
defaultFlashHost := getenvDefault("METIS_DEFAULT_FLASH_HOST", localHost) defaultFlashHost := getenvDefault("METIS_DEFAULT_FLASH_HOST", localHost)
flashHosts := splitList(getenvDefault("METIS_FLASH_HOSTS", defaultFlashHost)) flashHosts := splitList(getenvDefault("METIS_FLASH_HOSTS", defaultFlashHost))
return Settings{ return Settings{
BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"), BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"),
InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"), InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"),
CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")), CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")),
ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")), ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")),
ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")), ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")),
HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")), HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")),
SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")), SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")),
TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")), TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")),
DefaultFlashHost: defaultFlashHost, DesiredMetadataPath: getenvDefault("METIS_DESIRED_METADATA_PATH", filepath.Join(dataDir, "desired-node-metadata.json")),
FlashHosts: flashHosts, DefaultFlashHost: defaultFlashHost,
LocalHost: localHost, FlashHosts: flashHosts,
AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintenance")), LocalHost: localHost,
MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000), AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintenance")),
Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"), MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000),
RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""), Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"),
RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""), RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""),
HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""),
HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"), HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"),
HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"), HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"),
HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""), HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"),
HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""), HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""),
HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/var/tmp/metis-flash-test"), HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""),
RemoteWorkspaceDir: getenvDefault("METIS_REMOTE_WORKSPACE_DIR", "/var/tmp/metis-workspace"), HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/var/tmp/metis-flash-test"),
RemotePodTimeout: getenvInt64("METIS_REMOTE_POD_TIMEOUT_SEC", 1800), RemoteWorkspaceDir: getenvDefault("METIS_REMOTE_WORKSPACE_DIR", "/var/tmp/metis-workspace"),
RemotePodTimeout: getenvInt64("METIS_REMOTE_POD_TIMEOUT_SEC", 1800),
} }
} }