recovery(metis): restore node identity on rebuilt images

This commit is contained in:
codex 2026-04-24 16:57:34 -03:00
parent ebaa367efd
commit 17069e4677
19 changed files with 1612 additions and 144 deletions

View File

@ -154,6 +154,10 @@ if [ -s "${sudoers_file}" ]; then
fi
fi
if [ -x /usr/local/sbin/metis-apply-node-identity.sh ]; then
/usr/local/sbin/metis-apply-node-identity.sh || true
fi
rm -f /root/.not_logged_in_yet
if ! command -v k3s >/dev/null 2>&1; then

View File

@ -2,8 +2,6 @@ package plan
import (
"bytes"
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
@ -64,6 +62,7 @@ func Files(inv *inventory.Inventory, nodeName string) ([]inject.FileSpec, error)
cfg.Secrets = sec.Extra
}
}
applyNodeMetadataEnv(cfg)
files, err := buildFiles(cfg, sec)
if err != nil {
return nil, err
@ -111,7 +110,9 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
{Path: "etc/hostname", Content: []byte(cfg.Hostname + "\n"), Mode: 0o644, RootFS: true},
{Path: "etc/hosts", Content: []byte(hostsContent(cfg.Hostname)), Mode: 0o644, RootFS: true},
{Path: "etc/rancher/k3s/config.yaml", Content: []byte(k3sConfigContent(cfg)), Mode: 0o644, RootFS: true},
{Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg)), Mode: 0o600, RootFS: true},
{Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg, sec)), Mode: 0o600, RootFS: true},
{Path: "usr/local/sbin/metis-apply-node-identity.sh", Content: []byte(nodeIdentityScriptContent()), Mode: 0o755, RootFS: true},
{Path: "etc/cloud/cloud.cfg.d/90-metis-recovery.cfg", Content: []byte(cloudInitRootFSContent(sec)), Mode: 0o644, RootFS: true},
}
if cfg.IP != "" {
files = append(files, inject.FileSpec{
@ -148,6 +149,14 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
RootFS: true,
})
}
if passwordAuth := sshPasswordConfigContent(sec); passwordAuth != "" {
files = append(files, inject.FileSpec{
Path: "etc/ssh/sshd_config.d/90-metis-password-auth.conf",
Content: []byte(passwordAuth),
Mode: 0o644,
RootFS: true,
})
}
if cfg.SSHUser == "atlas" {
sudoers := hecateSudoersContent(cfg.SSHUser)
files = append(files, inject.FileSpec{
@ -172,8 +181,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
})
}
// Store the raw config for debugging/ops.
raw, err := json.MarshalIndent(cfg, "", " ")
raw, err := jsonMarshalIndent(cfg)
if err != nil {
return nil, err
}
@ -184,7 +192,7 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
RootFS: true,
})
if sec != nil {
secRaw, err := json.MarshalIndent(sec, "", " ")
secRaw, err := jsonMarshalIndent(redactedSecretsForImage(sec))
if err != nil {
return nil, err
}
@ -196,7 +204,6 @@ func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.File
})
}
// Optional cloud-init for images that honor NoCloud.
userData := cloudInitUserData(cfg, sec)
if userData != "" {
files = append(files, inject.FileSpec{
@ -267,33 +274,6 @@ func allowK3sNodeLabel(role, key string) bool {
return !strings.HasPrefix(key, "node-role.kubernetes.io/")
}
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
if cfg == nil {
return ""
}
if sec != nil && sec.CloudInit != "" {
return sec.CloudInit
}
var b bytes.Buffer
b.WriteString("#cloud-config\n")
b.WriteString(fmt.Sprintf("hostname: %s\n", cfg.Hostname))
if len(cfg.SSHKeys) > 0 {
b.WriteString("ssh_authorized_keys:\n")
for _, k := range cfg.SSHKeys {
b.WriteString(fmt.Sprintf(" - %s\n", k))
}
}
return b.String()
}
func firstbootEnvContent(cfg *config.NodeConfig) string {
var b bytes.Buffer
b.WriteString(fmt.Sprintf("METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname)))
b.WriteString(fmt.Sprintf("METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser)))
b.WriteString(fmt.Sprintf("METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version)))
return b.String()
}
func networkManagerConnectionContent(id, iface, ip string) string {
gateway := ip
if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 {
@ -347,7 +327,6 @@ func fstabAppendContent(cfg *config.NodeConfig) string {
source := entry.Source
switch {
case source != "":
// Use the explicit source path for bind mounts.
case entry.UUID != "":
source = "UUID=" + entry.UUID
case entry.Label != "":
@ -374,25 +353,6 @@ func hecateSudoersContent(user string) string {
)
}
func shellQuote(value string) string {
if value == "" {
return "''"
}
return "'" + strings.ReplaceAll(value, "'", `'"'"'`) + "'"
}
func fetchSecrets(hostname string) *secrets.NodeSecrets {
if os.Getenv("VAULT_ADDR") == "" {
return nil
}
cli := secrets.NewFromEnv()
sec, err := cli.FetchNode(context.Background(), hostname)
if err != nil {
return nil
}
return sec
}
func collectOverlays(class *inventory.NodeClass) ([]inject.FileSpec, error) {
var files []inject.FileSpec
if class == nil {

View File

@ -174,3 +174,80 @@ func TestBuildFilesAddsHecateSudoersForAtlas(t *testing.T) {
t.Fatalf("metis sudoers backup missing/incorrect: %s", backup)
}
}
// TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets verifies that buildFiles
// emits every password-recovery artifact (firstboot env, sshd drop-in, identity
// script, cloud-init snippets) and that etc/metis/secrets.json carries only
// redacted debug metadata, never the secret values themselves.
func TestBuildFilesAddsPasswordArtifactsAndRedactsSecrets(t *testing.T) {
	nodeCfg := &config.NodeConfig{
		Hostname: "titan-15",
		IP:       "192.168.22.43",
		SSHUser:  "atlas",
		SSHKeys:  []string{"ssh-ed25519 AAA test"},
		K3s: config.K3sConfig{
			Role:    "agent",
			Version: "v1.31.5+k3s1",
		},
	}
	nodeSecrets := &secrets.NodeSecrets{
		SSHPassword:  "atlas-pass",
		RootPassword: "root-pass",
		K3sToken:     "super-secret-token",
		Extra:        map[string]string{"api_key": "secret"},
	}

	specs, err := buildFiles(nodeCfg, nodeSecrets)
	if err != nil {
		t.Fatalf("buildFiles: %v", err)
	}
	rendered := make(map[string]string, len(specs))
	for i := range specs {
		rendered[specs[i].Path] = string(specs[i].Content)
	}

	// containsAll reports whether every needle occurs in text.
	containsAll := func(text string, needles ...string) bool {
		for _, needle := range needles {
			if !strings.Contains(text, needle) {
				return false
			}
		}
		return true
	}

	// Artifacts that must exist and mention the listed fragments, checked in
	// the same order as the individual assertions they replace.
	required := []struct {
		path    string
		failure string
		needles []string
	}{
		{
			path:    "etc/metis/firstboot.env",
			failure: "firstboot env missing password material: %s",
			needles: []string{"METIS_ATLAS_PASSWORD='atlas-pass'", "METIS_ROOT_PASSWORD='root-pass'"},
		},
		{
			path:    "etc/ssh/sshd_config.d/90-metis-password-auth.conf",
			failure: "password auth config missing: %s",
			needles: []string{"PasswordAuthentication yes", "PermitRootLogin yes"},
		},
		{
			path:    "usr/local/sbin/metis-apply-node-identity.sh",
			failure: "node identity script missing password application: %s",
			needles: []string{"apply_password root", "METIS_ATLAS_PASSWORD"},
		},
		{
			path:    "etc/cloud/cloud.cfg.d/90-metis-recovery.cfg",
			failure: "cloud recovery config missing ssh_pwauth: %s",
			needles: []string{"ssh_pwauth: true"},
		},
		{
			path:    "user-data",
			failure: "cloud-init user-data missing recovery hooks: %s",
			needles: []string{"ssh_pwauth: true", "metis-apply-node-identity.sh"},
		},
	}
	for _, check := range required {
		if content := rendered[check.path]; !containsAll(content, check.needles...) {
			t.Fatalf(check.failure, content)
		}
	}

	// The on-image secrets dump must not leak any secret value...
	secretsJSON := rendered["etc/metis/secrets.json"]
	for _, leaked := range []string{"atlas-pass", "root-pass", "super-secret-token"} {
		if strings.Contains(secretsJSON, leaked) {
			t.Fatalf("secrets.json should be redacted: %s", secretsJSON)
		}
	}
	// ...but must still expose the redacted presence flags and key names.
	if !containsAll(secretsJSON, `"has_ssh_password": true`, `"extra_keys": [`) {
		t.Fatalf("secrets.json should keep redacted debug metadata: %s", secretsJSON)
	}
}
// TestApplyNodeMetadataEnv checks that METIS_NODE_LABELS_JSON and
// METIS_NODE_TAINTS_JSON are merged into an existing node config, and that
// malformed JSON in either variable leaves a fresh config untouched.
func TestApplyNodeMetadataEnv(t *testing.T) {
cfg := &config.NodeConfig{
Labels: map[string]string{"hardware": "rpi4"},
Taints: []string{"flash=true:NoSchedule"},
K3s: config.K3sConfig{
Labels: map[string]string{"hardware": "rpi4"},
Taints: []string{"flash=true:NoSchedule"},
},
}
// Env labels override the existing "hardware" value and add a new key; the
// env taints repeat one existing taint and add one new taint.
t.Setenv("METIS_NODE_LABELS_JSON", `{"hardware":"rpi5","maintenance.bstein.dev/role":"recovery"}`)
t.Setenv("METIS_NODE_TAINTS_JSON", `["dedicated=recovery:NoSchedule","flash=true:NoSchedule"]`)
applyNodeMetadataEnv(cfg)
if cfg.Labels["hardware"] != "rpi5" || cfg.Labels["maintenance.bstein.dev/role"] != "recovery" {
t.Fatalf("applyNodeMetadataEnv labels = %#v", cfg.Labels)
}
if !strings.Contains(strings.Join(cfg.Taints, ","), "dedicated=recovery:NoSchedule") {
t.Fatalf("applyNodeMetadataEnv taints = %#v", cfg.Taints)
}
// Second phase: start from an empty config and feed invalid JSON; nothing
// may be allocated on the config.
cfg = &config.NodeConfig{}
t.Setenv("METIS_NODE_LABELS_JSON", `{bad-json`)
t.Setenv("METIS_NODE_TAINTS_JSON", `{bad-json`)
applyNodeMetadataEnv(cfg)
if cfg.Labels != nil || cfg.Taints != nil {
t.Fatalf("invalid env JSON should be ignored: %#v", cfg)
}
}

262
pkg/plan/node_identity.go Normal file
View File

@ -0,0 +1,262 @@
package plan
import (
"bytes"
"fmt"
"sort"
"strings"
"metis/pkg/config"
"metis/pkg/secrets"
)
// cloudInitUserData renders the cloud-init user-data document for a node.
//
// It returns "" when cfg is nil. When the node secrets carry a non-empty
// CloudInit override, that override is returned verbatim. Otherwise a
// "#cloud-config" document is generated containing the hostname, any SSH
// authorized keys, password-login switches (only when password material is
// present), and a runcmd entry that invokes the node identity script.
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
	switch {
	case cfg == nil:
		return ""
	case sec != nil && sec.CloudInit != "":
		// An explicit override from the secret store wins over generation.
		return sec.CloudInit
	}

	var doc bytes.Buffer
	doc.WriteString("#cloud-config\n")
	fmt.Fprintf(&doc, "hostname: %s\n", cfg.Hostname)
	if len(cfg.SSHKeys) > 0 {
		doc.WriteString("ssh_authorized_keys:\n")
		for i := range cfg.SSHKeys {
			fmt.Fprintf(&doc, " - %s\n", cfg.SSHKeys[i])
		}
	}
	if hasNodePasswords(sec) {
		doc.WriteString("ssh_pwauth: true\n")
		doc.WriteString("disable_root: false\n")
	}
	doc.WriteString("runcmd:\n")
	doc.WriteString(" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n")
	return doc.String()
}
// firstbootEnvContent renders the shell-sourceable etc/metis/firstboot.env file.
//
// The hostname, SSH user, fixed atlas user name and k3s version are always
// written. When sec is non-nil, each piece of password material (atlas/root,
// plaintext/hash) is appended only if it is non-empty after trimming. Every
// dynamic value is single-quoted via shellQuote.
func firstbootEnvContent(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
	var env bytes.Buffer
	writeVar := func(name, raw string) {
		fmt.Fprintf(&env, "%s=%s\n", name, shellQuote(raw))
	}

	writeVar("METIS_HOSTNAME", cfg.Hostname)
	writeVar("METIS_SSH_USER", cfg.SSHUser)
	env.WriteString("METIS_ATLAS_USER='atlas'\n")
	writeVar("METIS_K3S_VERSION", cfg.K3s.Version)
	if sec == nil {
		return env.String()
	}

	// Optional credentials, emitted in a fixed order and skipped when blank.
	optional := []struct{ name, value string }{
		{"METIS_ATLAS_PASSWORD", effectiveAtlasPassword(sec)},
		{"METIS_ATLAS_PASSWORD_HASH", effectiveAtlasPasswordHash(sec)},
		{"METIS_ROOT_PASSWORD", strings.TrimSpace(sec.RootPassword)},
		{"METIS_ROOT_PASSWORD_HASH", strings.TrimSpace(sec.RootPasswordHash)},
	}
	for _, entry := range optional {
		if entry.value != "" {
			writeVar(entry.name, entry.value)
		}
	}
	return env.String()
}
// cloudInitRootFSContent renders the cloud-init drop-in written to
// etc/cloud/cloud.cfg.d on the root filesystem. It enables SSH password login
// and root login only when sec carries password material, and always adds a
// runcmd entry that invokes the node identity script.
func cloudInitRootFSContent(sec *secrets.NodeSecrets) string {
	lines := []string{"#cloud-config\n"}
	if hasNodePasswords(sec) {
		lines = append(lines, "ssh_pwauth: true\n", "disable_root: false\n")
	}
	lines = append(lines,
		"runcmd:\n",
		" - [bash, -lc, \"/usr/local/sbin/metis-apply-node-identity.sh\"]\n",
	)

	var doc bytes.Buffer
	for _, line := range lines {
		doc.WriteString(line)
	}
	return doc.String()
}
// nodeIdentityScriptContent returns the bash script that buildFiles installs at
// usr/local/sbin/metis-apply-node-identity.sh.
//
// What the embedded script does, in order:
//   - exits immediately if /var/lib/metis/node-identity-applied.done exists;
//   - sources /etc/metis/firstboot.env when present to pick up the METIS_*
//     user names and password material;
//   - creates the atlas user (and a distinct non-root SSH user, if configured),
//     adding them to whichever of the default groups exist on the system;
//   - sets passwords for root, the atlas user and the SSH user, preferring a
//     pre-computed hash (usermod -p) over a plaintext password (chpasswd);
//   - copies /etc/metis/authorized_keys into each account's ~/.ssh;
//   - installs /etc/metis/sudoers-hecate as /etc/sudoers.d/90-hecate-atlas and
//     removes it again if visudo rejects it;
//   - restarts the SSH service (best effort) and writes the marker file.
//
// The script runs with "set -euo pipefail", so a failing unguarded command
// aborts before the marker is written and the script will run again next time.
//
// NOTE(review): install_keys chowns to "user:user", which presumes a group
// named after the user exists; the failure is swallowed by "|| true" — confirm
// this holds for accounts that already exist on the base image.
// NOTE(review): nothing in this script removes plaintext passwords from
// firstboot.env after they have been applied — confirm that is intentional.
func nodeIdentityScriptContent() string {
return `#!/usr/bin/env bash
set -euo pipefail
marker="/var/lib/metis/node-identity-applied.done"
env_file="/etc/metis/firstboot.env"
key_file="/etc/metis/authorized_keys"
sudoers_file="/etc/metis/sudoers-hecate"
default_groups=(adm sudo tty disk dialout audio video plugdev games users systemd-journal input render netdev)
if [ -f "${marker}" ]; then
exit 0
fi
mkdir -p /var/lib/metis
if [ -f "${env_file}" ]; then
# shellcheck disable=SC1090
. "${env_file}"
fi
atlas_user="${METIS_ATLAS_USER:-atlas}"
ssh_user="${METIS_SSH_USER:-${atlas_user}}"
atlas_password="${METIS_ATLAS_PASSWORD:-}"
atlas_password_hash="${METIS_ATLAS_PASSWORD_HASH:-}"
root_password="${METIS_ROOT_PASSWORD:-}"
root_password_hash="${METIS_ROOT_PASSWORD_HASH:-}"
group_list=()
for group_name in "${default_groups[@]}"; do
if getent group "${group_name}" >/dev/null 2>&1; then
group_list+=("${group_name}")
fi
done
if [ "${#group_list[@]}" -gt 0 ]; then
group_csv="$(IFS=,; printf '%s' "${group_list[*]}")"
else
group_csv=""
fi
ensure_user() {
local user_name="$1"
[ -n "${user_name}" ] || return 0
if ! id "${user_name}" >/dev/null 2>&1; then
if [ -n "${group_csv}" ]; then
useradd -m -s /bin/bash -G "${group_csv}" "${user_name}"
else
useradd -m -s /bin/bash "${user_name}"
fi
elif [ -n "${group_csv}" ]; then
usermod -a -G "${group_csv}" "${user_name}" || true
fi
}
apply_password() {
local user_name="$1"
local plain_password="$2"
local hash_password="$3"
if ! id "${user_name}" >/dev/null 2>&1; then
return 0
fi
if [ -n "${hash_password}" ]; then
usermod -p "${hash_password}" "${user_name}"
passwd -u "${user_name}" >/dev/null 2>&1 || true
return 0
fi
if [ -n "${plain_password}" ]; then
printf '%s:%s\n' "${user_name}" "${plain_password}" | chpasswd
passwd -u "${user_name}" >/dev/null 2>&1 || true
fi
}
install_keys() {
local user_name="$1"
[ -n "${user_name}" ] || return 0
[ -s "${key_file}" ] || return 0
local home_dir
home_dir="$(getent passwd "${user_name}" | cut -d: -f6)"
if [ -z "${home_dir}" ]; then
if [ "${user_name}" = "root" ]; then
home_dir="/root"
else
home_dir="/home/${user_name}"
fi
fi
install -d -m 700 "${home_dir}/.ssh"
install -m 600 "${key_file}" "${home_dir}/.ssh/authorized_keys"
chown -R "${user_name}:${user_name}" "${home_dir}/.ssh" 2>/dev/null || true
}
ensure_user "${atlas_user}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
ensure_user "${ssh_user}"
fi
apply_password root "${root_password}" "${root_password_hash}"
apply_password "${atlas_user}" "${atlas_password}" "${atlas_password_hash}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
apply_password "${ssh_user}" "${atlas_password}" "${atlas_password_hash}"
fi
if [ -s "${key_file}" ]; then
install_keys root
install_keys "${atlas_user}"
if [ -n "${ssh_user}" ] && [ "${ssh_user}" != "root" ] && [ "${ssh_user}" != "${atlas_user}" ]; then
install_keys "${ssh_user}"
fi
fi
if [ -s "${sudoers_file}" ]; then
install -d -m 755 /etc/sudoers.d
install -m 440 "${sudoers_file}" /etc/sudoers.d/90-hecate-atlas
if command -v visudo >/dev/null 2>&1; then
visudo -cf /etc/sudoers.d/90-hecate-atlas >/dev/null 2>&1 || rm -f /etc/sudoers.d/90-hecate-atlas
fi
fi
systemctl restart ssh.service >/dev/null 2>&1 || systemctl restart sshd.service >/dev/null 2>&1 || systemctl restart ssh.socket >/dev/null 2>&1 || true
touch "${marker}"
`
}
// sshPasswordConfigContent returns the sshd_config drop-in that enables
// password and root logins. It returns "" when sec carries no password
// material, in which case the caller writes no drop-in at all.
func sshPasswordConfigContent(sec *secrets.NodeSecrets) string {
	if hasNodePasswords(sec) {
		return "PasswordAuthentication yes\nKbdInteractiveAuthentication no\nChallengeResponseAuthentication no\nPermitRootLogin yes\nUsePAM yes\n"
	}
	return ""
}
// hasNodePasswords reports whether sec holds any usable password material:
// an atlas (or fallback SSH) password or hash, or a root password or hash.
// A nil sec never counts.
func hasNodePasswords(sec *secrets.NodeSecrets) bool {
	switch {
	case sec == nil:
		return false
	case effectiveAtlasPassword(sec) != "", effectiveAtlasPasswordHash(sec) != "":
		return true
	default:
		return firstNonEmptyString(sec.RootPassword, sec.RootPasswordHash) != ""
	}
}
// effectiveAtlasPassword returns the plaintext password to apply to the atlas
// user: the explicit AtlasPassword if set, otherwise the SSHPassword. The
// result is whitespace-trimmed; nil sec yields "".
func effectiveAtlasPassword(sec *secrets.NodeSecrets) string {
	if sec != nil {
		return firstNonEmptyString(sec.AtlasPassword, sec.SSHPassword)
	}
	return ""
}
// effectiveAtlasPasswordHash returns the password hash to apply to the atlas
// user: the explicit AtlasPasswordHash if set, otherwise the SSHPasswordHash.
// The result is whitespace-trimmed; nil sec yields "".
func effectiveAtlasPasswordHash(sec *secrets.NodeSecrets) string {
	if sec != nil {
		return firstNonEmptyString(sec.AtlasPasswordHash, sec.SSHPasswordHash)
	}
	return ""
}
// firstNonEmptyString returns the first argument that is non-empty after
// trimming surrounding whitespace, in its trimmed form. It returns "" when
// every argument is blank or no arguments are given.
func firstNonEmptyString(values ...string) string {
	for i := range values {
		candidate := strings.TrimSpace(values[i])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
// redactedSecretsForImage builds the debug view of sec that is safe to write
// onto the image. It records only whether each secret is present (has_* flags)
// and, when Extra is non-empty, the sorted list of its non-blank key names
// under "extra_keys". No secret value is ever included. A nil sec yields nil.
func redactedSecretsForImage(sec *secrets.NodeSecrets) map[string]any {
	if sec == nil {
		return nil
	}
	// present reports whether any of the given values is non-blank.
	present := func(values ...string) bool {
		return firstNonEmptyString(values...) != ""
	}
	summary := map[string]any{
		"has_ssh_password":        present(sec.SSHPassword, sec.SSHPasswordHash),
		"has_atlas_password":      present(sec.AtlasPassword, sec.AtlasPasswordHash),
		"has_root_password":       present(sec.RootPassword, sec.RootPasswordHash),
		"has_k3s_token":           present(sec.K3sToken),
		"has_cloud_init_override": present(sec.CloudInit),
	}
	if len(sec.Extra) == 0 {
		return summary
	}

	names := make([]string, 0, len(sec.Extra))
	for name := range sec.Extra {
		if trimmed := strings.TrimSpace(name); trimmed != "" {
			names = append(names, trimmed)
		}
	}
	// Map iteration order is random; sort for a stable file.
	sort.Strings(names)
	summary["extra_keys"] = names
	return summary
}
// shellQuote wraps value in single quotes so a POSIX shell reads it as one
// literal word. Embedded single quotes are emitted as '"'"' (close quote,
// double-quoted quote, reopen quote). The empty string becomes ''.
func shellQuote(value string) string {
	const quote = "'"
	escaped := strings.ReplaceAll(value, quote, `'"'"'`)
	return quote + escaped + quote
}

133
pkg/plan/node_metadata.go Normal file
View File

@ -0,0 +1,133 @@
package plan
import (
"context"
"encoding/json"
"os"
"sort"
"strings"
"metis/pkg/config"
"metis/pkg/secrets"
)
// fetchSecrets resolves the secrets for hostname.
//
// Secrets supplied through METIS_NODE_* environment variables are always
// collected. If VAULT_ADDR is unset, or the Vault fetch fails, those are
// returned on their own (possibly nil). Otherwise the Vault record is used as
// the base and the environment values override it field by field.
func fetchSecrets(hostname string) *secrets.NodeSecrets {
	fromEnv := nodeSecretsFromEnv()
	if addr := os.Getenv("VAULT_ADDR"); addr == "" {
		return fromEnv
	}

	fromVault, err := secrets.NewFromEnv().FetchNode(context.Background(), hostname)
	if err != nil {
		// Best effort: a Vault failure degrades to env-only secrets.
		return fromEnv
	}
	return mergeNodeSecrets(fromVault, fromEnv)
}
// nodeSecretsFromEnv reads password material from the METIS_NODE_*_PASSWORD
// and METIS_NODE_*_PASSWORD_HASH environment variables (ssh, atlas, root),
// trimming whitespace. It returns nil when none of them is set to a
// non-blank value.
func nodeSecretsFromEnv() *secrets.NodeSecrets {
	lookup := func(name string) string {
		return strings.TrimSpace(os.Getenv(name))
	}
	found := secrets.NodeSecrets{
		SSHPassword:       lookup("METIS_NODE_SSH_PASSWORD"),
		SSHPasswordHash:   lookup("METIS_NODE_SSH_PASSWORD_HASH"),
		AtlasPassword:     lookup("METIS_NODE_ATLAS_PASSWORD"),
		AtlasPasswordHash: lookup("METIS_NODE_ATLAS_PASSWORD_HASH"),
		RootPassword:      lookup("METIS_NODE_ROOT_PASSWORD"),
		RootPasswordHash:  lookup("METIS_NODE_ROOT_PASSWORD_HASH"),
	}
	populated := []string{
		found.SSHPassword, found.SSHPasswordHash,
		found.AtlasPassword, found.AtlasPasswordHash,
		found.RootPassword, found.RootPasswordHash,
	}
	for _, value := range populated {
		if value != "" {
			return &found
		}
	}
	return nil
}
// mergeNodeSecrets layers override on top of base and returns the result.
//
// If either argument is nil the other is returned as-is (not copied).
// Otherwise a copy of base is made and each scalar field takes the override's
// value when it is non-blank, falling back to base; chosen values are
// whitespace-trimmed. Extra maps are combined into a fresh map, with override
// keys replacing base keys.
func mergeNodeSecrets(base, override *secrets.NodeSecrets) *secrets.NodeSecrets {
	switch {
	case base == nil:
		return override
	case override == nil:
		return base
	}

	pick := firstNonEmptyString
	out := *base
	out.SSHPassword = pick(override.SSHPassword, base.SSHPassword)
	out.SSHPasswordHash = pick(override.SSHPasswordHash, base.SSHPasswordHash)
	out.AtlasPassword = pick(override.AtlasPassword, base.AtlasPassword)
	out.AtlasPasswordHash = pick(override.AtlasPasswordHash, base.AtlasPasswordHash)
	out.RootPassword = pick(override.RootPassword, base.RootPassword)
	out.RootPasswordHash = pick(override.RootPasswordHash, base.RootPasswordHash)
	out.K3sToken = pick(override.K3sToken, base.K3sToken)
	out.CloudInit = pick(override.CloudInit, base.CloudInit)

	if len(base.Extra)+len(override.Extra) > 0 {
		combined := map[string]string{}
		// Later layers win, so override is applied after base.
		for _, layer := range []map[string]string{base.Extra, override.Extra} {
			for name, value := range layer {
				combined[name] = value
			}
		}
		out.Extra = combined
	}
	return &out
}
// applyNodeMetadataEnv overlays node labels and taints supplied through the
// environment onto cfg. A nil cfg is a no-op.
//
// METIS_NODE_LABELS_JSON is parsed as a JSON object of string values; its
// entries are merged into cfg.Labels (env wins on conflicts) and the merged
// set is then mirrored into cfg.K3s.Labels.
//
// METIS_NODE_TAINTS_JSON is parsed as a JSON array of strings; its entries are
// appended to cfg.Taints, the result is de-duplicated and sorted, and then
// mirrored into cfg.K3s.Taints.
//
// A variable that is unset, blank, invalid JSON, or decodes to an empty
// collection leaves the corresponding fields untouched.
func applyNodeMetadataEnv(cfg *config.NodeConfig) {
	if cfg == nil {
		return
	}
	if labels := parseEnvJSONMap(os.Getenv("METIS_NODE_LABELS_JSON")); len(labels) > 0 {
		if cfg.Labels == nil {
			cfg.Labels = make(map[string]string, len(labels))
		}
		for key, value := range labels {
			cfg.Labels[key] = value
		}
		// Give K3s its own copy, exactly as done for taints below. Sharing one
		// map between cfg.Labels and cfg.K3s.Labels would let a later edit to
		// either field silently change the other.
		k3sLabels := make(map[string]string, len(cfg.Labels))
		for key, value := range cfg.Labels {
			k3sLabels[key] = value
		}
		cfg.K3s.Labels = k3sLabels
	}
	if taints := parseEnvJSONList(os.Getenv("METIS_NODE_TAINTS_JSON")); len(taints) > 0 {
		cfg.Taints = uniqueStrings(append(cfg.Taints, taints...))
		cfg.K3s.Taints = append([]string{}, cfg.Taints...)
	}
}
// parseEnvJSONMap decodes raw as a JSON object with string values. It returns
// nil when raw is blank or is not valid JSON of that shape; decode errors are
// deliberately swallowed so a bad environment value is simply ignored.
func parseEnvJSONMap(raw string) map[string]string {
	trimmed := strings.TrimSpace(raw)
	if len(trimmed) == 0 {
		return nil
	}
	var decoded map[string]string
	if json.Unmarshal([]byte(trimmed), &decoded) != nil {
		return nil
	}
	return decoded
}
// parseEnvJSONList decodes raw as a JSON array of strings. It returns nil when
// raw is blank or is not valid JSON of that shape; decode errors are
// deliberately swallowed so a bad environment value is simply ignored.
func parseEnvJSONList(raw string) []string {
	trimmed := strings.TrimSpace(raw)
	if len(trimmed) == 0 {
		return nil
	}
	var decoded []string
	if json.Unmarshal([]byte(trimmed), &decoded) != nil {
		return nil
	}
	return decoded
}
// uniqueStrings returns the distinct, non-blank entries of values with
// surrounding whitespace removed, sorted in ascending order. The result is
// always a non-nil slice, even for nil or all-blank input.
func uniqueStrings(values []string) []string {
	result := make([]string, 0, len(values))
	known := map[string]struct{}{}
	for i := range values {
		item := strings.TrimSpace(values[i])
		if item == "" {
			continue
		}
		if _, dup := known[item]; dup {
			continue
		}
		known[item] = struct{}{}
		result = append(result, item)
	}
	sort.Strings(result)
	return result
}
// jsonMarshalIndent encodes value as indented JSON using the package's single
// shared prefix/indent setting, so every debug file written onto the image is
// formatted the same way.
func jsonMarshalIndent(value any) ([]byte, error) {
	encoded, err := json.MarshalIndent(value, "", " ")
	if err != nil {
		return nil, err
	}
	return encoded, nil
}

View File

@ -0,0 +1,127 @@
package plan
import (
"reflect"
"strings"
"testing"
"metis/pkg/config"
"metis/pkg/secrets"
)
// TestNodeSecretHelpers exercises the small secret helpers: nil handling and
// the SSH-to-atlas fallback in effectiveAtlasPassword/effectiveAtlasPasswordHash,
// trimming in firstNonEmptyString, hasNodePasswords, and the sorted extra_keys
// produced by redactedSecretsForImage.
func TestNodeSecretHelpers(t *testing.T) {
if got := effectiveAtlasPassword(nil); got != "" {
t.Fatalf("effectiveAtlasPassword(nil) = %q", got)
}
if got := effectiveAtlasPasswordHash(nil); got != "" {
t.Fatalf("effectiveAtlasPasswordHash(nil) = %q", got)
}
// Only the SSH fields are set, so the atlas helpers must fall back to them.
sec := &secrets.NodeSecrets{SSHPassword: "ssh-pass", SSHPasswordHash: "$ssh$hash"}
if got := effectiveAtlasPassword(sec); got != "ssh-pass" {
t.Fatalf("effectiveAtlasPassword fallback = %q", got)
}
if got := effectiveAtlasPasswordHash(sec); got != "$ssh$hash" {
t.Fatalf("effectiveAtlasPasswordHash fallback = %q", got)
}
// Once explicit atlas values exist they take precedence over the SSH ones.
sec.AtlasPassword = "atlas-pass"
sec.AtlasPasswordHash = "$atlas$hash"
if got := effectiveAtlasPassword(sec); got != "atlas-pass" {
t.Fatalf("effectiveAtlasPassword explicit = %q", got)
}
if got := effectiveAtlasPasswordHash(sec); got != "$atlas$hash" {
t.Fatalf("effectiveAtlasPasswordHash explicit = %q", got)
}
if got := firstNonEmptyString("", " value ", "ignored"); got != "value" {
t.Fatalf("firstNonEmptyString = %q", got)
}
if !hasNodePasswords(&secrets.NodeSecrets{RootPasswordHash: "$root$hash"}) {
t.Fatal("expected root password hash to count as password material")
}
if hasNodePasswords(&secrets.NodeSecrets{}) {
t.Fatal("empty node secrets should not count as password material")
}
// Keys are supplied out of order to prove the output is sorted.
debug := redactedSecretsForImage(&secrets.NodeSecrets{Extra: map[string]string{"b": "2", "a": "1"}})
if !reflect.DeepEqual(debug["extra_keys"], []string{"a", "b"}) {
t.Fatalf("redactedSecretsForImage extra_keys = %#v", debug)
}
}
// TestNodeSecretsFromEnvAndMergeNodeSecrets covers reading password material
// from the METIS_NODE_* environment variables and the override/base merge
// rules of mergeNodeSecrets, including its nil-argument shortcuts.
func TestNodeSecretsFromEnvAndMergeNodeSecrets(t *testing.T) {
t.Setenv("METIS_NODE_SSH_PASSWORD", "ssh-pass")
t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "$ssh$hash")
t.Setenv("METIS_NODE_ATLAS_PASSWORD", "atlas-pass")
t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "$atlas$hash")
t.Setenv("METIS_NODE_ROOT_PASSWORD", "root-pass")
t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "$root$hash")
envSecrets := nodeSecretsFromEnv()
if envSecrets == nil || envSecrets.RootPassword != "root-pass" || envSecrets.AtlasPasswordHash != "$atlas$hash" {
t.Fatalf("nodeSecretsFromEnv = %#v", envSecrets)
}
// First argument is the base, second the override: override values win and
// the Extra maps are unioned.
merged := mergeNodeSecrets(&secrets.NodeSecrets{
SSHPassword: "base-ssh",
K3sToken: "base-token",
CloudInit: "base-cloud",
Extra: map[string]string{"base": "1"},
}, &secrets.NodeSecrets{
AtlasPassword: "override-atlas",
RootPassword: "override-root",
K3sToken: "override-token",
CloudInit: "override-cloud",
Extra: map[string]string{"override": "2"},
})
if merged.K3sToken != "override-token" || merged.CloudInit != "override-cloud" || merged.AtlasPassword != "override-atlas" || merged.RootPassword != "override-root" {
t.Fatalf("mergeNodeSecrets = %#v", merged)
}
if merged.Extra["base"] != "1" || merged.Extra["override"] != "2" {
t.Fatalf("mergeNodeSecrets extras = %#v", merged.Extra)
}
// A nil side simply yields the other side.
if got := mergeNodeSecrets(nil, envSecrets); got.RootPasswordHash != "$root$hash" {
t.Fatalf("mergeNodeSecrets nil base = %#v", got)
}
if got := mergeNodeSecrets(envSecrets, nil); got.SSHPassword != "ssh-pass" {
t.Fatalf("mergeNodeSecrets nil override = %#v", got)
}
// Blank every variable again: with nothing set, the env reader must return nil.
t.Setenv("METIS_NODE_SSH_PASSWORD", "")
t.Setenv("METIS_NODE_SSH_PASSWORD_HASH", "")
t.Setenv("METIS_NODE_ATLAS_PASSWORD", "")
t.Setenv("METIS_NODE_ATLAS_PASSWORD_HASH", "")
t.Setenv("METIS_NODE_ROOT_PASSWORD", "")
t.Setenv("METIS_NODE_ROOT_PASSWORD_HASH", "")
if got := nodeSecretsFromEnv(); got != nil {
t.Fatalf("expected empty env secrets to collapse to nil, got %#v", got)
}
}
// TestFirstbootEnvContentIncludesHashes checks that firstbootEnvContent emits
// exactly the base variables plus the atlas and root password hashes when
// only hashes (no plaintext passwords) are supplied.
func TestFirstbootEnvContentIncludesHashes(t *testing.T) {
	nodeCfg := &config.NodeConfig{
		Hostname: "titan-15",
		SSHUser:  "atlas",
		K3s:      config.K3sConfig{Version: "v1.31.5+k3s1"},
	}
	nodeSecrets := &secrets.NodeSecrets{
		AtlasPasswordHash: "$atlas$hash",
		RootPasswordHash:  "$root$hash",
	}
	want := map[string]string{
		"METIS_HOSTNAME":            "'titan-15'",
		"METIS_SSH_USER":            "'atlas'",
		"METIS_ATLAS_USER":          "'atlas'",
		"METIS_K3S_VERSION":         "'v1.31.5+k3s1'",
		"METIS_ATLAS_PASSWORD_HASH": "'$atlas$hash'",
		"METIS_ROOT_PASSWORD_HASH":  "'$root$hash'",
	}

	rendered := firstbootEnvContent(nodeCfg, nodeSecrets)
	if got := parseEnvLines(rendered); !reflect.DeepEqual(got, want) {
		t.Fatalf("firstbootEnvContent = %q", rendered)
	}
}
// parseEnvLines splits raw into KEY=VALUE lines and returns them as a map.
// The split happens at the first "=" only, so values may contain "="; values
// are kept verbatim (including any quoting). Lines without "=" are skipped.
func parseEnvLines(raw string) map[string]string {
	parsed := make(map[string]string)
	body := strings.TrimSpace(raw)
	for _, entry := range strings.Split(body, "\n") {
		name, value, found := strings.Cut(entry, "=")
		if !found {
			continue
		}
		parsed[name] = value
	}
	return parsed
}

View File

@ -15,10 +15,15 @@ import (
// NodeSecrets holds per-node secret material to inject at burn time.
// These should live in Vault at secret/data/nodes/<hostname>.
type NodeSecrets struct {
SSHPassword string `json:"ssh_password,omitempty"`
K3sToken string `json:"k3s_token,omitempty"`
CloudInit string `json:"cloud_init,omitempty"`
Extra map[string]string `json:"extra,omitempty"`
SSHPassword string `json:"ssh_password,omitempty"`
SSHPasswordHash string `json:"ssh_password_hash,omitempty"`
AtlasPassword string `json:"atlas_password,omitempty"`
AtlasPasswordHash string `json:"atlas_password_hash,omitempty"`
RootPassword string `json:"root_password,omitempty"`
RootPasswordHash string `json:"root_password_hash,omitempty"`
K3sToken string `json:"k3s_token,omitempty"`
CloudInit string `json:"cloud_init,omitempty"`
Extra map[string]string `json:"extra,omitempty"`
}
// Client fetches node secrets from Vault using either a token or AppRole.

View File

@ -16,9 +16,11 @@ func TestFetchNodeReturnsData(t *testing.T) {
_ = json.NewEncoder(w).Encode(map[string]any{
"data": map[string]any{
"data": map[string]any{
"ssh_password": "p1",
"k3s_token": "t1",
"cloud_init": "ci",
"ssh_password": "p1",
"atlas_password_hash": "$atlas$hash",
"root_password": "root-pw",
"k3s_token": "t1",
"cloud_init": "ci",
},
},
})
@ -33,7 +35,7 @@ func TestFetchNodeReturnsData(t *testing.T) {
if err != nil {
t.Fatalf("fetch: %v", err)
}
if sec.SSHPassword != "p1" || sec.K3sToken != "t1" || sec.CloudInit != "ci" {
if sec.SSHPassword != "p1" || sec.AtlasPasswordHash != "$atlas$hash" || sec.RootPassword != "root-pw" || sec.K3sToken != "t1" || sec.CloudInit != "ci" {
t.Fatalf("unexpected secrets: %+v", sec)
}
}

View File

@ -117,16 +117,24 @@ type App struct {
inventory *inventory.Inventory
metrics *Metrics
mu sync.RWMutex
jobs map[string]*Job
snapshots map[string]SnapshotRecord
targets map[string]facts.Targets
artifactStore map[string]ArtifactSummary
deviceStore map[string]deviceSnapshot
mu sync.RWMutex
jobs map[string]*Job
snapshots map[string]SnapshotRecord
targets map[string]facts.Targets
artifactStore map[string]ArtifactSummary
deviceStore map[string]deviceSnapshot
desiredMetadata map[string]DesiredNodeMetadata
}
// NewApp creates a Metis service app instance.
func NewApp(settings Settings) (*App, error) {
if strings.TrimSpace(settings.DesiredMetadataPath) == "" {
baseDir := filepath.Dir(settings.SnapshotsPath)
if strings.TrimSpace(baseDir) == "" || baseDir == "." {
baseDir = filepath.Dir(settings.HistoryPath)
}
settings.DesiredMetadataPath = filepath.Join(baseDir, "desired-node-metadata.json")
}
if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil {
return nil, err
}
@ -141,18 +149,20 @@ func NewApp(settings Settings) (*App, error) {
return nil, err
}
app := &App{
settings: settings,
inventory: inv,
metrics: NewMetrics(),
jobs: map[string]*Job{},
snapshots: map[string]SnapshotRecord{},
targets: map[string]facts.Targets{},
artifactStore: map[string]ArtifactSummary{},
deviceStore: map[string]deviceSnapshot{},
settings: settings,
inventory: inv,
metrics: NewMetrics(),
jobs: map[string]*Job{},
snapshots: map[string]SnapshotRecord{},
targets: map[string]facts.Targets{},
artifactStore: map[string]ArtifactSummary{},
deviceStore: map[string]deviceSnapshot{},
desiredMetadata: map[string]DesiredNodeMetadata{},
}
_ = app.loadSnapshots()
_ = app.loadTargets()
_ = app.loadArtifacts()
_ = app.loadDesiredNodeMetadata()
return app, nil
}
@ -289,6 +299,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error {
if err := a.syncScratchAnnotations(record); err != nil {
a.appendEvent(annotationSyncEvent(record.Node, err))
}
if err := a.syncDesiredNodeMetadata(record); err != nil {
a.appendEvent(desiredNodeMetadataSyncEvent(record.Node, err))
}
a.appendEvent(Event{
Time: record.CollectedAt,
Kind: "sentinel.snapshot",

View File

@ -22,6 +22,9 @@ type clusterNode struct {
Worker bool
ControlPlane bool
Unschedulable bool
Labels map[string]string
Annotations map[string]string
Taints []string
USBScratchStatus string
USBScratchManagedPaths string
}
@ -179,6 +182,11 @@ func clusterNodes() []clusterNode {
} `json:"metadata"`
Spec struct {
Unschedulable bool `json:"unschedulable"`
Taints []struct {
Key string `json:"key"`
Value string `json:"value"`
Effect string `json:"effect"`
} `json:"taints"`
} `json:"spec"`
} `json:"items"`
}
@ -189,6 +197,28 @@ func clusterNodes() []clusterNode {
for _, item := range payload.Items {
labels := item.Metadata.Labels
annotations := item.Metadata.Annotations
if labels == nil {
labels = map[string]string{}
}
if annotations == nil {
annotations = map[string]string{}
}
taints := make([]string, 0, len(item.Spec.Taints))
for _, taint := range item.Spec.Taints {
key := strings.TrimSpace(taint.Key)
if key == "" {
continue
}
raw := key
if value := strings.TrimSpace(taint.Value); value != "" {
raw += "=" + value
}
if effect := strings.TrimSpace(taint.Effect); effect != "" {
raw += ":" + effect
}
taints = append(taints, raw)
}
sort.Strings(taints)
nodes = append(nodes, clusterNode{
Name: strings.TrimSpace(item.Metadata.Name),
Arch: strings.TrimSpace(labels["kubernetes.io/arch"]),
@ -196,6 +226,9 @@ func clusterNodes() []clusterNode {
Worker: labels["node-role.kubernetes.io/worker"] == "true",
ControlPlane: labels["node-role.kubernetes.io/control-plane"] != "" || labels["node-role.kubernetes.io/master"] != "",
Unschedulable: item.Spec.Unschedulable,
Labels: labels,
Annotations: annotations,
Taints: taints,
USBScratchStatus: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-status"]),
USBScratchManagedPaths: strings.TrimSpace(annotations["maintenance.bstein.dev/usb-scratch-managed-paths"]),
})

View File

@ -37,6 +37,11 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) {
t.Fatal("expected persistTargets to fail when parent is a file")
}
app.settings.DesiredMetadataPath = filepath.Join(fileParent, "desired-node-metadata.json")
if err := app.persistDesiredNodeMetadata(); err == nil {
t.Fatal("expected persistDesiredNodeMetadata to fail when parent is a file")
}
invalidArtifactState := filepath.Join(t.TempDir(), "artifacts.json")
if err := os.WriteFile(invalidArtifactState, []byte("{bad-json"), 0o644); err != nil {
t.Fatal(err)
@ -45,6 +50,15 @@ func TestServiceArtifactAndSnapshotPersistenceErrorBranches(t *testing.T) {
if err := app.loadArtifacts(); err == nil {
t.Fatal("expected loadArtifacts to reject invalid json")
}
invalidDesiredState := filepath.Join(t.TempDir(), "desired-node-metadata.json")
if err := os.WriteFile(invalidDesiredState, []byte("{bad-json"), 0o644); err != nil {
t.Fatal(err)
}
app.settings.DesiredMetadataPath = invalidDesiredState
if err := app.loadDesiredNodeMetadata(); err == nil {
t.Fatal("expected loadDesiredNodeMetadata to reject invalid json")
}
}
func TestServiceReplacementAndDeviceBranches(t *testing.T) {

View File

@ -162,6 +162,7 @@ nodes:
snapshotsPath := filepath.Join(dir, "snapshots.json")
targetsPath := filepath.Join(dir, "targets.json")
artifactStatePath := filepath.Join(dir, "artifacts.json")
desiredMetadataPath := filepath.Join(dir, "desired-node-metadata.json")
seedSnapshots := map[string]SnapshotRecord{
"titan-15": {
@ -190,19 +191,33 @@ nodes:
if err := os.WriteFile(artifactStatePath, data, 0o644); err != nil {
t.Fatal(err)
}
seedDesiredMetadata := map[string]DesiredNodeMetadata{
"titan-15": {
Node: "titan-15",
Hostname: "titan-15",
CapturedAt: testTime(t),
Labels: map[string]string{"hardware": "rpi5"},
Taints: []string{"dedicated=recovery:NoSchedule"},
},
}
data, _ = json.MarshalIndent(seedDesiredMetadata, "", " ")
if err := os.WriteFile(desiredMetadataPath, data, 0o644); err != nil {
t.Fatal(err)
}
app, err := NewApp(Settings{
InventoryPath: invPath,
CacheDir: filepath.Join(dir, "cache"),
ArtifactDir: filepath.Join(dir, "artifacts"),
ArtifactStatePath: artifactStatePath,
HistoryPath: filepath.Join(dir, "history.jsonl"),
SnapshotsPath: snapshotsPath,
TargetsPath: targetsPath,
DefaultFlashHost: "titan-22",
FlashHosts: []string{"titan-22"},
LocalHost: "titan-22",
AllowedGroups: []string{"admin"},
InventoryPath: invPath,
CacheDir: filepath.Join(dir, "cache"),
ArtifactDir: filepath.Join(dir, "artifacts"),
ArtifactStatePath: artifactStatePath,
HistoryPath: filepath.Join(dir, "history.jsonl"),
SnapshotsPath: snapshotsPath,
TargetsPath: targetsPath,
DesiredMetadataPath: desiredMetadataPath,
DefaultFlashHost: "titan-22",
FlashHosts: []string{"titan-22"},
LocalHost: "titan-22",
AllowedGroups: []string{"admin"},
})
if err != nil {
t.Fatalf("NewApp: %v", err)
@ -211,6 +226,9 @@ nodes:
if got := app.artifacts()["titan-15"].Ref; got != "reg/proj/titan-15:latest" {
t.Fatalf("artifacts() = %q", got)
}
if desired, ok := app.desiredMetadataForNode("titan-15"); !ok || desired.Labels["hardware"] != "rpi5" {
t.Fatalf("desiredMetadataForNode() = %#v ok=%v", desired, ok)
}
if err := app.recordArtifact(ArtifactSummary{Node: "titan-15", Ref: "reg/proj/titan-15:v2"}); err != nil {
t.Fatalf("recordArtifact: %v", err)
}

View File

@ -0,0 +1,483 @@
package service
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
"metis/pkg/config"
)
// DesiredNodeMetadata captures the node identity Metis should preserve through
// recovery builds and re-assert after the node rejoins the cluster.
//
// Records are kept in a map keyed by node name and serialized as JSON to the
// file named by Settings.DesiredMetadataPath.
type DesiredNodeMetadata struct {
	// Node is the node name the record is stored under.
	Node string `json:"node"`
	// Hostname is taken from the inventory node spec when the record is staged.
	Hostname string `json:"hostname,omitempty"`
	// CapturedAt is a UTC timestamp set while staging the record.
	// NOTE(review): encoding/json's omitempty never omits a struct value, so a
	// zero time.Time is still written out.
	CapturedAt time.Time `json:"captured_at,omitempty"`
	// Labels holds only labels that pass isRestorableLabel.
	Labels map[string]string `json:"labels,omitempty"`
	// Annotations holds only annotations that pass isRestorableAnnotation.
	Annotations map[string]string `json:"annotations,omitempty"`
	// Taints are strings in the "key[=value][:effect]" form parsed by splitTaint.
	Taints []string `json:"taints,omitempty"`
	// Unschedulable is compared against, and patched into, the node's
	// spec.unschedulable field.
	Unschedulable bool `json:"unschedulable,omitempty"`
}
// loadDesiredNodeMetadata replaces the in-memory desired-metadata table with
// the JSON document stored at settings.DesiredMetadataPath.
//
// Read and decode errors are returned unchanged and leave the current table
// untouched; the swap happens under the write lock only after a clean decode.
func (a *App) loadDesiredNodeMetadata() error {
	raw, readErr := os.ReadFile(a.settings.DesiredMetadataPath)
	if readErr != nil {
		return readErr
	}

	var loaded map[string]DesiredNodeMetadata
	if decodeErr := json.Unmarshal(raw, &loaded); decodeErr != nil {
		return decodeErr
	}

	a.mu.Lock()
	defer a.mu.Unlock()
	a.desiredMetadata = loaded
	return nil
}
// persistDesiredNodeMetadata writes the whole desired-metadata table as
// indented JSON to settings.DesiredMetadataPath, creating the parent directory
// when needed.
//
// The table is encoded while holding the read lock; the filesystem work runs
// after the lock has been released.
func (a *App) persistDesiredNodeMetadata() error {
	encoded, err := func() ([]byte, error) {
		a.mu.RLock()
		defer a.mu.RUnlock()
		return json.MarshalIndent(a.desiredMetadata, "", " ")
	}()
	if err != nil {
		return err
	}

	target := a.settings.DesiredMetadataPath
	if err = os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
		return err
	}
	return os.WriteFile(target, encoded, 0o644)
}
// desiredMetadataForNode looks up the stored desired metadata for node (after
// trimming surrounding whitespace) and returns a detached copy of it.
//
// The boolean is false when the name is blank or no record exists.
func (a *App) desiredMetadataForNode(node string) (DesiredNodeMetadata, bool) {
	name := strings.TrimSpace(node)
	if name == "" {
		return DesiredNodeMetadata{}, false
	}

	a.mu.RLock()
	defer a.mu.RUnlock()

	if stored, found := a.desiredMetadata[name]; found {
		// Clone so callers cannot mutate the maps/slices held in the table.
		return cloneDesiredNodeMetadata(stored), true
	}
	return DesiredNodeMetadata{}, false
}
// stageDesiredNodeMetadata (re)computes the desired metadata for nodeName,
// stores it in the in-memory table, persists the table to disk and returns a
// detached copy of the new record.
//
// The record is layered from three sources, later ones winning:
//  1. the inventory (hostname from the node spec, restorable labels and taints
//     from the built node config),
//  2. the record already stored for the node, if any,
//  3. the live cluster node, if liveClusterNode finds one.
//
// CapturedAt always reflects this staging run. mergeDesiredNodeMetadata lets a
// non-zero overlay timestamp win, so without re-stamping, the timestamp of the
// previously stored record would be carried forward forever.
//
// An error is returned when the name is blank, the node is not in the
// inventory, the node config cannot be built, or persisting fails.
func (a *App) stageDesiredNodeMetadata(nodeName string) (DesiredNodeMetadata, error) {
	nodeName = strings.TrimSpace(nodeName)
	if nodeName == "" {
		return DesiredNodeMetadata{}, fmt.Errorf("node metadata requires a node name")
	}
	nodeSpec, _, err := a.inventory.FindNode(nodeName)
	if err != nil {
		return DesiredNodeMetadata{}, err
	}
	cfg, err := config.Build(a.inventory, nodeName)
	if err != nil {
		return DesiredNodeMetadata{}, err
	}

	capturedAt := time.Now().UTC()
	desired := DesiredNodeMetadata{
		Node:       nodeName,
		Hostname:   strings.TrimSpace(nodeSpec.Hostname),
		CapturedAt: capturedAt,
		Labels:     filteredRestorableLabels(cfg.Labels),
		Taints:     restorableTaints(cfg.Taints),
	}
	if existing, ok := a.desiredMetadataForNode(nodeName); ok {
		desired = mergeDesiredNodeMetadata(desired, existing)
	}
	if live, ok := liveClusterNode(nodeName); ok {
		desired = mergeDesiredNodeMetadata(desired, desiredMetadataFromCluster(*live))
	}

	// Re-stamp after merging: the stored record's older CapturedAt must not
	// replace the time of this capture.
	desired.CapturedAt = capturedAt
	desired.Labels = normalizeStringMap(desired.Labels)
	desired.Annotations = normalizeStringMap(desired.Annotations)
	desired.Taints = normalizeTaints(desired.Taints)

	a.mu.Lock()
	if a.desiredMetadata == nil {
		a.desiredMetadata = map[string]DesiredNodeMetadata{}
	}
	a.desiredMetadata[nodeName] = desired
	a.mu.Unlock()

	if err := a.persistDesiredNodeMetadata(); err != nil {
		return DesiredNodeMetadata{}, err
	}
	return cloneDesiredNodeMetadata(desired), nil
}
// syncDesiredNodeMetadata re-applies the stored desired metadata for
// record.Node onto the live cluster node.
//
// It is a no-op (nil) when no desired record exists for the node or the node
// is not found in the live cluster; otherwise it returns the result of
// patchDesiredNodeMetadata.
func (a *App) syncDesiredNodeMetadata(record SnapshotRecord) error {
	if desired, known := a.desiredMetadataForNode(record.Node); known {
		if live, present := liveClusterNode(record.Node); present {
			return patchDesiredNodeMetadata(*live, desired)
		}
	}
	return nil
}
// desiredMetadataFromCluster converts a live cluster node into a
// DesiredNodeMetadata record, keeping only restorable labels, annotations and
// taints. Hostname and CapturedAt are left at their zero values.
func desiredMetadataFromCluster(node clusterNode) DesiredNodeMetadata {
	var captured DesiredNodeMetadata
	captured.Node = strings.TrimSpace(node.Name)
	captured.Labels = filteredRestorableLabels(node.Labels)
	captured.Annotations = filteredRestorableAnnotations(node.Annotations)
	captured.Taints = restorableTaints(node.Taints)
	captured.Unschedulable = node.Unschedulable
	return captured
}
// mergeDesiredNodeMetadata returns a copy of base with overlay layered on top.
//
// Merge rules:
//   - Hostname and CapturedAt are replaced only when overlay carries a
//     non-blank / non-zero value.
//   - Labels and Annotations are unioned key by key, overlay values winning;
//     blank keys are skipped and keys/values are trimmed.
//   - Taints are replaced wholesale when overlay has at least one.
//   - Unschedulable is always taken from overlay.
//
// Node is never changed. The returned Labels and Annotations maps are non-nil.
func mergeDesiredNodeMetadata(base, overlay DesiredNodeMetadata) DesiredNodeMetadata {
	result := cloneDesiredNodeMetadata(base)

	// layer copies every non-blank key of src into dst, allocating dst if needed.
	layer := func(dst, src map[string]string) map[string]string {
		if dst == nil {
			dst = map[string]string{}
		}
		for rawKey, rawValue := range src {
			if name := strings.TrimSpace(rawKey); name != "" {
				dst[name] = strings.TrimSpace(rawValue)
			}
		}
		return dst
	}
	result.Labels = layer(result.Labels, overlay.Labels)
	result.Annotations = layer(result.Annotations, overlay.Annotations)

	if trimmed := strings.TrimSpace(overlay.Hostname); trimmed != "" {
		result.Hostname = trimmed
	}
	if !overlay.CapturedAt.IsZero() {
		result.CapturedAt = overlay.CapturedAt
	}
	if len(overlay.Taints) > 0 {
		result.Taints = normalizeTaints(overlay.Taints)
	}
	result.Unschedulable = overlay.Unschedulable
	return result
}
// patchDesiredNodeMetadata computes the difference between the live node and
// the desired record and, when there is one, sends it as a merge patch to
// /api/v1/nodes/<name>.
//
// The target name is desired.Node, falling back to live.Name; if both are
// blank nothing is done. The patch may contain metadata.labels,
// metadata.annotations, spec.unschedulable and spec.taints. No client is
// created and nil is returned when live already matches desired.
func patchDesiredNodeMetadata(live clusterNode, desired DesiredNodeMetadata) error {
	target := strings.TrimSpace(desired.Node)
	if target == "" {
		target = strings.TrimSpace(live.Name)
	}
	if target == "" {
		return nil
	}

	metadataPatch := map[string]any{}
	if labels := metadataStringPatch(live.Labels, desired.Labels, isRestorableLabel); len(labels) > 0 {
		metadataPatch["labels"] = labels
	}
	if annotations := metadataStringPatch(live.Annotations, desired.Annotations, isRestorableAnnotation); len(annotations) > 0 {
		metadataPatch["annotations"] = annotations
	}

	specPatch := map[string]any{}
	if desired.Unschedulable != live.Unschedulable {
		specPatch["unschedulable"] = desired.Unschedulable
	}
	// Non-restorable live taints are carried over; restorable ones come from desired.
	if wanted := mergeLiveAndDesiredTaints(live.Taints, desired.Taints); !sameTaints(live.Taints, wanted) {
		specPatch["taints"] = taintPatchPayload(wanted)
	}

	patch := map[string]any{}
	if len(metadataPatch) > 0 {
		patch["metadata"] = metadataPatch
	}
	if len(specPatch) > 0 {
		patch["spec"] = specPatch
	}
	if len(patch) == 0 {
		return nil
	}

	client, err := kubeClientFactory()
	if err != nil {
		return err
	}
	return client.mergePatch("/api/v1/nodes/"+target, patch)
}
// metadataStringPatch builds the merge-patch fragment that moves the live
// label/annotation map to the desired one, restricted to keys accepted by
// allow. Keys and values are compared after trimming surrounding whitespace.
//
// The returned map holds:
//   - key -> string for every allowed desired key that is missing from live or
//     whose value differs (this includes desired keys with an empty value,
//     such as role labels, which must still be created when absent);
//   - key -> nil for every allowed live key that desired does not contain
//     (nil marshals to JSON null, the merge-patch form of a deletion).
//
// An empty map means live already matches desired.
func metadataStringPatch(live, desired map[string]string, allow func(string) bool) map[string]any {
	// Normalize desired once so both passes below look keys up the same way.
	wanted := make(map[string]string, len(desired))
	for key, value := range desired {
		key = strings.TrimSpace(key)
		if key == "" || !allow(key) {
			continue
		}
		wanted[key] = strings.TrimSpace(value)
	}

	patch := map[string]any{}
	for key, value := range wanted {
		// Check presence explicitly: a missing live key reads as "", which
		// would otherwise look identical to a desired empty value.
		current, present := live[key]
		if !present || strings.TrimSpace(current) != value {
			patch[key] = value
		}
	}
	for key := range live {
		key = strings.TrimSpace(key)
		if key == "" || !allow(key) {
			continue
		}
		if _, keep := wanted[key]; !keep {
			patch[key] = nil
		}
	}
	return patch
}
// liveClusterNode searches the nodes returned by clusterNodes for one whose
// trimmed name equals the trimmed argument.
//
// It returns a pointer to a copy of the matching entry and true, or nil and
// false when the name is blank or nothing matches.
func liveClusterNode(node string) (*clusterNode, bool) {
	wanted := strings.TrimSpace(node)
	if wanted == "" {
		return nil, false
	}
	for _, candidate := range clusterNodes() {
		if strings.TrimSpace(candidate.Name) != wanted {
			continue
		}
		match := candidate
		return &match, true
	}
	return nil, false
}
// filteredRestorableLabels returns a new, never-nil map containing only the
// entries of values whose trimmed key is non-blank and accepted by
// isRestorableLabel. Keys and values are stored trimmed.
func filteredRestorableLabels(values map[string]string) map[string]string {
	kept := map[string]string{}
	for rawKey, rawValue := range values {
		if name := strings.TrimSpace(rawKey); name != "" && isRestorableLabel(name) {
			kept[name] = strings.TrimSpace(rawValue)
		}
	}
	return kept
}
// filteredRestorableAnnotations returns a new, never-nil map containing only
// the entries of values whose trimmed key is non-blank and accepted by
// isRestorableAnnotation. Keys and values are stored trimmed.
func filteredRestorableAnnotations(values map[string]string) map[string]string {
	kept := map[string]string{}
	for rawKey, rawValue := range values {
		if name := strings.TrimSpace(rawKey); name != "" && isRestorableAnnotation(name) {
			kept[name] = strings.TrimSpace(rawValue)
		}
	}
	return kept
}
// normalizeStringMap returns a fresh copy of values with keys and values
// trimmed and blank keys dropped. The result is nil when the input is empty or
// nothing survives the cleanup.
func normalizeStringMap(values map[string]string) map[string]string {
	if len(values) == 0 {
		return nil
	}
	cleaned := make(map[string]string, len(values))
	for rawKey, rawValue := range values {
		if name := strings.TrimSpace(rawKey); name != "" {
			cleaned[name] = strings.TrimSpace(rawValue)
		}
	}
	if len(cleaned) > 0 {
		return cleaned
	}
	return nil
}
// restorableTaints keeps the taints from values that are non-blank and
// accepted by isRestorableTaint, then returns them normalized (trimmed,
// de-duplicated, sorted; nil when empty).
func restorableTaints(values []string) []string {
	kept := make([]string, 0, len(values))
	for _, raw := range values {
		if taint := normalizeTaint(raw); taint != "" && isRestorableTaint(taint) {
			kept = append(kept, taint)
		}
	}
	return normalizeTaints(kept)
}
// normalizeTaints returns a new slice with every taint trimmed, blanks and
// duplicates removed, and the remainder sorted lexicographically. It returns
// nil when the input is empty or nothing remains.
func normalizeTaints(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	unique := make(map[string]struct{}, len(values))
	result := make([]string, 0, len(values))
	for _, raw := range values {
		taint := normalizeTaint(raw)
		if taint == "" {
			continue
		}
		if _, dup := unique[taint]; !dup {
			unique[taint] = struct{}{}
			result = append(result, taint)
		}
	}
	if len(result) == 0 {
		return nil
	}
	sort.Strings(result)
	return result
}
// normalizeTaint canonicalizes a single taint string by stripping leading and
// trailing whitespace.
func normalizeTaint(value string) string {
	trimmed := strings.TrimSpace(value)
	return trimmed
}
// sameTaints reports whether left and right describe the same set of taints
// once both have been normalized (trimmed, de-duplicated, sorted).
func sameTaints(left, right []string) bool {
	a, b := normalizeTaints(left), normalizeTaints(right)
	if len(a) != len(b) {
		return false
	}
	for i, taint := range a {
		if b[i] != taint {
			return false
		}
	}
	return true
}
// mergeLiveAndDesiredTaints computes the taint list a node should end up with:
// the live taints that are NOT restorable (those rejected by
// isRestorableTaint) are preserved, every restorable live taint is dropped,
// and the restorable taints from desired are added. The result is normalized.
func mergeLiveAndDesiredTaints(live, desired []string) []string {
	combined := make([]string, 0, len(live)+len(desired))
	for _, raw := range live {
		current := normalizeTaint(raw)
		if current != "" && !isRestorableTaint(current) {
			combined = append(combined, current)
		}
	}
	combined = append(combined, restorableTaints(desired)...)
	return normalizeTaints(combined)
}
// taintPatchPayload converts taint strings into the list-of-objects form used
// in the node patch: each entry always has "key" and carries "value" and
// "effect" only when they are non-empty. Taints without a key are skipped.
// The returned slice is never nil, so an empty result encodes as [].
func taintPatchPayload(values []string) []map[string]string {
	entries := make([]map[string]string, 0, len(values))
	for _, taint := range normalizeTaints(values) {
		key, value, effect := splitTaint(taint)
		if key == "" {
			continue
		}
		entry := make(map[string]string, 3)
		entry["key"] = key
		if value != "" {
			entry["value"] = value
		}
		if effect != "" {
			entry["effect"] = effect
		}
		entries = append(entries, entry)
	}
	return entries
}
// splitTaint parses a taint written as "key[=value][:effect]" and returns its
// key, value and effect, each trimmed of surrounding whitespace.
//
// The effect is whatever follows the LAST ':'; the value is whatever follows
// the FIRST '=' in the remaining part. Missing components come back as "".
func splitTaint(raw string) (string, string, string) {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return "", "", ""
	}

	keyValue, effect := trimmed, ""
	if colon := strings.LastIndex(trimmed, ":"); colon >= 0 {
		keyValue = strings.TrimSpace(trimmed[:colon])
		effect = strings.TrimSpace(trimmed[colon+1:])
	}

	// Cut leaves value empty when there is no '='.
	key, value, _ := strings.Cut(keyValue, "=")
	return strings.TrimSpace(key), strings.TrimSpace(value), effect
}
// isRestorableTaint reports whether the taint in raw should be preserved and
// re-applied. Taints with no key, or whose key starts with one of the
// excluded prefixes listed below, are not restorable.
func isRestorableTaint(raw string) bool {
	key, _, _ := splitTaint(raw)
	if key == "" {
		return false
	}
	excluded := [...]string{
		"node.kubernetes.io/",
		"node.cloudprovider.kubernetes.io/",
		"ToBeDeletedByClusterAutoscaler",
	}
	for i := range excluded {
		if strings.HasPrefix(key, excluded[i]) {
			return false
		}
	}
	return true
}
// isRestorableLabel reports whether a label key should be preserved and
// re-applied. Blank keys are rejected, node-role.kubernetes.io/ keys are
// always accepted, and keys under any of the excluded prefixes listed below
// are rejected; everything else is accepted.
func isRestorableLabel(key string) bool {
	name := strings.TrimSpace(key)
	switch {
	case name == "":
		return false
	case strings.HasPrefix(name, "node-role.kubernetes.io/"):
		return true
	}

	excluded := [...]string{
		"kubernetes.io/",
		"beta.kubernetes.io/",
		"node.kubernetes.io/",
		"topology.kubernetes.io/",
		"feature.node.kubernetes.io/",
		"failure-domain.beta.kubernetes.io/",
		"nvidia.com/",
		"k3s.io/",
		"rke2.io/",
		"volumes.kubernetes.io/",
		"node.cloudprovider.kubernetes.io/",
	}
	for i := range excluded {
		if strings.HasPrefix(name, excluded[i]) {
			return false
		}
	}
	return true
}
// isRestorableAnnotation reports whether an annotation key should be
// preserved and re-applied. Blank keys and keys under any of the excluded
// prefixes listed below are rejected; everything else is accepted.
func isRestorableAnnotation(key string) bool {
	name := strings.TrimSpace(key)
	if name == "" {
		return false
	}

	excluded := [...]string{
		"kubectl.kubernetes.io/",
		"kubeadm.alpha.kubernetes.io/",
		"kubernetes.io/",
		"node.alpha.kubernetes.io/",
		"node.kubernetes.io/",
		"volumes.kubernetes.io/",
		"csi.volume.kubernetes.io/",
		"csi.storage.k8s.io/",
		"flannel.alpha.coreos.com/",
		"projectcalico.org/",
		"rke2.io/",
		"k3s.io/",
		"nvidia.com/",
	}
	for i := range excluded {
		if strings.HasPrefix(name, excluded[i]) {
			return false
		}
	}
	return true
}
// cloneDesiredNodeMetadata returns a copy of value that shares no map or slice
// with the original. As a side effect of how the copies are made, labels,
// annotations and taints come back normalized (and nil when empty).
func cloneDesiredNodeMetadata(value DesiredNodeMetadata) DesiredNodeMetadata {
	// value is already a by-value copy of the caller's struct, so only the
	// reference-typed fields need to be detached.
	value.Labels = normalizeStringMap(value.Labels)
	value.Annotations = normalizeStringMap(value.Annotations)
	value.Taints = normalizeTaints(value.Taints)
	return value
}
// desiredNodeMetadataSyncEvent builds the "sentinel.node-metadata" event that
// records a failed attempt to restore the desired metadata of node. err must
// be non-nil because its message is stored in the event details.
func desiredNodeMetadataSyncEvent(node string, err error) Event {
	details := map[string]any{
		"node":  node,
		"error": err.Error(),
	}
	summary := fmt.Sprintf("Could not restore desired node metadata for %s", node)
	return Event{
		Time:    time.Now().UTC(),
		Kind:    "sentinel.node-metadata",
		Summary: summary,
		Details: details,
	}
}

View File

@ -0,0 +1,254 @@
package service
import (
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"reflect"
"strings"
"testing"
"time"
"metis/pkg/sentinel"
)
// TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster stages metadata
// for titan-15 with three conflicting sources (inventory, a previously stored
// record, and a fake live cluster) and checks the merged record and that it
// was written to the desired-metadata file.
func TestStageDesiredNodeMetadataMergesInventoryAndLiveCluster(t *testing.T) {
	// Fake API server: only the node list is served; it reports titan-15 with
	// a mix of user-defined and system-managed labels, annotations and taints.
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": []any{
					map[string]any{
						"metadata": map[string]any{
							"name": "titan-15",
							"labels": map[string]string{
								"hardware": "rpi5",
								"rack": "a1",
								"maintenance.bstein.dev/color": "blue",
								"kubernetes.io/arch": "arm64",
								"node-role.kubernetes.io/worker": "true",
							},
							"annotations": map[string]string{
								"maintenance.bstein.dev/owner": "atlas",
								"volumes.kubernetes.io/controller-managed-attach-detach": "true",
							},
						},
						"spec": map[string]any{
							"unschedulable": true,
							"taints": []any{
								map[string]any{"key": "dedicated", "value": "recovery", "effect": "NoSchedule"},
								map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"},
							},
						},
					},
				},
			})
		default:
			http.NotFound(w, r)
		}
	}))
	defer kube.Close()
	installKubeFactory(t, kube)
	app := newTestApp(t)
	// Inventory disagrees with the live cluster on "hardware" and on taints.
	app.inventory.Nodes[0].Labels = map[string]string{"hardware": "rpi4", "rack": "a1"}
	app.inventory.Nodes[0].Taints = []string{"flash=true:NoSchedule"}
	// Previously stored record contributes an annotation the cluster lacks.
	app.desiredMetadata["titan-15"] = DesiredNodeMetadata{
		Node: "titan-15",
		Annotations: map[string]string{"maintenance.bstein.dev/legacy": "keep"},
	}
	desired, err := app.stageDesiredNodeMetadata("titan-15")
	if err != nil {
		t.Fatalf("stageDesiredNodeMetadata: %v", err)
	}
	if desired.Hostname != "titan-15" || !desired.Unschedulable {
		t.Fatalf("unexpected desired metadata header: %#v", desired)
	}
	// Live label values win over the inventory ("rpi5" over "rpi4").
	if desired.Labels["hardware"] != "rpi5" || desired.Labels["rack"] != "a1" || desired.Labels["maintenance.bstein.dev/color"] != "blue" {
		t.Fatalf("unexpected desired labels: %#v", desired.Labels)
	}
	if _, ok := desired.Labels["kubernetes.io/arch"]; ok {
		t.Fatalf("system labels should not be persisted: %#v", desired.Labels)
	}
	// Annotations are the union of the stored record and the live node.
	if desired.Annotations["maintenance.bstein.dev/owner"] != "atlas" || desired.Annotations["maintenance.bstein.dev/legacy"] != "keep" {
		t.Fatalf("unexpected desired annotations: %#v", desired.Annotations)
	}
	if _, ok := desired.Annotations["volumes.kubernetes.io/controller-managed-attach-detach"]; ok {
		t.Fatalf("controller annotations should not be persisted: %#v", desired.Annotations)
	}
	// Live taints replace the inventory taints; the node.kubernetes.io/ one is filtered out.
	if !reflect.DeepEqual(desired.Taints, []string{"dedicated=recovery:NoSchedule"}) {
		t.Fatalf("unexpected desired taints: %#v", desired.Taints)
	}
	data, err := os.ReadFile(app.settings.DesiredMetadataPath)
	if err != nil {
		t.Fatalf("read desired metadata file: %v", err)
	}
	if !strings.Contains(string(data), "titan-15") {
		t.Fatalf("desired metadata file missing titan-15: %s", string(data))
	}
}
// TestStoreSnapshotRestoresDesiredNodeMetadata verifies that storing a
// snapshot for a node with a desired-metadata record sends a node patch that
// restores labels, annotations, schedulability and taints, while leaving
// system taints in place.
func TestStoreSnapshotRestoresDesiredNodeMetadata(t *testing.T) {
	var patchBody map[string]any
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": []any{
					map[string]any{
						"metadata": map[string]any{
							"name": "titan-15",
							"labels": map[string]string{
								"hardware":                   "rpi4",
								"maintenance.bstein.dev/old": "1",
							},
							"annotations": map[string]string{
								"maintenance.bstein.dev/mode": "old",
							},
						},
						"spec": map[string]any{
							"unschedulable": true,
							"taints": []any{
								map[string]any{"key": "dedicated", "value": "old", "effect": "NoSchedule"},
								map[string]any{"key": "node.kubernetes.io/unreachable", "effect": "NoExecute"},
							},
						},
					},
				},
			})
		case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/nodes/titan-15":
			// This handler runs on the server's goroutine, where t.Fatalf
			// (FailNow) must not be called: record the failure with Errorf
			// and answer with an error status instead.
			var decoded map[string]any
			if err := json.NewDecoder(r.Body).Decode(&decoded); err != nil {
				t.Errorf("decode patch: %v", err)
				http.Error(w, "invalid patch body", http.StatusBadRequest)
				return
			}
			patchBody = decoded
			_ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"})
		default:
			http.NotFound(w, r)
		}
	}))
	defer kube.Close()
	installKubeFactory(t, kube)
	app := newTestApp(t)
	app.desiredMetadata["titan-15"] = DesiredNodeMetadata{
		Node:          "titan-15",
		Hostname:      "titan-15",
		Labels:        map[string]string{"hardware": "rpi5"},
		Annotations:   map[string]string{"maintenance.bstein.dev/mode": "recovery"},
		Taints:        []string{"dedicated=recovery:NoSchedule"},
		Unschedulable: false,
	}
	if err := app.StoreSnapshot(SnapshotRecord{
		Node:        "titan-15",
		CollectedAt: time.Date(2026, 4, 24, 6, 0, 0, 0, time.UTC),
		Snapshot:    sentinel.Snapshot{Hostname: "titan-15"},
	}); err != nil {
		t.Fatalf("StoreSnapshot: %v", err)
	}
	if patchBody == nil {
		t.Fatal("expected desired metadata patch")
	}
	metadata := patchBody["metadata"].(map[string]any)
	labels := metadata["labels"].(map[string]any)
	// The stale label must be sent as JSON null (decoded here as nil).
	if labels["hardware"] != "rpi5" || labels["maintenance.bstein.dev/old"] != nil {
		t.Fatalf("unexpected label patch: %#v", labels)
	}
	annotations := metadata["annotations"].(map[string]any)
	if annotations["maintenance.bstein.dev/mode"] != "recovery" {
		t.Fatalf("unexpected annotation patch: %#v", annotations)
	}
	spec := patchBody["spec"].(map[string]any)
	if spec["unschedulable"] != false {
		t.Fatalf("unexpected spec patch: %#v", spec)
	}
	taints := spec["taints"].([]any)
	if len(taints) != 2 {
		t.Fatalf("unexpected taint payload: %#v", taints)
	}
	entries := map[string]map[string]any{}
	for _, raw := range taints {
		entry := raw.(map[string]any)
		key := entry["key"].(string)
		entries[key] = entry
	}
	if entries["dedicated"]["value"] != "recovery" || entries["dedicated"]["effect"] != "NoSchedule" {
		t.Fatalf("missing desired taint replacement: %#v", entries)
	}
	if entries["node.kubernetes.io/unreachable"]["effect"] != "NoExecute" {
		t.Fatalf("system taint should be preserved: %#v", entries)
	}
}
// TestDesiredNodeMetadataHelpers exercises the small helpers behind desired
// node metadata: lookups for unknown nodes, the restorable-key filters, taint
// parsing/merging, patch construction, cloning and the sync-failure event.
func TestDesiredNodeMetadataHelpers(t *testing.T) {
	app := newTestApp(t)
	// Unknown or blank nodes must be handled as no-ops.
	if _, ok := app.desiredMetadataForNode("missing"); ok {
		t.Fatal("expected no desired metadata for missing node")
	}
	if err := app.syncDesiredNodeMetadata(SnapshotRecord{Node: "missing"}); err != nil {
		t.Fatalf("syncDesiredNodeMetadata missing should noop: %v", err)
	}
	if _, ok := liveClusterNode(""); ok {
		t.Fatal("empty liveClusterNode lookup should fail")
	}
	// Restorable-key filters: user-defined keys pass, system prefixes do not.
	if !isRestorableLabel("maintenance.bstein.dev/role") || isRestorableLabel("kubernetes.io/arch") {
		t.Fatal("unexpected label restoration filter")
	}
	if !isRestorableAnnotation("maintenance.bstein.dev/state") || isRestorableAnnotation("volumes.kubernetes.io/foo") {
		t.Fatal("unexpected annotation restoration filter")
	}
	if !isRestorableTaint("dedicated=recovery:NoSchedule") || isRestorableTaint("node.kubernetes.io/not-ready:NoExecute") {
		t.Fatal("unexpected taint restoration filter")
	}
	// Taint parsing: full form and key-only form.
	key, value, effect := splitTaint("dedicated=recovery:NoSchedule")
	if key != "dedicated" || value != "recovery" || effect != "NoSchedule" {
		t.Fatalf("splitTaint mismatch: %q %q %q", key, value, effect)
	}
	if key, value, effect := splitTaint("just-a-key"); key != "just-a-key" || value != "" || effect != "" {
		t.Fatalf("splitTaint key-only mismatch: %q %q %q", key, value, effect)
	}
	labels := filteredRestorableLabels(map[string]string{"hardware": "rpi5", "kubernetes.io/arch": "arm64"})
	if !reflect.DeepEqual(labels, map[string]string{"hardware": "rpi5"}) {
		t.Fatalf("filteredRestorableLabels = %#v", labels)
	}
	annotations := filteredRestorableAnnotations(map[string]string{"maintenance.bstein.dev/state": "ok", "volumes.kubernetes.io/foo": "bar"})
	if !reflect.DeepEqual(annotations, map[string]string{"maintenance.bstein.dev/state": "ok"}) {
		t.Fatalf("filteredRestorableAnnotations = %#v", annotations)
	}
	// Patch construction: changed keys carry the new value, removed keys carry nil.
	patch := metadataStringPatch(
		map[string]string{"hardware": "rpi4", "maintenance.bstein.dev/old": "1"},
		map[string]string{"hardware": "rpi5"},
		isRestorableLabel,
	)
	if patch["hardware"] != "rpi5" || patch["maintenance.bstein.dev/old"] != nil {
		t.Fatalf("metadataStringPatch = %#v", patch)
	}
	// Taint merge: system taint kept, restorable live taint replaced, duplicates collapsed.
	mergedTaints := mergeLiveAndDesiredTaints(
		[]string{"node.kubernetes.io/unreachable:NoExecute", "dedicated=old:NoSchedule"},
		[]string{"dedicated=new:NoSchedule", "dedicated=new:NoSchedule"},
	)
	if !reflect.DeepEqual(mergedTaints, []string{"dedicated=new:NoSchedule", "node.kubernetes.io/unreachable:NoExecute"}) {
		t.Fatalf("mergeLiveAndDesiredTaints = %#v", mergedTaints)
	}
	payload := taintPatchPayload([]string{"dedicated=new:NoSchedule"})
	if len(payload) != 1 || payload[0]["key"] != "dedicated" || payload[0]["value"] != "new" || payload[0]["effect"] != "NoSchedule" {
		t.Fatalf("taintPatchPayload = %#v", payload)
	}
	// Mutating a clone must not leak into the original.
	original := DesiredNodeMetadata{Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}}
	cloned := cloneDesiredNodeMetadata(original)
	cloned.Labels["hardware"] = "mutated"
	cloned.Taints[0] = "changed"
	if original.Labels["hardware"] != "rpi5" || original.Taints[0] != "dedicated=new:NoSchedule" {
		t.Fatalf("cloneDesiredNodeMetadata should deep-copy slices/maps: %#v %#v", original, cloned)
	}
	// Live already matches desired, so no patch request is expected.
	if err := patchDesiredNodeMetadata(
		clusterNode{Name: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}},
		DesiredNodeMetadata{Node: "titan-15", Labels: map[string]string{"hardware": "rpi5"}, Taints: []string{"dedicated=new:NoSchedule"}},
	); err != nil {
		t.Fatalf("patchDesiredNodeMetadata should noop when already in sync: %v", err)
	}
	if event := desiredNodeMetadataSyncEvent("titan-15", os.ErrPermission); event.Kind != "sentinel.node-metadata" || event.Details["node"] != "titan-15" {
		t.Fatalf("desiredNodeMetadataSyncEvent = %#v", event)
	}
}

View File

@ -74,12 +74,17 @@ func (a *App) RefreshDevices(host string) ([]Device, error) {
}
func (a *App) runBuild(job *Job, flash bool) {
_, class, err := a.inventory.FindNode(job.Node)
nodeSpec, class, err := a.inventory.FindNode(job.Node)
if err != nil {
a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error")
return
}
if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil {
a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error")
return
}
if err := a.ensureHarborProject(); err != nil {
a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error")
@ -112,7 +117,8 @@ func (a *App) runBuild(job *Job, flash bool) {
return
}
buildPod := fmt.Sprintf("metis-build-%d", time.Now().UTC().UnixNano())
logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, artifactRef, buildTag))
job.Builder = builder.Name
logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, strings.TrimSpace(nodeSpec.Hostname), artifactRef, buildTag))
if err != nil {
a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error")
@ -183,6 +189,9 @@ func (a *App) runFlash(job *Job) {
}
func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) {
if _, err := a.stageDesiredNodeMetadata(job.Node); err != nil {
return RemoteFlashResult{}, err
}
a.setJob(job.ID, func(j *Job) {
j.Status = JobRunning
j.Stage = "preflight"

View File

@ -1,6 +1,7 @@
package service
import (
"encoding/json"
"fmt"
"math"
"path/filepath"
@ -246,8 +247,9 @@ func (a *App) remoteDevicePodSpec(name, host, image string) map[string]any {
}
}
func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag string) map[string]any {
func (a *App) remoteBuildPodSpec(name, host, image, node, nodeHostname, artifactRef, buildTag string) map[string]any {
workspaceHostPath := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name)
desiredEnv := remoteDesiredMetadataEnv(a, node)
return map[string]any{
"apiVersion": "v1",
"kind": "Pod",
@ -255,7 +257,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag
"name": name,
"namespace": a.settings.Namespace,
"labels": map[string]string{"app": "metis-remote", "metis-run": "build"},
"annotations": vaultRuntimeAnnotations(true),
"annotations": vaultRuntimeAnnotations(true, nodeHostname),
},
"spec": map[string]any{
"restartPolicy": "Never",
@ -283,6 +285,7 @@ func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag
),
},
"securityContext": map[string]any{"runAsUser": 0, "runAsGroup": 0},
"env": desiredEnv,
"envFrom": []map[string]any{
{"configMapRef": map[string]any{"name": "metis"}},
},
@ -309,7 +312,7 @@ func (a *App) remoteFlashPodSpec(name, host, image, node, device, artifactRef st
"name": name,
"namespace": a.settings.Namespace,
"labels": map[string]string{"app": "metis-remote", "metis-run": "flash"},
"annotations": vaultRuntimeAnnotations(false),
"annotations": vaultRuntimeAnnotations(false, ""),
},
"spec": map[string]any{
"restartPolicy": "Never",
@ -378,7 +381,46 @@ func mountedHostTmpDir(path string) string {
return "/host-tmp"
}
func vaultRuntimeAnnotations(includeSSHKeys bool) map[string]string {
// remoteDesiredMetadataEnv returns the container env entries that hand the
// node's desired labels and taints to the remote build as JSON:
// METIS_NODE_LABELS_JSON and METIS_NODE_TAINTS_JSON. A variable is omitted
// when its source is empty.
//
// The result is nil when no desired record exists for node, and a non-nil
// (possibly empty) slice otherwise.
func remoteDesiredMetadataEnv(a *App, node string) []map[string]any {
	desired, ok := a.desiredMetadataForNode(node)
	if !ok {
		return nil
	}

	env := make([]map[string]any, 0, 2)
	addVar := func(name, value string) {
		if value != "" {
			env = append(env, map[string]any{"name": name, "value": value})
		}
	}

	// Marshal errors are ignored on purpose: the helpers return "" on failure,
	// which simply leaves the variable out.
	labelsJSON, _ := jsonMarshalStringMap(desired.Labels)
	taintsJSON, _ := jsonMarshalStringSlice(desired.Taints)
	addVar("METIS_NODE_LABELS_JSON", labelsJSON)
	addVar("METIS_NODE_TAINTS_JSON", taintsJSON)
	return env
}
// jsonMarshalStringMap encodes values as a JSON object string. An empty or
// nil map yields "" (not "{}"), and a marshal failure yields "" plus the
// error.
func jsonMarshalStringMap(values map[string]string) (string, error) {
	if len(values) > 0 {
		encoded, err := json.Marshal(values)
		if err != nil {
			return "", err
		}
		return string(encoded), nil
	}
	return "", nil
}
// jsonMarshalStringSlice encodes values as a JSON array string. An empty or
// nil slice yields "" (not "[]"), and a marshal failure yields "" plus the
// error.
func jsonMarshalStringSlice(values []string) (string, error) {
	if len(values) > 0 {
		encoded, err := json.Marshal(values)
		if err != nil {
			return "", err
		}
		return string(encoded), nil
	}
	return "", nil
}
func vaultRuntimeAnnotations(includeSSHKeys bool, nodeHostname string) map[string]string {
annotations := map[string]string{
"vault.hashicorp.com/agent-inject": "true",
"vault.hashicorp.com/agent-pre-populate-only": "true",
@ -399,6 +441,19 @@ export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
{{ end }}`
}
nodeHostname = strings.TrimSpace(nodeHostname)
if nodeHostname != "" {
secretPath := fmt.Sprintf("secret/data/nodes/%s", nodeHostname)
annotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] = secretPath
annotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"] = `{{ with secret "` + secretPath + `" }}
export METIS_NODE_SSH_PASSWORD="{{ .Data.data.ssh_password }}"
export METIS_NODE_SSH_PASSWORD_HASH="{{ .Data.data.ssh_password_hash }}"
export METIS_NODE_ATLAS_PASSWORD="{{ .Data.data.atlas_password }}"
export METIS_NODE_ATLAS_PASSWORD_HASH="{{ .Data.data.atlas_password_hash }}"
export METIS_NODE_ROOT_PASSWORD="{{ .Data.data.root_password }}"
export METIS_NODE_ROOT_PASSWORD_HASH="{{ .Data.data.root_password_hash }}"
{{ end }}`
}
return annotations
@ -413,6 +468,7 @@ func remoteWorkerEntrypoint(includeSSHKeys bool, args ...string) string {
if includeSSHKeys {
lines = append(lines, ". /vault/secrets/metis-ssh-env.sh")
}
lines = append(lines, "if [ -f /vault/secrets/metis-node-secrets-env.sh ]; then . /vault/secrets/metis-node-secrets-env.sh; fi")
lines = append(lines, "exec "+shellJoin(append([]string{"metis"}, args...)...))
return strings.Join(lines, "\n")
}

View File

@ -251,8 +251,13 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) {
app := newTestApp(t)
app.settings.RemoteWorkspaceDir = "/var/tmp/metis-workspace"
app.settings.HostTmpDir = "/var/tmp/metis-flash-test"
app.desiredMetadata["titan-10"] = DesiredNodeMetadata{
Node: "titan-10",
Labels: map[string]string{"hardware": "rpi5"},
Taints: []string{"dedicated=recovery:NoSchedule"},
}
buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "registry.example/metis/titan-10", "build-1")
buildSpec := app.remoteBuildPodSpec("metis-build-123", "titan-04", "runner:arm64", "titan-10", "titan-10", "registry.example/metis/titan-10", "build-1")
buildBody := buildSpec["spec"].(map[string]any)
buildVolumes := buildBody["volumes"].([]map[string]any)
workspaceVolume := buildVolumes[0]["hostPath"].(map[string]any)
@ -260,6 +265,17 @@ func TestRemoteWorkspaceAndHostTmpPathsPreferUsbScratchRoots(t *testing.T) {
t.Fatalf("build workspace hostPath = %v", got)
}
buildContainer := buildBody["containers"].([]map[string]any)[0]
buildEnv := buildContainer["env"].([]map[string]any)
if len(buildEnv) != 2 {
t.Fatalf("expected desired metadata env, got %#v", buildEnv)
}
metadataAnnotations := buildSpec["metadata"].(map[string]any)["annotations"].(map[string]string)
if metadataAnnotations["vault.hashicorp.com/agent-inject-secret-metis-node-secrets-env.sh"] != "secret/data/nodes/titan-10" {
t.Fatalf("unexpected node secret annotation: %#v", metadataAnnotations)
}
if !strings.Contains(metadataAnnotations["vault.hashicorp.com/agent-inject-template-metis-node-secrets-env.sh"], "METIS_NODE_ROOT_PASSWORD") {
t.Fatalf("expected node password exports in vault template: %#v", metadataAnnotations)
}
buildSecurity := buildContainer["securityContext"].(map[string]any)
if got := buildSecurity["runAsUser"]; got != 0 {
t.Fatalf("build runAsUser = %v", got)

View File

@ -15,7 +15,7 @@ func TestMountedHostTmpDirMapsConfiguredTmpPathIntoMount(t *testing.T) {
}
func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) {
withKeys := vaultRuntimeAnnotations(true)
withKeys := vaultRuntimeAnnotations(true, "titan-15")
template := withKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]
if !strings.Contains(template, "METIS_SSH_KEY_HECATE_TETHYS") {
t.Fatalf("expected tethys hecate key export in vault template: %q", template)
@ -24,7 +24,7 @@ func TestVaultRuntimeAnnotationsIncludeReciprocalHecateKeys(t *testing.T) {
t.Fatalf("expected db hecate key export in vault template: %q", template)
}
withoutKeys := vaultRuntimeAnnotations(false)
withoutKeys := vaultRuntimeAnnotations(false, "")
if _, ok := withoutKeys["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"]; ok {
t.Fatalf("did not expect ssh key template when includeSSHKeys=false")
}

View File

@ -11,30 +11,31 @@ var hostNameLookup = os.Hostname
// Settings configures the Metis service runtime.
type Settings struct {
BindAddr string
InventoryPath string
CacheDir string
ArtifactDir string
ArtifactStatePath string
HistoryPath string
SnapshotsPath string
TargetsPath string
DefaultFlashHost string
FlashHosts []string
LocalHost string
AllowedGroups []string
MaxDeviceBytes int64
Namespace string
RunnerImageAMD64 string
RunnerImageARM64 string
HarborRegistry string
HarborProject string
HarborAPIBase string
HarborUsername string
HarborPassword string
HostTmpDir string
RemoteWorkspaceDir string
RemotePodTimeout int64
BindAddr string
InventoryPath string
CacheDir string
ArtifactDir string
ArtifactStatePath string
HistoryPath string
SnapshotsPath string
TargetsPath string
DesiredMetadataPath string
DefaultFlashHost string
FlashHosts []string
LocalHost string
AllowedGroups []string
MaxDeviceBytes int64
Namespace string
RunnerImageAMD64 string
RunnerImageARM64 string
HarborRegistry string
HarborProject string
HarborAPIBase string
HarborUsername string
HarborPassword string
HostTmpDir string
RemoteWorkspaceDir string
RemotePodTimeout int64
}
// FromEnv builds service settings with sensible defaults for local dev and in-cluster use.
@ -44,30 +45,31 @@ func FromEnv() Settings {
defaultFlashHost := getenvDefault("METIS_DEFAULT_FLASH_HOST", localHost)
flashHosts := splitList(getenvDefault("METIS_FLASH_HOSTS", defaultFlashHost))
return Settings{
BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"),
InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"),
CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")),
ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")),
ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")),
HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")),
SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")),
TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")),
DefaultFlashHost: defaultFlashHost,
FlashHosts: flashHosts,
LocalHost: localHost,
AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintenance")),
MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000),
Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"),
RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""),
RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""),
HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"),
HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"),
HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"),
HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""),
HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""),
HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/var/tmp/metis-flash-test"),
RemoteWorkspaceDir: getenvDefault("METIS_REMOTE_WORKSPACE_DIR", "/var/tmp/metis-workspace"),
RemotePodTimeout: getenvInt64("METIS_REMOTE_POD_TIMEOUT_SEC", 1800),
BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"),
InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"),
CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")),
ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")),
ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")),
HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")),
SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")),
TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")),
DesiredMetadataPath: getenvDefault("METIS_DESIRED_METADATA_PATH", filepath.Join(dataDir, "desired-node-metadata.json")),
DefaultFlashHost: defaultFlashHost,
FlashHosts: flashHosts,
LocalHost: localHost,
AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintenance")),
MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000),
Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"),
RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""),
RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""),
HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"),
HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"),
HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"),
HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""),
HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""),
HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/var/tmp/metis-flash-test"),
RemoteWorkspaceDir: getenvDefault("METIS_REMOTE_WORKSPACE_DIR", "/var/tmp/metis-workspace"),
RemotePodTimeout: getenvInt64("METIS_REMOTE_POD_TIMEOUT_SEC", 1800),
}
}