metis/pkg/service/remote_helpers.go

415 lines
13 KiB
Go

package service
import (
"fmt"
"math"
"path/filepath"
"sort"
"strings"
"time"
"metis/pkg/inventory"
)
// buildStageHeartbeat maps the time a remote build has been running to an
// estimated progress value and a human-readable status line.
//
// The estimate is purely time-based: each elapsed-time window is assigned a
// stage message and a progress range that is interpolated with ramp. The
// final stage is capped at 76.
//
// node is the name of the node the image is built for and builder is the
// host running the build; both are only used in the status text.
func buildStageHeartbeat(node, builder string, elapsed time.Duration) (float64, string) {
	t := elapsed.Seconds()

	if t < 20 {
		status := fmt.Sprintf("Scheduling a remote builder on %s for %s", builder, node)
		return ramp(t, 0, 20, 8, 14), status
	}
	if t < 120 {
		status := fmt.Sprintf("Injecting %s recovery config into the base image on %s", node, builder)
		return ramp(t, 20, 120, 14, 30), status
	}
	if t < 360 {
		status := fmt.Sprintf("Building the replacement image filesystem for %s on %s", node, builder)
		return ramp(t, 120, 360, 30, 58), status
	}
	if t < 540 {
		status := fmt.Sprintf("Compressing the replacement image for %s before upload", node)
		return ramp(t, 360, 540, 58, 70), status
	}

	// Last stage: keep creeping forward but never report more than 76.
	progress := math.Min(76, ramp(t, 540, 900, 70, 76))
	status := fmt.Sprintf("Publishing %s to Harbor and refreshing the latest tag", node)
	return progress, status
}
// flashStageHeartbeat maps the time a remote flash has been running to an
// estimated progress value and a human-readable status line.
//
// Progress is interpolated with ramp inside each elapsed-time window and the
// final stage is capped at 98. host and artifact are only used in the status
// text.
func flashStageHeartbeat(host, artifact string, elapsed time.Duration) (float64, string) {
	t := elapsed.Seconds()

	if t < 10 {
		status := fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host)
		return ramp(t, 0, 10, 84, 88), status
	}
	if t < 45 {
		status := fmt.Sprintf("Writing the latest image to the selected target on %s", host)
		return ramp(t, 10, 45, 88, 96), status
	}

	// Last stage: never report more than 98 from the heartbeat alone.
	progress := math.Min(98, ramp(t, 45, 120, 96, 98))
	status := fmt.Sprintf("Flushing buffers and finishing the write on %s", host)
	return progress, status
}
// prettyDeviceTarget turns a device path into text suitable for display.
//
// A "hosttmp://" prefix is stripped, a blank path becomes the phrase
// "the selected target", and anything else is returned unchanged.
func prettyDeviceTarget(path string) string {
	const hostTmpScheme = "hosttmp://"

	if strings.HasPrefix(path, hostTmpScheme) {
		return path[len(hostTmpScheme):]
	}
	if strings.TrimSpace(path) == "" {
		return "the selected target"
	}
	return path
}
// hostTmpHostPath normalizes the configured host temp directory.
//
// The input is trimmed and cleaned; if nothing usable remains (empty, ".",
// or the filesystem root) the default "/var/tmp/metis-flash-test" is
// returned instead.
func hostTmpHostPath(path string) string {
	const fallback = "/var/tmp/metis-flash-test"

	switch cleaned := filepath.Clean(strings.TrimSpace(path)); cleaned {
	case "", ".", "/":
		return fallback
	default:
		return cleaned
	}
}
// remoteWorkspaceHostPath returns the host directory used as a remote pod's
// workspace.
//
// root is trimmed and cleaned, falling back to "/var/tmp/metis-workspace"
// when it is empty, ".", or the filesystem root. A non-blank podName is
// appended as a sub-directory; a blank podName yields the root itself.
func remoteWorkspaceHostPath(root, podName string) string {
	base := filepath.Clean(strings.TrimSpace(root))
	switch base {
	case "", ".", "/":
		base = "/var/tmp/metis-workspace"
	}

	if strings.TrimSpace(podName) != "" {
		return filepath.Join(base, podName)
	}
	return base
}
// managedPathsContain reports whether want appears as an entry in raw, a
// comma-separated list of filesystem paths.
//
// Entries and want are compared after trimming surrounding whitespace. A
// blank want never matches.
//
// The list is split on commas rather than underscores: '_' is a legal and
// common path character, so splitting on it both produced false positives
// (a listed "/var/tmp_old" matched "/var/tmp") and made any path containing
// an underscore impossible to match.
// NOTE(review): assumes the producer of the list emits comma-separated
// paths; confirm against whatever populates the managed-paths value.
func managedPathsContain(raw, want string) bool {
	want = strings.TrimSpace(want)
	if want == "" {
		return false
	}
	for _, entry := range strings.Split(raw, ",") {
		if strings.TrimSpace(entry) == want {
			return true
		}
	}
	return false
}
// usbScratchReadyForWorkspace reports whether node's USB scratch status is
// "ok" and its managed-path list includes "/var/tmp".
func usbScratchReadyForWorkspace(node clusterNode) bool {
	if node.USBScratchStatus != "ok" {
		return false
	}
	return managedPathsContain(node.USBScratchManagedPaths, "/var/tmp")
}
// ramp linearly maps value from the interval [start, end] onto [min, max],
// clamping at both ends.
//
// Values at or below start yield min, values at or above end yield max, and
// a degenerate interval (end <= start) always yields max.
func ramp(value, start, end, min, max float64) float64 {
	switch {
	case end <= start:
		return max
	case value <= start:
		return min
	case value >= end:
		return max
	}

	span := end - start
	return min + ((value-start)/span)*(max-min)
}
// ensureDevice verifies that path names a device currently reported by
// RefreshDevices for host and returns a copy of that device.
//
// It returns an error when path is blank, when the device refresh fails
// (that error is returned unchanged), or when no reported device has
// exactly the given path.
func (a *App) ensureDevice(host, path string) (*Device, error) {
	if strings.TrimSpace(path) == "" {
		return nil, fmt.Errorf("select removable media before starting a flash run")
	}

	candidates, err := a.RefreshDevices(host)
	if err != nil {
		return nil, err
	}

	for _, candidate := range candidates {
		if candidate.Path != path {
			continue
		}
		// Hand back a private copy rather than a pointer into the list.
		match := candidate
		return &match, nil
	}
	return nil, fmt.Errorf("device %s is not a current flash candidate on %s", path, host)
}
// selectBuilderHost picks the cluster node that should run an image build
// for the given architecture.
//
// Nodes with a different architecture, unschedulable nodes, and
// control-plane nodes are skipped. Every remaining node gets a score:
//
//   - worker nodes: +40
//   - arm64: rpi5 hardware +30; USB scratch ready +120, in "error" state
//     -200, any other state -80; nodes that have Longhorn disks in the
//     inventory -50
//   - amd64: the configured default flash host +30; "titan-24" -10
//   - the node equal to flashHost (when non-empty): +5
//   - -100 per active build pod and -15 per active remote pod on the node
//
// The highest score wins and ties are broken by ascending node name. An
// error is returned when no node qualifies.
func (a *App) selectBuilderHost(arch, flashHost string) (clusterNode, error) {
	nodes := clusterNodes()
	buildLoad := clusterActiveRemotePodLoads(a.settings.Namespace, "build")
	podLoad := clusterActiveRemotePodLoads(a.settings.Namespace, "")

	// Names of inventory nodes that carry Longhorn disks.
	longhornHosts := map[string]struct{}{}
	for _, inv := range a.inventory.Nodes {
		if len(inv.LonghornDisks) > 0 {
			longhornHosts[inv.Name] = struct{}{}
		}
	}

	type ranked struct {
		host   clusterNode
		points int
	}
	pool := make([]ranked, 0)

	for _, candidate := range nodes {
		eligible := candidate.Arch == arch && !candidate.Unschedulable && !candidate.ControlPlane
		if !eligible {
			continue
		}

		points := 0
		if candidate.Worker {
			points += 40
		}

		if arch == "arm64" {
			if candidate.Hardware == "rpi5" {
				points += 30
			}
			switch {
			case usbScratchReadyForWorkspace(candidate):
				points += 120
			case candidate.USBScratchStatus == "error":
				points -= 200
			default:
				points -= 80
			}
			if _, onStorage := longhornHosts[candidate.Name]; onStorage {
				points -= 50
			}
		} else if arch == "amd64" {
			if candidate.Name == a.settings.DefaultFlashHost {
				points += 30
			}
			if candidate.Name == "titan-24" {
				points -= 10
			}
		}

		if flashHost != "" && candidate.Name == flashHost {
			points += 5
		}
		if n := buildLoad[candidate.Name]; n > 0 {
			points -= 100 * n
		}
		if n := podLoad[candidate.Name]; n > 0 {
			points -= 15 * n
		}

		pool = append(pool, ranked{host: candidate, points: points})
	}

	if len(pool) == 0 {
		return clusterNode{}, fmt.Errorf("no build host available for arch %s", arch)
	}

	// Best score first; equal scores fall back to name order.
	sort.Slice(pool, func(i, j int) bool {
		left, right := pool[i], pool[j]
		if left.points == right.points {
			return left.host.Name < right.host.Name
		}
		return left.points > right.points
	})
	return pool[0].host, nil
}
// remoteDevicePodSpec builds the Pod manifest (as a generic map) for the
// "remote-devices" helper pod pinned to host.
//
// The single container runs `metis remote-devices` privileged as user 0 with
// the host's /dev, /sys (read-only) and /run/udev (read-only) mounted. The
// configured maximum device size and host temp directory are passed as
// command-line flags.
func (a *App) remoteDevicePodSpec(name, host, image string) map[string]any {
	metadata := map[string]any{
		"name":      name,
		"namespace": a.settings.Namespace,
		"labels":    map[string]string{"app": "metis-remote", "metis-run": "devices"},
	}

	command := []string{
		"metis", "remote-devices",
		"--max-device-bytes", fmt.Sprintf("%d", a.settings.MaxDeviceBytes),
		"--host-tmp-dir", hostTmpHostPath(a.settings.HostTmpDir),
	}

	mounts := []map[string]any{
		{"name": "host-dev", "mountPath": "/dev"},
		{"name": "host-sys", "mountPath": "/sys", "readOnly": true},
		{"name": "host-udev", "mountPath": "/run/udev", "readOnly": true},
	}

	volumes := []map[string]any{
		{"name": "host-dev", "hostPath": map[string]any{"path": "/dev"}},
		{"name": "host-sys", "hostPath": map[string]any{"path": "/sys"}},
		{"name": "host-udev", "hostPath": map[string]any{"path": "/run/udev"}},
	}

	container := map[string]any{
		"name":            "remote-devices",
		"image":           image,
		"imagePullPolicy": "Always",
		"command":         command,
		"securityContext": map[string]any{"privileged": true, "runAsUser": 0},
		"volumeMounts":    mounts,
	}

	spec := map[string]any{
		"restartPolicy":      "Never",
		"serviceAccountName": "metis",
		"nodeSelector": map[string]string{
			"kubernetes.io/hostname": host,
		},
		"containers":       []map[string]any{container},
		"imagePullSecrets": []map[string]string{{"name": "harbor-regcred"}},
		"volumes":          volumes,
	}

	return map[string]any{
		"apiVersion": "v1",
		"kind":       "Pod",
		"metadata":   metadata,
		"spec":       spec,
	}
}
// remoteBuildPodSpec builds the Pod manifest (as a generic map) for a
// "remote-build" worker pinned to host.
//
// The container runs the shell script produced by remoteWorkerEntrypoint
// (with SSH keys included) as user/group 0, reads its environment from the
// "metis" ConfigMap, and mounts a per-pod host directory at /workspace.
// Vault annotations including the SSH key secret are attached to the pod
// metadata.
func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag string) map[string]any {
	workspaceDir := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name)

	script := remoteWorkerEntrypoint(
		true,
		"remote-build",
		"--inventory", a.settings.InventoryPath,
		"--node", node,
		"--cache", "/workspace/cache",
		"--work-dir", "/workspace/build",
		"--artifact-ref", artifactRef,
		"--build-tag", buildTag,
		"--harbor-registry", a.settings.HarborRegistry,
	)

	container := map[string]any{
		"name":            "remote-build",
		"image":           image,
		"imagePullPolicy": "Always",
		"command":         []string{"/bin/sh", "-c"},
		"args":            []string{script},
		"securityContext": map[string]any{"runAsUser": 0, "runAsGroup": 0},
		"envFrom": []map[string]any{
			{"configMapRef": map[string]any{"name": "metis"}},
		},
		"volumeMounts": []map[string]any{
			{"name": "workspace", "mountPath": "/workspace"},
		},
	}

	metadata := map[string]any{
		"name":        name,
		"namespace":   a.settings.Namespace,
		"labels":      map[string]string{"app": "metis-remote", "metis-run": "build"},
		"annotations": vaultRuntimeAnnotations(true),
	}

	spec := map[string]any{
		"restartPolicy":      "Never",
		"serviceAccountName": "metis",
		"nodeSelector": map[string]string{
			"kubernetes.io/hostname": host,
		},
		"containers":       []map[string]any{container},
		"imagePullSecrets": []map[string]string{{"name": "harbor-regcred"}},
		"volumes": []map[string]any{
			{"name": "workspace", "hostPath": map[string]any{"path": workspaceDir, "type": "DirectoryOrCreate"}},
		},
	}

	return map[string]any{
		"apiVersion": "v1",
		"kind":       "Pod",
		"metadata":   metadata,
		"spec":       spec,
	}
}
// remoteFlashPodSpec builds the Pod manifest (as a generic map) for a
// "remote-flash" worker pinned to host.
//
// The container runs the shell script produced by remoteWorkerEntrypoint
// (without SSH keys) privileged as user 0 and reads its environment from the
// "metis" ConfigMap. It mounts a per-pod workspace directory at /workspace,
// the host's /dev, /sys (read-only) and /run/udev (read-only), and the
// configured host temp directory at /host-tmp.
func (a *App) remoteFlashPodSpec(name, host, image, node, device, artifactRef string) map[string]any {
	workspaceDir := remoteWorkspaceHostPath(a.settings.RemoteWorkspaceDir, name)
	hostTmpDir := hostTmpHostPath(a.settings.HostTmpDir)

	script := remoteWorkerEntrypoint(
		false,
		"remote-flash",
		"--node", node,
		"--device", device,
		"--artifact-ref", artifactRef,
		"--work-dir", "/workspace/flash",
		"--harbor-registry", a.settings.HarborRegistry,
		"--host-tmp-dir", mountedHostTmpDir(a.settings.HostTmpDir),
	)

	mounts := []map[string]any{
		{"name": "workspace", "mountPath": "/workspace"},
		{"name": "host-dev", "mountPath": "/dev"},
		{"name": "host-sys", "mountPath": "/sys", "readOnly": true},
		{"name": "host-udev", "mountPath": "/run/udev", "readOnly": true},
		{"name": "host-tmp", "mountPath": "/host-tmp"},
	}

	volumes := []map[string]any{
		{"name": "workspace", "hostPath": map[string]any{"path": workspaceDir, "type": "DirectoryOrCreate"}},
		{"name": "host-dev", "hostPath": map[string]any{"path": "/dev"}},
		{"name": "host-sys", "hostPath": map[string]any{"path": "/sys"}},
		{"name": "host-udev", "hostPath": map[string]any{"path": "/run/udev"}},
		{"name": "host-tmp", "hostPath": map[string]any{"path": hostTmpDir, "type": "DirectoryOrCreate"}},
	}

	container := map[string]any{
		"name":            "remote-flash",
		"image":           image,
		"imagePullPolicy": "Always",
		"command":         []string{"/bin/sh", "-c"},
		"args":            []string{script},
		"securityContext": map[string]any{"privileged": true, "runAsUser": 0},
		"envFrom": []map[string]any{
			{"configMapRef": map[string]any{"name": "metis"}},
		},
		"volumeMounts": mounts,
	}

	metadata := map[string]any{
		"name":        name,
		"namespace":   a.settings.Namespace,
		"labels":      map[string]string{"app": "metis-remote", "metis-run": "flash"},
		"annotations": vaultRuntimeAnnotations(false),
	}

	spec := map[string]any{
		"restartPolicy":      "Never",
		"serviceAccountName": "metis",
		"nodeSelector": map[string]string{
			"kubernetes.io/hostname": host,
		},
		"containers":       []map[string]any{container},
		"imagePullSecrets": []map[string]string{{"name": "harbor-regcred"}},
		"volumes":          volumes,
	}

	return map[string]any{
		"apiVersion": "v1",
		"kind":       "Pod",
		"metadata":   metadata,
		"spec":       spec,
	}
}
// remoteArtifactNote returns the artifact reference to show for node: the
// recorded Ref from a.artifacts() when one exists and is non-blank,
// otherwise the node's artifact repository with the ":latest" tag.
func (a *App) remoteArtifactNote(node string) string {
	summary, found := a.artifacts()[node]
	if !found || strings.TrimSpace(summary.Ref) == "" {
		return a.artifactRepo(node) + ":latest"
	}
	return summary.Ref
}
// inventoryNodeArch returns the architecture declared by class, trimmed of
// surrounding whitespace, or "arm64" when class is nil or declares none.
// spec is accepted but not consulted.
func inventoryNodeArch(spec *inventory.NodeSpec, class *inventory.NodeClass) string {
	const fallbackArch = "arm64"

	if class == nil {
		return fallbackArch
	}
	if arch := strings.TrimSpace(class.Arch); arch != "" {
		return arch
	}
	return fallbackArch
}
// mountedHostTmpDir returns the fixed in-container path "/host-tmp".
// The path argument is ignored.
func mountedHostTmpDir(path string) string {
	const containerMountPoint = "/host-tmp"
	return containerMountPoint
}
// vaultRuntimeAnnotations returns the Vault agent-injector pod annotations
// used by the remote worker pods.
//
// The result always requests injection in pre-populate-only mode under the
// maintenance role, together with templates that render the runtime and
// Harbor secrets as shell `export` lines. When includeSSHKeys is true, a
// third secret/template pair exporting the SSH public keys is added.
func vaultRuntimeAnnotations(includeSSHKeys bool) map[string]string {
	const (
		runtimeTemplate = `{{ with secret "kv/data/atlas/maintenance/metis-runtime" }}
export METIS_K3S_TOKEN="{{ .Data.data.k3s_token }}"
{{ end }}`
		harborTemplate = `{{ with secret "kv/data/atlas/harbor/harbor-core" }}
export METIS_HARBOR_PASSWORD="{{ .Data.data.harbor_admin_password }}"
{{ end }}`
		sshTemplate = `{{ with secret "kv/data/atlas/maintenance/metis-ssh-keys" }}
export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
{{ end }}`
	)

	result := map[string]string{
		"vault.hashicorp.com/agent-inject":                               "true",
		"vault.hashicorp.com/agent-pre-populate-only":                    "true",
		"vault.hashicorp.com/role":                                       vaultRoleMaintenance,
		"vault.hashicorp.com/agent-inject-secret-metis-runtime-env.sh":   vaultRuntimeSecretPath,
		"vault.hashicorp.com/agent-inject-template-metis-runtime-env.sh": runtimeTemplate,
		"vault.hashicorp.com/agent-inject-secret-metis-harbor-env.sh":    vaultHarborSecretPath,
		"vault.hashicorp.com/agent-inject-template-metis-harbor-env.sh":  harborTemplate,
	}
	if !includeSSHKeys {
		return result
	}

	result["vault.hashicorp.com/agent-inject-secret-metis-ssh-env.sh"] = vaultSSHKeysSecretPath
	result["vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh"] = sshTemplate
	return result
}
// remoteWorkerEntrypoint renders the shell script a remote worker container
// runs.
//
// The script enables `set -e`, sources the runtime and Harbor env files from
// /vault/secrets (plus the SSH env file when includeSSHKeys is true), and
// finally `exec`s `metis` with args, each argument shell-quoted via
// shellJoin. Lines are joined with "\n".
func remoteWorkerEntrypoint(includeSSHKeys bool, args ...string) string {
	script := []string{
		"set -e",
		". /vault/secrets/metis-runtime-env.sh",
		". /vault/secrets/metis-harbor-env.sh",
	}
	if includeSSHKeys {
		script = append(script, ". /vault/secrets/metis-ssh-env.sh")
	}

	// Full argv for the exec line: the binary name followed by its arguments.
	argv := make([]string, 0, len(args)+1)
	argv = append(argv, "metis")
	argv = append(argv, args...)

	script = append(script, "exec "+shellJoin(argv...))
	return strings.Join(script, "\n")
}
// shellJoin quotes every argument with shellQuote and joins the results
// with single spaces. No arguments yields the empty string.
func shellJoin(args ...string) string {
	var out strings.Builder
	for i, arg := range args {
		if i > 0 {
			out.WriteByte(' ')
		}
		out.WriteString(shellQuote(arg))
	}
	return out.String()
}
// shellQuote wraps value in single quotes for use as one POSIX shell word.
//
// Each embedded single quote is rewritten as '"'"' (close the quoted run,
// emit a double-quoted single quote, reopen). The empty string becomes ''.
func shellQuote(value string) string {
	const escapedQuote = `'"'"'`

	// An empty value naturally produces '' here, so no special case is needed.
	body := strings.ReplaceAll(value, "'", escapedQuote)
	return "'" + body + "'"
}