package service

import (
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"time"

	"metis/pkg/facts"
	"metis/pkg/inventory"
	"metis/pkg/sentinel"
)

// JobStatus identifies the current lifecycle state of a queued job because
// the UI and metrics need a stable shared vocabulary for progress updates.
type JobStatus string

// Job lifecycle states. Job.Status is encoded to JSON as one of these string
// values, so renaming a value changes the API payload.
const (
	JobQueued  JobStatus = "queued"
	JobRunning JobStatus = "running"
	JobDone    JobStatus = "done"
	JobError   JobStatus = "error"
)

// Device describes a flashable block device.
//
// Model, Transport, Type and Note are left out of the JSON form when empty;
// every other field is always emitted.
// NOTE(review): the field set looks like it mirrors lsblk columns — confirm
// against the code that fills it in.
type Device struct {
	Name      string `json:"name"`
	Path      string `json:"path"`
	Model     string `json:"model,omitempty"`
	Transport string `json:"transport,omitempty"`
	Type      string `json:"type,omitempty"`
	Note      string `json:"note,omitempty"`
	Removable bool   `json:"removable"`
	Hotplug   bool   `json:"hotplug"`
	SizeBytes int64  `json:"size_bytes"`
}

// Job is a long-running Metis action visible in the UI.
//
// Written and Total are exposed in JSON as written_bytes and total_bytes.
// ProgressPct is always emitted, even when zero.
// NOTE(review): ProgressPct is presumably a 0-100 percentage — confirm with
// the code that updates it.
//
// NOTE(review): encoding/json's omitempty option has no effect on struct
// values, so StageStartedAt, UpdatedAt and FinishedAt are still emitted (as
// the zero time) when unset, despite their tags.
type Job struct {
	ID             string    `json:"id"`
	Kind           string    `json:"kind"`
	Node           string    `json:"node,omitempty"`
	Host           string    `json:"host,omitempty"`
	Builder        string    `json:"builder,omitempty"`
	Device         string    `json:"device,omitempty"`
	Status         JobStatus `json:"status"`
	Stage          string    `json:"stage,omitempty"`
	StageStartedAt time.Time `json:"stage_started_at,omitempty"`
	Message        string    `json:"message,omitempty"`
	Artifact       string    `json:"artifact,omitempty"`
	ProgressPct    float64   `json:"progress_pct"`
	Written        int64     `json:"written_bytes,omitempty"`
	Total          int64     `json:"total_bytes,omitempty"`
	Error          string    `json:"error,omitempty"`
	StartedAt      time.Time `json:"started_at"`
	UpdatedAt      time.Time `json:"updated_at,omitempty"`
	FinishedAt     time.Time `json:"finished_at,omitempty"`
}

// Event is a user-facing activity item for recent changes and runs.
//
// Kind is a dotted identifier in the callers visible in this file (for
// example "sentinel.snapshot" and "sentinel.watch"). Details carries optional
// free-form data and is left out of the JSON form when empty.
type Event struct {
	Time    time.Time      `json:"time"`
	Kind    string         `json:"kind"`
	Summary string         `json:"summary"`
	Details map[string]any `json:"details,omitempty"`
}

// SnapshotRecord stores the last fact snapshot pushed by a node sentinel.
// StoreSnapshot fills in Node from Snapshot.Hostname and CollectedAt from the
// current UTC time when the sender leaves them empty.
type SnapshotRecord struct {
	Node        string            `json:"node"`
	CollectedAt time.Time         `json:"collected_at"`
	Snapshot    sentinel.Snapshot `json:"snapshot"`
}

// PageState is the UI/API view model.
//
// App.State assembles it. PreferredDevice and DeviceError are the only fields
// left out of the JSON form when empty.
type PageState struct {
	LocalHost        string                     `json:"local_host"`
	DefaultFlashHost string                     `json:"default_flash_host"`
	SelectedHost     string                     `json:"selected_host"`
	FlashHosts       []string                   `json:"flash_hosts"`
	Nodes            []inventory.NodeSpec       `json:"nodes"`
	Jobs             []*Job                     `json:"jobs"`
	Devices          []Device                   `json:"devices"`
	PreferredDevice  string                     `json:"preferred_device,omitempty"`
	DeviceError      string                     `json:"device_error,omitempty"`
	Events           []Event                    `json:"events"`
	Snapshots        []SnapshotRecord           `json:"snapshots"`
	Targets          map[string]facts.Targets   `json:"targets"`
	Artifacts        map[string]ArtifactSummary `json:"artifacts"`
}

// ArtifactSummary describes the latest built image for a node.
//
// UpdatedAt and SizeBytes are always emitted in JSON; the remaining fields
// are left out when empty (or false, for Compressed).
type ArtifactSummary struct {
	Node        string    `json:"node,omitempty"`
	Ref         string    `json:"ref,omitempty"`
	BuildTag    string    `json:"build_tag,omitempty"`
	LocalPath   string    `json:"local_path,omitempty"`
	HostPath    string    `json:"host_path,omitempty"`
	BuilderHost string    `json:"builder_host,omitempty"`
	Compressed  bool      `json:"compressed,omitempty"`
	UpdatedAt   time.Time `json:"updated_at"`
	SizeBytes   int64     `json:"size_bytes"`
}

// deviceSnapshot is the value type of App.deviceStore: a device list, an
// error string and the time of the check.
// NOTE(review): presumably one cached listing per flash host, keyed by host
// name — confirm against cachedDevices.
type deviceSnapshot struct {
	Devices   []Device
	Err       string
	CheckedAt time.Time
}

// App coordinates builds, flashes, sentinel snapshots, and the web UI state.
//
// In the methods visible in this file, mu is held while reading or writing
// jobs, snapshots and targets.
// NOTE(review): artifactStore and deviceStore are presumably guarded by mu as
// well — confirm in the helpers that touch them.
// App contains a mutex and must not be copied; use *App.
type App struct {
	settings      Settings
	inventory     *inventory.Inventory
	metrics       *Metrics
	mu            sync.RWMutex
	jobs          map[string]*Job
	snapshots     map[string]SnapshotRecord
	targets       map[string]facts.Targets
	artifactStore map[string]ArtifactSummary
	deviceStore   map[string]deviceSnapshot
}

// NewApp creates a Metis service app instance.
func NewApp(settings Settings) (*App, error) { if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil { return nil, err } if err := os.MkdirAll(settings.ArtifactDir, 0o755); err != nil { return nil, err } if err := os.MkdirAll(filepath.Dir(settings.HistoryPath), 0o755); err != nil { return nil, err } inv, err := inventory.Load(settings.InventoryPath) if err != nil { return nil, err } app := &App{ settings: settings, inventory: inv, metrics: NewMetrics(), jobs: map[string]*Job{}, snapshots: map[string]SnapshotRecord{}, targets: map[string]facts.Targets{}, artifactStore: map[string]ArtifactSummary{}, deviceStore: map[string]deviceSnapshot{}, } _ = app.loadSnapshots() _ = app.loadTargets() _ = app.loadArtifacts() return app, nil } // State returns the current UI/API snapshot. func (a *App) State(deviceHost string) PageState { if strings.TrimSpace(deviceHost) == "" { deviceHost = a.settings.DefaultFlashHost } a.mu.RLock() jobs := make([]*Job, 0, len(a.jobs)) for _, job := range a.jobs { copyJob := *job jobs = append(jobs, ©Job) } sort.Slice(jobs, func(i, j int) bool { return jobs[i].StartedAt.After(jobs[j].StartedAt) }) snaps := make([]SnapshotRecord, 0, len(a.snapshots)) for _, snap := range a.snapshots { snaps = append(snaps, snap) } aTargets := map[string]facts.Targets{} for key, value := range a.targets { aTargets[key] = value } a.mu.RUnlock() sort.Slice(snaps, func(i, j int) bool { return snaps[i].Node < snaps[j].Node }) flashHosts := a.flashHosts() devices, deviceErr := a.cachedDevices(deviceHost) preferredDevice := preferredDevice(devices) return PageState{ LocalHost: a.settings.LocalHost, DefaultFlashHost: a.settings.DefaultFlashHost, SelectedHost: deviceHost, FlashHosts: flashHosts, Nodes: a.replacementNodes(), Jobs: jobs, Devices: devices, PreferredDevice: preferredDevice, DeviceError: errorString(deviceErr), Events: a.recentEvents(40), Snapshots: snaps, Targets: aTargets, Artifacts: a.artifacts(), } } // Build starts a background image build for a node. 
func (a *App) Build(node string) (*Job, error) { if err := a.ensureReplacementReady(node); err != nil { return nil, err } if active := a.activeJobForNode(node); active != nil { return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID} } job, err := a.reserveJob("build", node, "", "") if err != nil { return nil, err } go a.runBuild(job, false) return job, nil } // Replace starts a background build+flash workflow for a node. func (a *App) Replace(node, host, device string) (*Job, error) { if host == "" { host = a.settings.DefaultFlashHost } if err := a.ensureReplacementReady(node); err != nil { return nil, err } if active := a.activeJobForNode(node); active != nil { return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID} } if _, err := a.ensureDevice(host, device); err != nil { return nil, err } job, err := a.reserveJob("replace", node, host, device) if err != nil { return nil, err } go a.runBuild(job, true) return job, nil } // StoreSnapshot records a pushed sentinel snapshot. func (a *App) StoreSnapshot(record SnapshotRecord) error { if record.Node == "" { record.Node = record.Snapshot.Hostname } if record.CollectedAt.IsZero() { record.CollectedAt = time.Now().UTC() } if strings.TrimSpace(record.Node) == "" { return fmt.Errorf("snapshot node required") } a.mu.Lock() a.snapshots[record.Node] = record a.mu.Unlock() if err := a.persistSnapshots(); err != nil { return err } a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt) if err := a.syncScratchAnnotations(record); err != nil { a.appendEvent(annotationSyncEvent(record.Node, err)) } a.appendEvent(Event{ Time: record.CollectedAt, Kind: "sentinel.snapshot", Summary: fmt.Sprintf("Captured sentinel snapshot for %s", record.Node), Details: map[string]any{ "node": record.Node, "kernel": record.Snapshot.Kernel, "k3s_version": record.Snapshot.K3sVersion, }, }) return nil } // WatchSentinel recomputes class targets and logs meaningful drift. 
func (a *App) WatchSentinel() (*Event, error) { a.mu.RLock() snaps := make([]facts.Snapshot, 0, len(a.snapshots)) for _, snap := range a.snapshots { snaps = append(snaps, facts.Snapshot{ Hostname: snap.Node, Kernel: snap.Snapshot.Kernel, OSImage: snap.Snapshot.OSImage, K3sVersion: firstLine(snap.Snapshot.K3sVersion), Containerd: firstLine(snap.Snapshot.Containerd), PackageSample: snap.Snapshot.PackageSample, DropInsSample: snap.Snapshot.DropInsSample, USBScratch: snap.Snapshot.USBScratch, }) } prevTargets := map[string]facts.Targets{} for key, value := range a.targets { prevTargets[key] = value } a.mu.RUnlock() nextTargets := facts.RecommendTargets(a.inventory, snaps) changes := diffTargets(prevTargets, nextTargets) a.mu.Lock() a.targets = nextTargets a.mu.Unlock() if err := a.persistTargets(); err != nil { return nil, err } event := &Event{ Time: time.Now().UTC(), Kind: "sentinel.watch", Summary: "Metis sentinel watch completed with no template changes", Details: map[string]any{ "classes": len(nextTargets), "changes": 0, }, } if len(changes) > 0 { event.Summary = fmt.Sprintf("Metis sentinel watch detected %d template change(s)", len(changes)) event.Details["changes"] = changes } a.appendEvent(*event) a.metrics.RecordWatch("ok") a.metrics.SetDriftTargets(nextTargets, len(changes)) return event, nil }