368 lines
11 KiB
Go
368 lines
11 KiB
Go
package service
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"metis/pkg/facts"
|
|
"metis/pkg/inventory"
|
|
"metis/pkg/sentinel"
|
|
)
|
|
|
|
// JobStatus identifies the current lifecycle state of a queued job because
// the UI and metrics need a stable shared vocabulary for progress updates.
type JobStatus string

const (
	// JobQueued marks a job that has been reserved but not yet started.
	JobQueued JobStatus = "queued"
	// JobRunning marks a job whose background workflow is executing.
	JobRunning JobStatus = "running"
	// JobDone marks a job that finished successfully.
	JobDone JobStatus = "done"
	// JobError marks a job that finished with a failure.
	JobError JobStatus = "error"
)
|
|
|
|
// Device describes a flashable block device.
//
// Instances are produced per flash host and cached in App.deviceStore;
// the JSON form is served to the UI as part of PageState.Devices.
type Device struct {
	Name      string `json:"name"`                // short device name (e.g. the kernel block-device name)
	Path      string `json:"path"`                // device path used when flashing
	Model     string `json:"model,omitempty"`     // hardware model string, when the host reports one
	Transport string `json:"transport,omitempty"` // bus/transport identifier — presumably usb/sata/nvme; confirm against the device lister
	Type      string `json:"type,omitempty"`      // device type as reported by the host
	Note      string `json:"note,omitempty"`      // free-form annotation surfaced in the UI
	Removable bool   `json:"removable"`           // host-reported removable flag
	Hotplug   bool   `json:"hotplug"`             // host-reported hotplug flag
	SizeBytes int64  `json:"size_bytes"`          // total capacity in bytes
}
|
|
|
|
// Job is a long-running Metis action visible in the UI.
//
// Jobs are created by Build, Flash, and Replace (Kind "build", "flash",
// "replace") and progress through the JobStatus lifecycle while a
// background goroutine runs the workflow.
type Job struct {
	ID      string `json:"id"`
	Kind    string `json:"kind"`              // workflow kind: "build", "flash", or "replace"
	Node    string `json:"node,omitempty"`    // inventory node the job operates on
	Host    string `json:"host,omitempty"`    // flash host, for flash/replace jobs
	Builder string `json:"builder,omitempty"` // host performing the build — TODO confirm against runBuild
	Device  string `json:"device,omitempty"`  // target block device, for flash/replace jobs

	Status         JobStatus `json:"status"`
	Stage          string    `json:"stage,omitempty"` // current named stage within the workflow
	StageStartedAt time.Time `json:"stage_started_at,omitempty"`
	Message        string    `json:"message,omitempty"`  // human-readable progress detail
	Artifact       string    `json:"artifact,omitempty"` // artifact reference produced/consumed by the job

	// Flash progress accounting; ProgressPct is derived for the UI.
	ProgressPct float64 `json:"progress_pct"`
	Written     int64   `json:"written_bytes,omitempty"`
	Total       int64   `json:"total_bytes,omitempty"`

	Error string `json:"error,omitempty"` // failure detail when Status is JobError

	StartedAt  time.Time `json:"started_at"`
	UpdatedAt  time.Time `json:"updated_at,omitempty"`
	FinishedAt time.Time `json:"finished_at,omitempty"`
}
|
|
|
|
// Event is a user-facing activity item for recent changes and runs.
//
// Events are appended via App.appendEvent and served in PageState.Events.
type Event struct {
	Time    time.Time      `json:"time"`
	Kind    string         `json:"kind"`              // dotted event category, e.g. "sentinel.snapshot", "sentinel.watch"
	Summary string         `json:"summary"`           // one-line human-readable description
	Details map[string]any `json:"details,omitempty"` // structured context specific to the Kind
}
|
|
|
|
// SnapshotRecord stores the last fact snapshot pushed by a node sentinel.
//
// Records are keyed by Node in App.snapshots and persisted to
// settings.SnapshotsPath; StoreSnapshot fills in Node and CollectedAt
// when the sender omits them.
type SnapshotRecord struct {
	Node        string            `json:"node"`         // node name; defaults to Snapshot.Hostname when empty
	CollectedAt time.Time         `json:"collected_at"` // defaults to the receive time (UTC) when zero
	Snapshot    sentinel.Snapshot `json:"snapshot"`     // raw facts pushed by the node's sentinel
}
|
|
|
|
// PageState is the UI/API view model.
//
// Built by App.State as a point-in-time copy of the app's mutable state,
// so the caller can serialize it without further locking.
type PageState struct {
	LocalHost        string   `json:"local_host"`
	DefaultFlashHost string   `json:"default_flash_host"`
	SelectedHost     string   `json:"selected_host"` // flash host whose devices are shown
	FlashHosts       []string `json:"flash_hosts"`   // all hosts that can be flashed from

	Nodes []inventory.NodeSpec `json:"nodes"` // nodes eligible for replacement

	Jobs []*Job `json:"jobs"` // copies, newest first

	// Device scan results for SelectedHost, plus the suggested target.
	Devices         []Device `json:"devices"`
	PreferredDevice string   `json:"preferred_device,omitempty"`
	DeviceError     string   `json:"device_error,omitempty"` // non-empty when the device scan failed

	Events    []Event                    `json:"events"`    // recent activity feed
	Snapshots []SnapshotRecord           `json:"snapshots"` // latest sentinel snapshot per node, sorted by node
	Targets   map[string]facts.Targets   `json:"targets"`   // recommended targets per class
	Artifacts map[string]ArtifactSummary `json:"artifacts"` // latest built image per node
}
|
|
|
|
// ArtifactSummary describes the latest built image for a node.
//
// Summaries are kept in App.artifactStore and consulted by Flash to
// verify a published image exists before starting a flash-only job.
type ArtifactSummary struct {
	Node        string    `json:"node,omitempty"`
	Ref         string    `json:"ref,omitempty"`          // published image reference; empty means no usable image
	BuildTag    string    `json:"build_tag,omitempty"`    // tag identifying the build that produced the image
	LocalPath   string    `json:"local_path,omitempty"`   // image path on the Metis service host
	HostPath    string    `json:"host_path,omitempty"`    // image path on the builder/flash host — TODO confirm which
	BuilderHost string    `json:"builder_host,omitempty"` // host that performed the build
	Compressed  bool      `json:"compressed,omitempty"`   // whether the stored image is compressed
	UpdatedAt   time.Time `json:"updated_at"`
	SizeBytes   int64     `json:"size_bytes"`
}
|
|
|
|
// deviceSnapshot caches the result of one block-device scan for a flash
// host so State does not have to re-scan on every request.
type deviceSnapshot struct {
	Devices   []Device  // devices found in the scan
	Err       string    // scan error, if any (empty on success)
	CheckedAt time.Time // when the scan ran; used to decide cache freshness — TODO confirm against cachedDevices
}
|
|
|
|
// App coordinates builds, flashes, sentinel snapshots, and the web UI state.
//
// Construct with NewApp. The settings, inventory, and metrics fields are
// assigned once in NewApp; the map fields are shared mutable state and
// must only be accessed while holding mu.
type App struct {
	settings  Settings
	inventory *inventory.Inventory // loaded from settings.InventoryPath
	metrics   *Metrics

	// mu guards every map below; take RLock for reads, Lock for writes.
	mu              sync.RWMutex
	jobs            map[string]*Job                // presumably keyed by Job.ID — confirm against reserveJob
	snapshots       map[string]SnapshotRecord      // keyed by node name
	targets         map[string]facts.Targets       // keyed by class; rebuilt by WatchSentinel
	artifactStore   map[string]ArtifactSummary     // keyed by node name
	deviceStore     map[string]deviceSnapshot      // keyed by flash host
	desiredMetadata map[string]DesiredNodeMetadata // keyed by node name — TODO confirm
}
|
|
|
|
// NewApp creates a Metis service app instance.
|
|
func NewApp(settings Settings) (*App, error) {
|
|
if strings.TrimSpace(settings.DesiredMetadataPath) == "" {
|
|
baseDir := filepath.Dir(settings.SnapshotsPath)
|
|
if strings.TrimSpace(baseDir) == "" || baseDir == "." {
|
|
baseDir = filepath.Dir(settings.HistoryPath)
|
|
}
|
|
settings.DesiredMetadataPath = filepath.Join(baseDir, "desired-node-metadata.json")
|
|
}
|
|
if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := os.MkdirAll(settings.ArtifactDir, 0o755); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(settings.HistoryPath), 0o755); err != nil {
|
|
return nil, err
|
|
}
|
|
inv, err := inventory.Load(settings.InventoryPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
app := &App{
|
|
settings: settings,
|
|
inventory: inv,
|
|
metrics: NewMetrics(),
|
|
jobs: map[string]*Job{},
|
|
snapshots: map[string]SnapshotRecord{},
|
|
targets: map[string]facts.Targets{},
|
|
artifactStore: map[string]ArtifactSummary{},
|
|
deviceStore: map[string]deviceSnapshot{},
|
|
desiredMetadata: map[string]DesiredNodeMetadata{},
|
|
}
|
|
_ = app.loadSnapshots()
|
|
_ = app.loadTargets()
|
|
_ = app.loadArtifacts()
|
|
_ = app.loadDesiredNodeMetadata()
|
|
return app, nil
|
|
}
|
|
|
|
// State returns the current UI/API snapshot.
|
|
func (a *App) State(deviceHost string) PageState {
|
|
if strings.TrimSpace(deviceHost) == "" {
|
|
deviceHost = a.settings.DefaultFlashHost
|
|
}
|
|
a.mu.RLock()
|
|
jobs := make([]*Job, 0, len(a.jobs))
|
|
for _, job := range a.jobs {
|
|
copyJob := *job
|
|
jobs = append(jobs, ©Job)
|
|
}
|
|
sort.Slice(jobs, func(i, j int) bool {
|
|
return jobs[i].StartedAt.After(jobs[j].StartedAt)
|
|
})
|
|
|
|
snaps := make([]SnapshotRecord, 0, len(a.snapshots))
|
|
for _, snap := range a.snapshots {
|
|
snaps = append(snaps, snap)
|
|
}
|
|
aTargets := map[string]facts.Targets{}
|
|
for key, value := range a.targets {
|
|
aTargets[key] = value
|
|
}
|
|
a.mu.RUnlock()
|
|
|
|
sort.Slice(snaps, func(i, j int) bool {
|
|
return snaps[i].Node < snaps[j].Node
|
|
})
|
|
|
|
flashHosts := a.flashHosts()
|
|
devices, deviceErr := a.cachedDevices(deviceHost)
|
|
preferredDevice := preferredDevice(devices)
|
|
return PageState{
|
|
LocalHost: a.settings.LocalHost,
|
|
DefaultFlashHost: a.settings.DefaultFlashHost,
|
|
SelectedHost: deviceHost,
|
|
FlashHosts: flashHosts,
|
|
Nodes: a.replacementNodes(),
|
|
Jobs: jobs,
|
|
Devices: devices,
|
|
PreferredDevice: preferredDevice,
|
|
DeviceError: errorString(deviceErr),
|
|
Events: a.recentEvents(40),
|
|
Snapshots: snaps,
|
|
Targets: aTargets,
|
|
Artifacts: a.artifacts(),
|
|
}
|
|
}
|
|
|
|
// Build starts a background image build for a node.
|
|
func (a *App) Build(node string) (*Job, error) {
|
|
if err := a.ensureReplacementReady(node); err != nil {
|
|
return nil, err
|
|
}
|
|
if active := a.activeJobForNode(node); active != nil {
|
|
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
|
|
}
|
|
job, err := a.reserveJob("build", node, "", "")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
go a.runBuild(job, false)
|
|
return job, nil
|
|
}
|
|
|
|
// Flash starts a background flash-only workflow for the latest published node image.
|
|
func (a *App) Flash(node, host, device string) (*Job, error) {
|
|
if host == "" {
|
|
host = a.settings.DefaultFlashHost
|
|
}
|
|
if err := a.ensureReplacementReady(node); err != nil {
|
|
return nil, err
|
|
}
|
|
if active := a.activeJobForNode(node); active != nil {
|
|
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
|
|
}
|
|
if summary, ok := a.artifacts()[node]; !ok || strings.TrimSpace(summary.Ref) == "" {
|
|
return nil, fmt.Errorf("no published image recorded for %s yet; run a build first", node)
|
|
}
|
|
if _, err := a.ensureDevice(host, device); err != nil {
|
|
return nil, err
|
|
}
|
|
job, err := a.reserveJob("flash", node, host, device)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
go a.runFlash(job)
|
|
return job, nil
|
|
}
|
|
|
|
// Replace starts a background build+flash workflow for a node.
|
|
func (a *App) Replace(node, host, device string) (*Job, error) {
|
|
if host == "" {
|
|
host = a.settings.DefaultFlashHost
|
|
}
|
|
if err := a.ensureReplacementReady(node); err != nil {
|
|
return nil, err
|
|
}
|
|
if active := a.activeJobForNode(node); active != nil {
|
|
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
|
|
}
|
|
if _, err := a.ensureDevice(host, device); err != nil {
|
|
return nil, err
|
|
}
|
|
job, err := a.reserveJob("replace", node, host, device)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
go a.runBuild(job, true)
|
|
return job, nil
|
|
}
|
|
|
|
// StoreSnapshot records a pushed sentinel snapshot.
|
|
func (a *App) StoreSnapshot(record SnapshotRecord) error {
|
|
if record.Node == "" {
|
|
record.Node = record.Snapshot.Hostname
|
|
}
|
|
if record.CollectedAt.IsZero() {
|
|
record.CollectedAt = time.Now().UTC()
|
|
}
|
|
if strings.TrimSpace(record.Node) == "" {
|
|
return fmt.Errorf("snapshot node required")
|
|
}
|
|
a.mu.Lock()
|
|
a.snapshots[record.Node] = record
|
|
a.mu.Unlock()
|
|
if err := a.persistSnapshots(); err != nil {
|
|
return err
|
|
}
|
|
a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt)
|
|
if err := a.syncScratchAnnotations(record); err != nil {
|
|
a.appendEvent(annotationSyncEvent(record.Node, err))
|
|
}
|
|
if err := a.syncDesiredNodeMetadata(record); err != nil {
|
|
a.appendEvent(desiredNodeMetadataSyncEvent(record.Node, err))
|
|
}
|
|
a.appendEvent(Event{
|
|
Time: record.CollectedAt,
|
|
Kind: "sentinel.snapshot",
|
|
Summary: fmt.Sprintf("Captured sentinel snapshot for %s", record.Node),
|
|
Details: map[string]any{
|
|
"node": record.Node,
|
|
"kernel": record.Snapshot.Kernel,
|
|
"k3s_version": record.Snapshot.K3sVersion,
|
|
},
|
|
})
|
|
return nil
|
|
}
|
|
|
|
// WatchSentinel recomputes class targets and logs meaningful drift.
|
|
func (a *App) WatchSentinel() (*Event, error) {
|
|
a.mu.RLock()
|
|
snaps := make([]facts.Snapshot, 0, len(a.snapshots))
|
|
for _, snap := range a.snapshots {
|
|
snaps = append(snaps, facts.Snapshot{
|
|
Hostname: snap.Node,
|
|
Kernel: snap.Snapshot.Kernel,
|
|
OSImage: snap.Snapshot.OSImage,
|
|
K3sVersion: firstLine(snap.Snapshot.K3sVersion),
|
|
Containerd: firstLine(snap.Snapshot.Containerd),
|
|
PackageSample: snap.Snapshot.PackageSample,
|
|
DropInsSample: snap.Snapshot.DropInsSample,
|
|
USBScratch: snap.Snapshot.USBScratch,
|
|
})
|
|
}
|
|
prevTargets := map[string]facts.Targets{}
|
|
for key, value := range a.targets {
|
|
prevTargets[key] = value
|
|
}
|
|
a.mu.RUnlock()
|
|
|
|
nextTargets := facts.RecommendTargets(a.inventory, snaps)
|
|
changes := diffTargets(prevTargets, nextTargets)
|
|
|
|
a.mu.Lock()
|
|
a.targets = nextTargets
|
|
a.mu.Unlock()
|
|
if err := a.persistTargets(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
event := &Event{
|
|
Time: time.Now().UTC(),
|
|
Kind: "sentinel.watch",
|
|
Summary: "Metis sentinel watch completed with no template changes",
|
|
Details: map[string]any{
|
|
"classes": len(nextTargets),
|
|
"changes": 0,
|
|
},
|
|
}
|
|
if len(changes) > 0 {
|
|
event.Summary = fmt.Sprintf("Metis sentinel watch detected %d template change(s)", len(changes))
|
|
event.Details["changes"] = changes
|
|
}
|
|
a.appendEvent(*event)
|
|
a.metrics.RecordWatch("ok")
|
|
a.metrics.SetDriftTargets(nextTargets, len(changes))
|
|
return event, nil
|
|
}
|