// metis/pkg/service/app.go — 314 lines, 9.4 KiB, Go.
// (Repository viewer header; its "Raw", "Normal View" and "History" links are not part of the source.)

package service
import (
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"metis/pkg/facts"
"metis/pkg/inventory"
"metis/pkg/sentinel"
)
// JobStatus identifies the current lifecycle state of a queued job because
// the UI and metrics need a stable shared vocabulary for progress updates.
// It is a plain string type, so a status appears in JSON as one of the
// literal values declared in the constant block that follows.
type JobStatus string
// Known JobStatus values. The literals are part of the JSON output of Job
// (field "status"), so changing them changes what API/UI consumers see.
const (
// JobQueued is the "queued" status; presumably a job that was created but
// has not started work yet — confirm against the job runner.
JobQueued JobStatus = "queued"
// JobRunning is the "running" status.
JobRunning JobStatus = "running"
// JobDone is the "done" status.
JobDone JobStatus = "done"
// JobError is the "error" status; presumably paired with Job.Error — confirm.
JobError JobStatus = "error"
)
// Device describes a flashable block device.
//
// The code that fills these fields is not in this part of the file, so the
// per-field notes below are inferred from names and JSON tags and should be
// confirmed against the device lister.
type Device struct {
// Name is always serialized (JSON "name").
Name string `json:"name"`
// Path is always serialized (JSON "path"); presumably the device node
// path used when flashing — TODO confirm.
Path string `json:"path"`
// Model, Transport, Type and Note are optional descriptive strings and are
// omitted from JSON when empty.
Model string `json:"model,omitempty"`
Transport string `json:"transport,omitempty"`
Type string `json:"type,omitempty"`
Note string `json:"note,omitempty"`
// Removable and Hotplug have no omitempty, so false is serialized too.
Removable bool `json:"removable"`
Hotplug bool `json:"hotplug"`
// SizeBytes is always serialized, including 0.
SizeBytes int64 `json:"size_bytes"`
}
// Job is a long-running Metis action visible in the UI.
//
// Jobs are created by Build and Replace in this file via newJob with the
// kinds "build" and "replace". State hands out copies of each Job rather than
// the stored pointers.
//
// NOTE(review): encoding/json's omitempty never omits struct values, so the
// omitempty option on the time.Time fields below (StageStartedAt, UpdatedAt,
// FinishedAt) has no effect: a zero time is still written as
// "0001-01-01T00:00:00Z". Changing this would alter the JSON output, so it is
// only flagged here.
type Job struct {
// ID is always serialized; App.jobs is a map[string]*Job, presumably keyed
// by this value — confirm in newJob.
ID string `json:"id"`
// Kind is always serialized; presumably "build" or "replace" as passed to
// newJob by Build and Replace — confirm in newJob.
Kind string `json:"kind"`
// Node, Host, Builder and Device are omitted from JSON when empty.
Node string `json:"node,omitempty"`
Host string `json:"host,omitempty"`
Builder string `json:"builder,omitempty"`
Device string `json:"device,omitempty"`
// Status is the lifecycle state; always serialized.
Status JobStatus `json:"status"`
// Stage and Message are free-form progress strings, omitted when empty.
Stage string `json:"stage,omitempty"`
// StageStartedAt is serialized even when zero (see the note on Job).
StageStartedAt time.Time `json:"stage_started_at,omitempty"`
Message string `json:"message,omitempty"`
// Artifact is omitted when empty; presumably a reference to the built
// image — TODO confirm.
Artifact string `json:"artifact,omitempty"`
// ProgressPct is always serialized, including 0.
// NOTE(review): the scale (0-1 vs 0-100) is not visible here — confirm.
ProgressPct float64 `json:"progress_pct"`
// Written and Total are serialized as "written_bytes"/"total_bytes" and
// omitted when 0.
Written int64 `json:"written_bytes,omitempty"`
Total int64 `json:"total_bytes,omitempty"`
// Error is omitted when empty.
Error string `json:"error,omitempty"`
// StartedAt is the key State sorts jobs by (newest first).
StartedAt time.Time `json:"started_at"`
// UpdatedAt and FinishedAt are serialized even when zero (see the note on Job).
UpdatedAt time.Time `json:"updated_at,omitempty"`
FinishedAt time.Time `json:"finished_at,omitempty"`
}
// Event is a user-facing activity item for recent changes and runs.
//
// This file emits two kinds: "sentinel.snapshot" (StoreSnapshot) and
// "sentinel.watch" (WatchSentinel). Other kinds may be produced elsewhere.
type Event struct {
// Time is when the event happened; the producers in this file use UTC
// times (or the snapshot's CollectedAt).
Time time.Time `json:"time"`
// Kind is a dotted machine-readable category such as "sentinel.watch".
Kind string `json:"kind"`
// Summary is the human-readable one-line description.
Summary string `json:"summary"`
// Details carries optional structured data and is omitted from JSON when
// the map is nil or empty. Value types vary per kind; for example
// WatchSentinel stores either the int 0 or the change list under "changes".
Details map[string]any `json:"details,omitempty"`
}
// SnapshotRecord stores the last fact snapshot pushed by a node sentinel.
//
// StoreSnapshot keeps one record per Node in App.snapshots, so a newer record
// for the same node replaces the previous one.
type SnapshotRecord struct {
// Node is the map key used by StoreSnapshot; when empty it is filled from
// Snapshot.Hostname there.
Node string `json:"node"`
// CollectedAt defaults to the current UTC time in StoreSnapshot when zero.
CollectedAt time.Time `json:"collected_at"`
// Snapshot is the raw sentinel payload; WatchSentinel reads its Kernel,
// OSImage, K3sVersion, Containerd, PackageSample and DropInsSample fields.
Snapshot sentinel.Snapshot `json:"snapshot"`
}
// PageState is the UI/API view model.
//
// It is assembled by App.State; the notes below describe how State fills each
// field.
type PageState struct {
// LocalHost and DefaultFlashHost are copied from the app settings.
LocalHost string `json:"local_host"`
DefaultFlashHost string `json:"default_flash_host"`
// SelectedHost is the host whose devices are listed: the deviceHost
// argument of State, or DefaultFlashHost when that argument is blank.
SelectedHost string `json:"selected_host"`
// FlashHosts comes from App.flashHosts.
FlashHosts []string `json:"flash_hosts"`
// Nodes comes from App.replacementNodes.
Nodes []inventory.NodeSpec `json:"nodes"`
// Jobs holds copies of all known jobs, sorted newest StartedAt first.
Jobs []*Job `json:"jobs"`
// Devices and DeviceError come from App.cachedDevices for SelectedHost;
// DeviceError is omitted from JSON when empty.
Devices []Device `json:"devices"`
// PreferredDevice is chosen by preferredDevice(Devices); omitted when empty.
PreferredDevice string `json:"preferred_device,omitempty"`
DeviceError string `json:"device_error,omitempty"`
// Events comes from App.recentEvents(40).
Events []Event `json:"events"`
// Snapshots holds all stored snapshot records sorted by Node.
Snapshots []SnapshotRecord `json:"snapshots"`
// Targets is a copy of the app's current targets map.
// NOTE(review): keys look like node classes (WatchSentinel reports
// len(targets) as "classes") — confirm.
Targets map[string]facts.Targets `json:"targets"`
// Artifacts comes from App.artifacts.
Artifacts map[string]ArtifactSummary `json:"artifacts"`
}
// ArtifactSummary describes the latest built image for a node.
//
// The code that produces these values is outside this part of the file; field
// notes are limited to what the declarations show.
type ArtifactSummary struct {
// Node, Ref, BuildTag, LocalPath, HostPath and BuilderHost are omitted from
// JSON when empty.
Node string `json:"node,omitempty"`
Ref string `json:"ref,omitempty"`
BuildTag string `json:"build_tag,omitempty"`
// LocalPath vs HostPath: presumably the path as seen by this process and
// the path on the builder host respectively — TODO confirm.
LocalPath string `json:"local_path,omitempty"`
HostPath string `json:"host_path,omitempty"`
BuilderHost string `json:"builder_host,omitempty"`
// Compressed is omitted from JSON when false.
Compressed bool `json:"compressed,omitempty"`
// UpdatedAt and SizeBytes are always serialized.
UpdatedAt time.Time `json:"updated_at"`
SizeBytes int64 `json:"size_bytes"`
}
// deviceSnapshot is the value type of App.deviceStore: a device listing
// together with the error text and time of the check that produced it.
// NOTE(review): presumably one entry is cached per flash host and read by
// cachedDevices — confirm where deviceStore is written.
type deviceSnapshot struct {
// Devices is the listing obtained by the check.
Devices []Device
// Err is the error text of the check; presumably empty on success — confirm.
Err string
// CheckedAt is when the check ran.
CheckedAt time.Time
}
// App coordinates builds, flashes, sentinel snapshots, and the web UI state.
//
// Create it with NewApp, which initializes every map below. App embeds a
// sync.RWMutex by value, so it must be used through a pointer and never copied.
type App struct {
// settings is the configuration passed to NewApp.
settings Settings
// inventory is loaded once in NewApp from settings.InventoryPath.
inventory *inventory.Inventory
// metrics is created by NewMetrics in NewApp.
metrics *Metrics
// mu guards the maps below. In this file jobs, snapshots and targets are
// read under RLock and written under Lock.
// NOTE(review): artifactStore and deviceStore are presumably guarded by mu
// as well — confirm in the methods that touch them.
mu sync.RWMutex
// jobs holds every known job; State copies the values before returning them.
jobs map[string]*Job
// snapshots holds the latest SnapshotRecord per node name.
snapshots map[string]SnapshotRecord
// targets holds the most recent result of facts.RecommendTargets.
targets map[string]facts.Targets
// artifactStore holds artifact summaries; presumably keyed by node — confirm.
artifactStore map[string]ArtifactSummary
// deviceStore holds cached device listings; presumably keyed by host — confirm.
deviceStore map[string]deviceSnapshot
}
// NewApp creates a Metis service app instance.
//
// It makes sure the cache directory, the artifact directory, and the parent
// directory of the history file exist (creating them with mode 0o755), loads
// the inventory from settings.InventoryPath, and initializes all in-memory
// stores as empty maps before calling loadSnapshots, loadTargets and
// loadArtifacts.
//
// It returns a non-nil error, wrapped with the step that failed, when a
// directory cannot be created or the inventory cannot be loaded. The original
// error stays reachable through errors.Is / errors.As.
func NewApp(settings Settings) (*App, error) {
	if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil {
		return nil, fmt.Errorf("creating cache dir %q: %w", settings.CacheDir, err)
	}
	if err := os.MkdirAll(settings.ArtifactDir, 0o755); err != nil {
		return nil, fmt.Errorf("creating artifact dir %q: %w", settings.ArtifactDir, err)
	}
	historyDir := filepath.Dir(settings.HistoryPath)
	if err := os.MkdirAll(historyDir, 0o755); err != nil {
		return nil, fmt.Errorf("creating history dir %q: %w", historyDir, err)
	}

	inv, err := inventory.Load(settings.InventoryPath)
	if err != nil {
		return nil, fmt.Errorf("loading inventory %q: %w", settings.InventoryPath, err)
	}

	app := &App{
		settings:      settings,
		inventory:     inv,
		metrics:       NewMetrics(),
		jobs:          map[string]*Job{},
		snapshots:     map[string]SnapshotRecord{},
		targets:       map[string]facts.Targets{},
		artifactStore: map[string]ArtifactSummary{},
		deviceStore:   map[string]deviceSnapshot{},
	}

	// Loading is best-effort: a failure here is not fatal to startup, and the
	// error is intentionally dropped so the app can still be constructed.
	// NOTE(review): presumably a failed load leaves the corresponding store
	// empty (e.g. on first run) — confirm in the load helpers, and consider
	// logging these errors so corrupt state files do not go unnoticed.
	_ = app.loadSnapshots()
	_ = app.loadTargets()
	_ = app.loadArtifacts()
	return app, nil
}
// State returns the current UI/API snapshot.
//
// deviceHost selects the host whose devices are listed; when it is empty or
// only whitespace, settings.DefaultFlashHost is used instead. Jobs are
// returned as copies sorted newest StartedAt first, snapshots are sorted by
// node name, and targets are returned as a copy of the internal map.
func (a *App) State(deviceHost string) PageState {
	host := deviceHost
	if strings.TrimSpace(host) == "" {
		host = a.settings.DefaultFlashHost
	}

	// Copy everything guarded by mu in a single read-locked section so the
	// three views are mutually consistent; sorting happens after unlocking.
	a.mu.RLock()
	jobList := make([]*Job, 0, len(a.jobs))
	for _, stored := range a.jobs {
		dup := *stored // detach from the live job so callers cannot race with updates
		jobList = append(jobList, &dup)
	}
	snapList := make([]SnapshotRecord, 0, len(a.snapshots))
	for _, rec := range a.snapshots {
		snapList = append(snapList, rec)
	}
	targetCopy := make(map[string]facts.Targets, len(a.targets))
	for name, target := range a.targets {
		targetCopy[name] = target
	}
	a.mu.RUnlock()

	// Newest job first.
	sort.Slice(jobList, func(x, y int) bool {
		return jobList[y].StartedAt.Before(jobList[x].StartedAt)
	})
	// Snapshots in ascending node-name order.
	sort.Slice(snapList, func(x, y int) bool {
		return snapList[x].Node < snapList[y].Node
	})

	state := PageState{
		LocalHost:        a.settings.LocalHost,
		DefaultFlashHost: a.settings.DefaultFlashHost,
		SelectedHost:     host,
		Jobs:             jobList,
		Snapshots:        snapList,
		Targets:          targetCopy,
	}

	// The helpers below are invoked in the same order as before in case any
	// of them has side effects.
	state.FlashHosts = a.flashHosts()
	devs, devErr := a.cachedDevices(host)
	state.Devices = devs
	state.PreferredDevice = preferredDevice(devs)
	state.Nodes = a.replacementNodes()
	state.DeviceError = errorString(devErr)
	state.Events = a.recentEvents(40)
	state.Artifacts = a.artifacts()
	return state
}
// Build starts a background image build for a node.
//
// It first checks the node with ensureReplacementReady and returns that error
// unchanged if the check fails. Otherwise it registers a job of kind "build"
// (with no host or device), launches runBuild for it in a new goroutine with
// the flash flag set to false, and returns the job immediately.
func (a *App) Build(node string) (*Job, error) {
	err := a.ensureReplacementReady(node)
	if err != nil {
		return nil, err
	}

	buildJob := a.newJob("build", node, "", "")
	// The build runs asynchronously; progress is reported through the job.
	go a.runBuild(buildJob, false)
	return buildJob, nil
}
// Replace starts a background build+flash workflow for a node.
//
// host is the flash host; when it is empty or only whitespace,
// settings.DefaultFlashHost is used, matching how State treats a blank
// device host. device is validated against that host with ensureDevice; only
// the error from that call is used, its first result is discarded.
//
// It returns the error from ensureReplacementReady or ensureDevice unchanged
// when either check fails. Otherwise it registers a job of kind "replace",
// launches runBuild for it in a new goroutine with the flash flag set to
// true, and returns the job immediately.
func (a *App) Replace(node, host, device string) (*Job, error) {
	// Treat whitespace-only input the same as "not provided" so a blank form
	// field cannot bypass the default flash host.
	if strings.TrimSpace(host) == "" {
		host = a.settings.DefaultFlashHost
	}
	if err := a.ensureReplacementReady(node); err != nil {
		return nil, err
	}
	if _, err := a.ensureDevice(host, device); err != nil {
		return nil, err
	}
	job := a.newJob("replace", node, host, device)
	go a.runBuild(job, true)
	return job, nil
}
// StoreSnapshot records a pushed sentinel snapshot.
//
// record is taken by value, so the caller's copy is never modified. When
// record.Node is empty or only whitespace, the snapshot's Hostname is used as
// the node name; if the name is still blank the call fails with
// "snapshot node required" and nothing is stored. A zero CollectedAt is
// replaced with the current UTC time.
//
// The record replaces any previous record for the same node in memory and is
// then persisted with persistSnapshots. If persisting fails, that error is
// returned; the in-memory record stays in place, and no metric or event is
// recorded for the call. On success a snapshot metric with status "ok" and a
// "sentinel.snapshot" event are recorded.
func (a *App) StoreSnapshot(record SnapshotRecord) error {
	// Fall back to the reported hostname for blank names too, not just for
	// the empty string, so the fallback and the validation below agree.
	if strings.TrimSpace(record.Node) == "" {
		record.Node = record.Snapshot.Hostname
	}
	if strings.TrimSpace(record.Node) == "" {
		return fmt.Errorf("snapshot node required")
	}
	if record.CollectedAt.IsZero() {
		record.CollectedAt = time.Now().UTC()
	}

	a.mu.Lock()
	a.snapshots[record.Node] = record
	a.mu.Unlock()

	// persistSnapshots is called without holding mu.
	if err := a.persistSnapshots(); err != nil {
		return err
	}

	a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt)
	a.appendEvent(Event{
		Time:    record.CollectedAt,
		Kind:    "sentinel.snapshot",
		Summary: fmt.Sprintf("Captured sentinel snapshot for %s", record.Node),
		Details: map[string]any{
			"node":        record.Node,
			"kernel":      record.Snapshot.Kernel,
			"k3s_version": record.Snapshot.K3sVersion,
		},
	})
	return nil
}
// WatchSentinel recomputes class targets and logs meaningful drift.
//
// It converts every stored sentinel snapshot into a facts.Snapshot (keeping
// only the first line of the k3s and containerd version strings), asks
// facts.RecommendTargets for new targets, diffs them against the previous
// targets, replaces the in-memory targets, and persists them.
//
// It returns the error from persistTargets if persisting fails; in that case
// the in-memory targets have already been replaced and no event or metric is
// recorded. On success it appends and returns a "sentinel.watch" event whose
// Details["changes"] is the int 0 when nothing changed and the diff result
// otherwise, and it updates the watch and drift metrics.
func (a *App) WatchSentinel() (*Event, error) {
	// Take a consistent copy of snapshots and current targets under one read lock.
	a.mu.RLock()
	observed := make([]facts.Snapshot, 0, len(a.snapshots))
	for _, rec := range a.snapshots {
		raw := rec.Snapshot
		observed = append(observed, facts.Snapshot{
			Hostname:      rec.Node,
			Kernel:        raw.Kernel,
			OSImage:       raw.OSImage,
			K3sVersion:    firstLine(raw.K3sVersion),
			Containerd:    firstLine(raw.Containerd),
			PackageSample: raw.PackageSample,
			DropInsSample: raw.DropInsSample,
		})
	}
	previous := make(map[string]facts.Targets, len(a.targets))
	for name, target := range a.targets {
		previous[name] = target
	}
	a.mu.RUnlock()

	// Recommendation and diffing run without holding the lock.
	nextTargets := facts.RecommendTargets(a.inventory, observed)
	changes := diffTargets(previous, nextTargets)

	a.mu.Lock()
	a.targets = nextTargets
	a.mu.Unlock()

	if err := a.persistTargets(); err != nil {
		return nil, err
	}

	// Start from the "nothing changed" report and upgrade it when drift exists.
	summary := "Metis sentinel watch completed with no template changes"
	details := map[string]any{
		"classes": len(nextTargets),
		"changes": 0,
	}
	if changeCount := len(changes); changeCount > 0 {
		summary = fmt.Sprintf("Metis sentinel watch detected %d template change(s)", changeCount)
		details["changes"] = changes
	}

	event := &Event{
		Time:    time.Now().UTC(),
		Kind:    "sentinel.watch",
		Summary: summary,
		Details: details,
	}
	a.appendEvent(*event)
	a.metrics.RecordWatch("ok")
	a.metrics.SetDriftTargets(nextTargets, len(changes))
	return event, nil
}