service: smooth metis build and flash progress

This commit is contained in:
Brad Stein 2026-04-01 02:07:09 -03:00
parent 801374d184
commit c459e5c028
5 changed files with 167 additions and 22 deletions

View File

@ -42,23 +42,24 @@ type Device struct {
// Job is a long-running Metis action visible in the UI. // Job is a long-running Metis action visible in the UI.
type Job struct { type Job struct {
ID string `json:"id"` ID string `json:"id"`
Kind string `json:"kind"` Kind string `json:"kind"`
Node string `json:"node,omitempty"` Node string `json:"node,omitempty"`
Host string `json:"host,omitempty"` Host string `json:"host,omitempty"`
Builder string `json:"builder,omitempty"` Builder string `json:"builder,omitempty"`
Device string `json:"device,omitempty"` Device string `json:"device,omitempty"`
Status JobStatus `json:"status"` Status JobStatus `json:"status"`
Stage string `json:"stage,omitempty"` Stage string `json:"stage,omitempty"`
Message string `json:"message,omitempty"` StageStartedAt time.Time `json:"stage_started_at,omitempty"`
Artifact string `json:"artifact,omitempty"` Message string `json:"message,omitempty"`
ProgressPct float64 `json:"progress_pct"` Artifact string `json:"artifact,omitempty"`
Written int64 `json:"written_bytes,omitempty"` ProgressPct float64 `json:"progress_pct"`
Total int64 `json:"total_bytes,omitempty"` Written int64 `json:"written_bytes,omitempty"`
Error string `json:"error,omitempty"` Total int64 `json:"total_bytes,omitempty"`
StartedAt time.Time `json:"started_at"` Error string `json:"error,omitempty"`
UpdatedAt time.Time `json:"updated_at,omitempty"` StartedAt time.Time `json:"started_at"`
FinishedAt time.Time `json:"finished_at,omitempty"` UpdatedAt time.Time `json:"updated_at,omitempty"`
FinishedAt time.Time `json:"finished_at,omitempty"`
} }
// Event is a user-facing activity item for recent changes and runs. // Event is a user-facing activity item for recent changes and runs.

View File

@ -186,7 +186,7 @@ func (a *App) runRemotePod(jobID, podName string, podSpec map[string]any) (strin
} }
lastState = state lastState = state
if strings.TrimSpace(jobID) != "" { if strings.TrimSpace(jobID) != "" {
a.setJob(jobID, func(_ *Job) {}) a.heartbeatRemoteJob(jobID)
} }
switch state.Phase { switch state.Phase {
case "Succeeded": case "Succeeded":

View File

@ -3,6 +3,7 @@ package service
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"math"
"path/filepath" "path/filepath"
"sort" "sort"
"strings" "strings"
@ -79,6 +80,7 @@ func (a *App) runBuild(job *Job, flash bool) {
a.setJob(job.ID, func(j *Job) { a.setJob(job.ID, func(j *Job) {
j.Status = JobRunning j.Status = JobRunning
j.Stage = "build" j.Stage = "build"
j.StageStartedAt = time.Now().UTC()
j.Message = fmt.Sprintf("Building on %s (%s) and publishing to Harbor", builder.Name, builder.Arch) j.Message = fmt.Sprintf("Building on %s (%s) and publishing to Harbor", builder.Name, builder.Arch)
j.ProgressPct = 8 j.ProgressPct = 8
j.Artifact = artifactRef + ":latest" j.Artifact = artifactRef + ":latest"
@ -140,7 +142,8 @@ func (a *App) runBuild(job *Job, flash bool) {
a.setJob(job.ID, func(j *Job) { a.setJob(job.ID, func(j *Job) {
j.Stage = "preflight" j.Stage = "preflight"
j.Message = fmt.Sprintf("Preparing to flash from Harbor on %s", j.Host) j.StageStartedAt = time.Now().UTC()
j.Message = fmt.Sprintf("Validating %s and preparing the latest Harbor artifact for %s", prettyDeviceTarget(j.Device), j.Host)
j.ProgressPct = 78 j.ProgressPct = 78
j.Artifact = artifactRef + ":latest" j.Artifact = artifactRef + ":latest"
}) })
@ -201,6 +204,7 @@ func (a *App) flashArtifact(jobID, artifactRef string) error {
} }
a.setJob(jobID, func(j *Job) { a.setJob(jobID, func(j *Job) {
j.Stage = "flash" j.Stage = "flash"
j.StageStartedAt = time.Now().UTC()
j.Message = fmt.Sprintf("Pulling %s and writing it on %s", artifactRef+":latest", j.Host) j.Message = fmt.Sprintf("Pulling %s and writing it on %s", artifactRef+":latest", j.Host)
j.ProgressPct = 84 j.ProgressPct = 84
}) })
@ -220,6 +224,97 @@ func (a *App) flashArtifact(jobID, artifactRef string) error {
return nil return nil
} }
func (a *App) heartbeatRemoteJob(jobID string) {
if strings.TrimSpace(jobID) == "" {
return
}
a.setJob(jobID, func(j *Job) {
if j == nil || j.Status != JobRunning {
return
}
stageStart := j.StageStartedAt
if stageStart.IsZero() {
stageStart = j.StartedAt
}
elapsed := time.Since(stageStart)
switch j.Stage {
case "build":
progress, message := buildStageHeartbeat(j.Node, j.Builder, elapsed)
if progress > j.ProgressPct {
j.ProgressPct = progress
}
if strings.TrimSpace(message) != "" {
j.Message = message
}
case "preflight":
if j.ProgressPct < 80 {
j.ProgressPct = 80
}
j.Message = fmt.Sprintf("Validating %s on %s and resolving the latest Harbor artifact", prettyDeviceTarget(j.Device), j.Host)
case "flash":
progress, message := flashStageHeartbeat(j.Host, j.Artifact, elapsed)
if progress > j.ProgressPct {
j.ProgressPct = progress
}
if strings.TrimSpace(message) != "" {
j.Message = message
}
}
})
}
func buildStageHeartbeat(node, builder string, elapsed time.Duration) (float64, string) {
seconds := elapsed.Seconds()
switch {
case seconds < 20:
return ramp(seconds, 0, 20, 8, 14), fmt.Sprintf("Scheduling a remote builder on %s for %s", builder, node)
case seconds < 120:
return ramp(seconds, 20, 120, 14, 30), fmt.Sprintf("Injecting %s recovery config into the base image on %s", node, builder)
case seconds < 360:
return ramp(seconds, 120, 360, 30, 58), fmt.Sprintf("Building the replacement image filesystem for %s on %s", node, builder)
case seconds < 540:
return ramp(seconds, 360, 540, 58, 70), fmt.Sprintf("Compressing the replacement image for %s before upload", node)
default:
return math.Min(76, ramp(seconds, 540, 900, 70, 76)), fmt.Sprintf("Publishing %s to Harbor and refreshing the latest tag", node)
}
}
func flashStageHeartbeat(host, artifact string, elapsed time.Duration) (float64, string) {
seconds := elapsed.Seconds()
switch {
case seconds < 10:
return ramp(seconds, 0, 10, 84, 88), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host)
case seconds < 45:
return ramp(seconds, 10, 45, 88, 96), fmt.Sprintf("Writing the latest image to the selected target on %s", host)
default:
return math.Min(98, ramp(seconds, 45, 120, 96, 98)), fmt.Sprintf("Flushing buffers and finishing the write on %s", host)
}
}
func prettyDeviceTarget(path string) string {
switch {
case strings.HasPrefix(path, "hosttmp://"):
return "/tmp"
case strings.TrimSpace(path) == "":
return "the selected target"
default:
return path
}
}
func ramp(value, start, end, min, max float64) float64 {
if end <= start {
return max
}
if value <= start {
return min
}
if value >= end {
return max
}
return min + ((value-start)/(end-start))*(max-min)
}
func (a *App) ensureDevice(host, path string) (*Device, error) { func (a *App) ensureDevice(host, path string) (*Device, error) {
if strings.TrimSpace(path) == "" { if strings.TrimSpace(path) == "" {
return nil, fmt.Errorf("select removable media before starting a flash run") return nil, fmt.Errorf("select removable media before starting a flash run")

View File

@ -0,0 +1,45 @@
package service
import (
"strings"
"testing"
"time"
)
func TestBuildStageHeartbeatProgresses(t *testing.T) {
p1, m1 := buildStageHeartbeat("titan-13", "titan-04", 10*time.Second)
p2, m2 := buildStageHeartbeat("titan-13", "titan-04", 3*time.Minute)
p3, m3 := buildStageHeartbeat("titan-13", "titan-04", 11*time.Minute)
if !(p1 > 8 && p1 < p2 && p2 < p3 && p3 <= 76) {
t.Fatalf("unexpected build progress values: %v %v %v", p1, p2, p3)
}
if !strings.Contains(m1, "Scheduling") {
t.Fatalf("expected scheduling message, got %q", m1)
}
if !strings.Contains(m2, "replacement image filesystem") {
t.Fatalf("expected filesystem message, got %q", m2)
}
if !strings.Contains(m3, "Harbor") {
t.Fatalf("expected Harbor message, got %q", m3)
}
}
func TestFlashStageHeartbeatProgresses(t *testing.T) {
p1, m1 := flashStageHeartbeat("titan-22", "registry.bstein.dev/metis/titan-13:latest", 5*time.Second)
p2, m2 := flashStageHeartbeat("titan-22", "registry.bstein.dev/metis/titan-13:latest", 20*time.Second)
p3, m3 := flashStageHeartbeat("titan-22", "registry.bstein.dev/metis/titan-13:latest", 90*time.Second)
if !(p1 > 84 && p1 < p2 && p2 < p3 && p3 <= 98) {
t.Fatalf("unexpected flash progress values: %v %v %v", p1, p2, p3)
}
if !strings.Contains(m1, "Pulling") {
t.Fatalf("expected pulling message, got %q", m1)
}
if !strings.Contains(m2, "Writing") {
t.Fatalf("expected writing message, got %q", m2)
}
if !strings.Contains(m3, "Flushing") {
t.Fatalf("expected flushing message, got %q", m3)
}
}

View File

@ -686,9 +686,13 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
const started = fmtTime(job.started_at) + (job.device ? ' · ' + job.device : '') + (job.host ? ' · ' + job.host : ''); const started = fmtTime(job.started_at) + (job.device ? ' · ' + job.device : '') + (job.host ? ' · ' + job.host : '');
const timingBits = []; const timingBits = [];
if(job.stage){ timingBits.push('stage: ' + job.stage); } if(job.stage){ timingBits.push('stage: ' + job.stage); }
const duration = fmtDuration(job.started_at, job.finished_at); const stageDuration = fmtDuration(job.stage_started_at || job.started_at, job.finished_at);
if(duration){ if(stageDuration){
timingBits.push((job.status === 'running' ? 'elapsed ' : 'duration ') + duration); timingBits.push((job.status === 'running' ? 'stage elapsed ' : 'stage duration ') + stageDuration);
}
const totalDuration = fmtDuration(job.started_at, job.finished_at);
if(totalDuration && job.stage_started_at){
timingBits.push((job.status === 'running' ? 'total elapsed ' : 'total duration ') + totalDuration);
} }
if(job.updated_at && job.status === 'running'){ if(job.updated_at && job.status === 'running'){
timingBits.push('last update ' + fmtDuration(job.updated_at, new Date().toISOString()) + ' ago'); timingBits.push('last update ' + fmtDuration(job.updated_at, new Date().toISOString()) + ' ago');