From c459e5c028ccd7325ef3c67e194a4c1126618fa7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 1 Apr 2026 02:07:09 -0300 Subject: [PATCH] service: smooth metis build and flash progress --- pkg/service/app.go | 35 ++++++----- pkg/service/cluster.go | 2 +- pkg/service/remote.go | 97 ++++++++++++++++++++++++++++- pkg/service/remote_progress_test.go | 45 +++++++++++++ pkg/service/server.go | 10 ++- 5 files changed, 167 insertions(+), 22 deletions(-) create mode 100644 pkg/service/remote_progress_test.go diff --git a/pkg/service/app.go b/pkg/service/app.go index 06faa6a..9e94804 100644 --- a/pkg/service/app.go +++ b/pkg/service/app.go @@ -42,23 +42,24 @@ type Device struct { // Job is a long-running Metis action visible in the UI. type Job struct { - ID string `json:"id"` - Kind string `json:"kind"` - Node string `json:"node,omitempty"` - Host string `json:"host,omitempty"` - Builder string `json:"builder,omitempty"` - Device string `json:"device,omitempty"` - Status JobStatus `json:"status"` - Stage string `json:"stage,omitempty"` - Message string `json:"message,omitempty"` - Artifact string `json:"artifact,omitempty"` - ProgressPct float64 `json:"progress_pct"` - Written int64 `json:"written_bytes,omitempty"` - Total int64 `json:"total_bytes,omitempty"` - Error string `json:"error,omitempty"` - StartedAt time.Time `json:"started_at"` - UpdatedAt time.Time `json:"updated_at,omitempty"` - FinishedAt time.Time `json:"finished_at,omitempty"` + ID string `json:"id"` + Kind string `json:"kind"` + Node string `json:"node,omitempty"` + Host string `json:"host,omitempty"` + Builder string `json:"builder,omitempty"` + Device string `json:"device,omitempty"` + Status JobStatus `json:"status"` + Stage string `json:"stage,omitempty"` + StageStartedAt time.Time `json:"stage_started_at,omitempty"` + Message string `json:"message,omitempty"` + Artifact string `json:"artifact,omitempty"` + ProgressPct float64 `json:"progress_pct"` + Written int64 `json:"written_bytes,omitempty"` + Total int64 `json:"total_bytes,omitempty"` + Error string `json:"error,omitempty"` + StartedAt time.Time `json:"started_at"` + UpdatedAt time.Time `json:"updated_at,omitempty"` + FinishedAt time.Time `json:"finished_at,omitempty"` } // Event is a user-facing activity item for recent changes and runs. diff --git a/pkg/service/cluster.go b/pkg/service/cluster.go index 2074670..ab43487 100644 --- a/pkg/service/cluster.go +++ b/pkg/service/cluster.go @@ -186,7 +186,7 @@ func (a *App) runRemotePod(jobID, podName string, podSpec map[string]any) (strin } lastState = state if strings.TrimSpace(jobID) != "" { - a.setJob(jobID, func(_ *Job) {}) + a.heartbeatRemoteJob(jobID) } switch state.Phase { case "Succeeded": diff --git a/pkg/service/remote.go b/pkg/service/remote.go index 087f26b..d6c08a3 100644 --- a/pkg/service/remote.go +++ b/pkg/service/remote.go @@ -3,6 +3,7 @@ package service import ( "encoding/json" "fmt" + "math" "path/filepath" "sort" "strings" @@ -79,6 +80,7 @@ func (a *App) runBuild(job *Job, flash bool) { a.setJob(job.ID, func(j *Job) { j.Status = JobRunning j.Stage = "build" + j.StageStartedAt = time.Now().UTC() j.Message = fmt.Sprintf("Building on %s (%s) and publishing to Harbor", builder.Name, builder.Arch) j.ProgressPct = 8 j.Artifact = artifactRef + ":latest" @@ -140,7 +142,8 @@ func (a *App) runBuild(job *Job, flash bool) { a.setJob(job.ID, func(j *Job) { j.Stage = "preflight" - j.Message = fmt.Sprintf("Preparing to flash from Harbor on %s", j.Host) + j.StageStartedAt = time.Now().UTC() + j.Message = fmt.Sprintf("Validating %s and preparing the latest Harbor artifact for %s", prettyDeviceTarget(j.Device), j.Host) j.ProgressPct = 78 j.Artifact = artifactRef + ":latest" }) @@ -201,6 +204,7 @@ func (a *App) flashArtifact(jobID, artifactRef string) error { } a.setJob(jobID, func(j *Job) { j.Stage = "flash" + j.StageStartedAt = time.Now().UTC() j.Message = fmt.Sprintf("Pulling %s and writing it on %s", artifactRef+":latest", j.Host) j.ProgressPct = 84 }) @@ -220,6 +224,97 @@ func (a *App) flashArtifact(jobID, artifactRef string) error { return nil } +func (a *App) heartbeatRemoteJob(jobID string) { + if strings.TrimSpace(jobID) == "" { + return + } + a.setJob(jobID, func(j *Job) { + if j == nil || j.Status != JobRunning { + return + } + stageStart := j.StageStartedAt + if stageStart.IsZero() { + stageStart = j.StartedAt + } + elapsed := time.Since(stageStart) + switch j.Stage { + case "build": + progress, message := buildStageHeartbeat(j.Node, j.Builder, elapsed) + if progress > j.ProgressPct { + j.ProgressPct = progress + } + if strings.TrimSpace(message) != "" { + j.Message = message + } + case "preflight": + if j.ProgressPct < 80 { + j.ProgressPct = 80 + } + j.Message = fmt.Sprintf("Validating %s on %s and resolving the latest Harbor artifact", prettyDeviceTarget(j.Device), j.Host) + case "flash": + progress, message := flashStageHeartbeat(j.Host, j.Artifact, elapsed) + if progress > j.ProgressPct { + j.ProgressPct = progress + } + if strings.TrimSpace(message) != "" { + j.Message = message + } + } + }) +} + +func buildStageHeartbeat(node, builder string, elapsed time.Duration) (float64, string) { + seconds := elapsed.Seconds() + switch { + case seconds < 20: + return ramp(seconds, 0, 20, 8, 14), fmt.Sprintf("Scheduling a remote builder on %s for %s", builder, node) + case seconds < 120: + return ramp(seconds, 20, 120, 14, 30), fmt.Sprintf("Injecting %s recovery config into the base image on %s", node, builder) + case seconds < 360: + return ramp(seconds, 120, 360, 30, 58), fmt.Sprintf("Building the replacement image filesystem for %s on %s", node, builder) + case seconds < 540: + return ramp(seconds, 360, 540, 58, 70), fmt.Sprintf("Compressing the replacement image for %s before upload", node) + default: + return math.Min(76, ramp(seconds, 540, 900, 70, 76)), fmt.Sprintf("Publishing %s to Harbor and refreshing the latest tag", node) + } +} + +func flashStageHeartbeat(host, artifact string, elapsed time.Duration) (float64, string) { + seconds := elapsed.Seconds() + switch { + case seconds < 10: + return ramp(seconds, 0, 10, 84, 88), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host) + case seconds < 45: + return ramp(seconds, 10, 45, 88, 96), fmt.Sprintf("Writing the latest image to the selected target on %s", host) + default: + return math.Min(98, ramp(seconds, 45, 120, 96, 98)), fmt.Sprintf("Flushing buffers and finishing the write on %s", host) + } +} + +func prettyDeviceTarget(path string) string { + switch { + case strings.HasPrefix(path, "hosttmp://"): + return "/tmp" + case strings.TrimSpace(path) == "": + return "the selected target" + default: + return path + } +} + +func ramp(value, start, end, min, max float64) float64 { + if end <= start { + return max + } + if value <= start { + return min + } + if value >= end { + return max + } + return min + ((value-start)/(end-start))*(max-min) +} + func (a *App) ensureDevice(host, path string) (*Device, error) { if strings.TrimSpace(path) == "" { return nil, fmt.Errorf("select removable media before starting a flash run") diff --git a/pkg/service/remote_progress_test.go b/pkg/service/remote_progress_test.go new file mode 100644 index 0000000..3ad6885 --- /dev/null +++ b/pkg/service/remote_progress_test.go @@ -0,0 +1,45 @@ +package service + +import ( + "strings" + "testing" + "time" +) + +func TestBuildStageHeartbeatProgresses(t *testing.T) { + p1, m1 := buildStageHeartbeat("titan-13", "titan-04", 10*time.Second) + p2, m2 := buildStageHeartbeat("titan-13", "titan-04", 3*time.Minute) + p3, m3 := buildStageHeartbeat("titan-13", "titan-04", 11*time.Minute) + + if !(p1 > 8 && p1 < p2 && p2 < p3 && p3 <= 76) { + t.Fatalf("unexpected build progress values: %v %v %v", p1, p2, p3) + } + if !strings.Contains(m1, "Scheduling") { + t.Fatalf("expected scheduling message, got %q", m1) + } + if !strings.Contains(m2, "replacement image filesystem") { + t.Fatalf("expected filesystem message, got %q", m2) + } + if !strings.Contains(m3, "Harbor") { + t.Fatalf("expected Harbor message, got %q", m3) + } +} + +func TestFlashStageHeartbeatProgresses(t *testing.T) { + p1, m1 := flashStageHeartbeat("titan-22", "registry.bstein.dev/metis/titan-13:latest", 5*time.Second) + p2, m2 := flashStageHeartbeat("titan-22", "registry.bstein.dev/metis/titan-13:latest", 20*time.Second) + p3, m3 := flashStageHeartbeat("titan-22", "registry.bstein.dev/metis/titan-13:latest", 90*time.Second) + + if !(p1 > 84 && p1 < p2 && p2 < p3 && p3 <= 98) { + t.Fatalf("unexpected flash progress values: %v %v %v", p1, p2, p3) + } + if !strings.Contains(m1, "Pulling") { + t.Fatalf("expected pulling message, got %q", m1) + } + if !strings.Contains(m2, "Writing") { + t.Fatalf("expected writing message, got %q", m2) + } + if !strings.Contains(m3, "Flushing") { + t.Fatalf("expected flushing message, got %q", m3) + } +} diff --git a/pkg/service/server.go b/pkg/service/server.go index 04db640..fb13337 100644 --- a/pkg/service/server.go +++ b/pkg/service/server.go @@ -686,9 +686,13 @@ var metisPage = template.Must(template.New("metis").Parse(` const started = fmtTime(job.started_at) + (job.device ? ' · ' + job.device : '') + (job.host ? ' · ' + job.host : ''); const timingBits = []; if(job.stage){ timingBits.push('stage: ' + job.stage); } - const duration = fmtDuration(job.started_at, job.finished_at); - if(duration){ - timingBits.push((job.status === 'running' ? 'elapsed ' : 'duration ') + duration); + const stageDuration = fmtDuration(job.stage_started_at || job.started_at, job.finished_at); + if(stageDuration){ + timingBits.push((job.status === 'running' ? 'stage elapsed ' : 'stage duration ') + stageDuration); + } + const totalDuration = fmtDuration(job.started_at, job.finished_at); + if(totalDuration && job.stage_started_at){ + timingBits.push((job.status === 'running' ? 'total elapsed ' : 'total duration ') + totalDuration); } if(job.updated_at && job.status === 'running'){ timingBits.push('last update ' + fmtDuration(job.updated_at, new Date().toISOString()) + ' ago');