// writeStructuredResult emits payload as one line of compact JSON on stdout
// and, on a best-effort basis, mirrors the same bytes to the container
// termination log so the result is also visible in pod status.
//
// A failure to encode payload or to write stdout is fatal (log.Fatalf).
// A failure to write the termination log is ignored: stdout remains the
// authoritative copy, and the file may not be writable outside a pod.
func writeStructuredResult(payload any) {
	// Kubernetes caps a container's termination message at 4096 bytes and
	// truncates anything longer.
	const terminationMessageLimit = 4096

	data, err := json.Marshal(payload)
	if err != nil {
		log.Fatalf("encode result: %v", err)
	}
	if _, err := os.Stdout.Write(append(data, '\n')); err != nil {
		log.Fatalf("write stdout result: %v", err)
	}

	// A truncated message would be invalid JSON. Presumably the service
	// prefers a non-empty pod-status message over the pod logs (verify
	// against the caller), so an oversized result is left to stdout only.
	if len(data) > terminationMessageLimit {
		return
	}

	// Keep the result available in pod status so Metis does not depend on the
	// kubelet log endpoint for successful worker runs. The error is dropped on
	// purpose: reporting it on stderr would add noise to the pod logs.
	_ = os.WriteFile("/dev/termination-log", data, 0o644)
}
strings.TrimSpace(state.Message) != "" { + return "", fmt.Errorf("remote pod %s failed: %s", podName, strings.TrimSpace(state.Message)) + } logs, _ := a.remotePodLogs(kube, podName) if strings.TrimSpace(logs) != "" { return "", fmt.Errorf("remote pod %s failed: %s", podName, strings.TrimSpace(logs)) @@ -195,6 +206,9 @@ func (a *App) runRemotePod(jobID, podName string, podSpec map[string]any) (strin } time.Sleep(2 * time.Second) } + if lastState.Phase != "" { + return "", fmt.Errorf("remote pod %s timed out in phase %s: %s %s", podName, lastState.Phase, strings.TrimSpace(lastState.Reason), strings.TrimSpace(lastState.Message)) + } return "", fmt.Errorf("remote pod %s timed out", podName) } @@ -268,7 +282,11 @@ func (a *App) remotePodLogs(kube *kubeClient, podName string) (string, error) { defer resp.Body.Close() if resp.StatusCode >= 300 { body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) - return "", fmt.Errorf("pod logs %s failed: %s: %s", podName, resp.Status, strings.TrimSpace(string(body))) + message := strings.TrimSpace(string(body)) + if strings.Contains(message, "proxy error from 127.0.0.1:6443") || strings.Contains(message, "containerLogs") { + return "", fmt.Errorf("pod logs %s failed because Kubernetes could not reach the node kubelet log endpoint: %s", podName, message) + } + return "", fmt.Errorf("pod logs %s failed: %s: %s", podName, resp.Status, message) } body, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) if err != nil { diff --git a/pkg/service/server.go b/pkg/service/server.go index f70a104..04db640 100644 --- a/pkg/service/server.go +++ b/pkg/service/server.go @@ -580,6 +580,7 @@ var metisPage = template.Must(template.New("metis").Parse(` const boot = JSON.parse(document.getElementById('boot').textContent); let state = boot; let busy = false; + let lastJobAlert = ''; const nodeSelect = document.getElementById('node-select'); const hostSelect = document.getElementById('host-select'); @@ -615,6 +616,22 @@ var metisPage = 
// fmtDuration renders the span between two timestamps as a short label:
// "42s", "1m 5s" or "2h 3m" (seconds are dropped once hours are shown).
//
// startValue: timestamp accepted by the Date constructor. A missing,
//   unparsable or Go zero-time value yields ''.
// endValue: optional end timestamp. When missing, or when it is the Go
//   zero time, the current time is used. An unparsable value yields ''.
//
// The server encodes unset time.Time fields as "0001-01-01T00:00:00Z"
// (omitempty does not omit structs), so a job that has not finished still
// carries a truthy finished_at. That value must count as "still running",
// otherwise every running job shows a duration of 0s.
function fmtDuration(startValue, endValue){
  // True for dates at or before year 1, i.e. Go's zero time.Time.
  function isZeroTime(date){
    return date.getUTCFullYear() <= 1;
  }

  if(!startValue){ return ''; }
  const start = new Date(startValue);
  if(isNaN(start.getTime()) || isZeroTime(start)){ return ''; }

  let end = endValue ? new Date(endValue) : new Date();
  if(isNaN(end.getTime())){ return ''; }
  if(isZeroTime(end)){ end = new Date(); }

  // Clamp to zero so clock skew never produces a negative duration.
  let seconds = Math.max(0, Math.round((end.getTime() - start.getTime()) / 1000));
  const hours = Math.floor(seconds / 3600);
  seconds -= hours * 3600;
  const minutes = Math.floor(seconds / 60);
  seconds -= minutes * 60;

  if(hours){ return hours + 'h ' + minutes + 'm'; }
  if(minutes){ return minutes + 'm ' + seconds + 's'; }
  return seconds + 's';
}