service: avoid kubelet log dependency for remote workers

Remote workers previously emitted their JSON result only on stdout, so Metis had to fetch pod logs through the API server's kubelet proxy; when that proxy could not reach a remote node, even successful runs failed. The worker now also writes its result to /dev/termination-log so it surfaces in pod status, the pod poller prefers that status message over logs for Succeeded and Failed pods, jobs carry an updated_at heartbeat that the UI renders, and failed jobs raise a banner once per failure.

Brad Stein 2026-04-01 01:45:44 -03:00
parent bd61275821
commit 801374d184
4 changed files with 75 additions and 11 deletions

View File

@@ -40,10 +40,7 @@ func remoteDevicesCmd(args []string) {
 		}
 		return devices[i].Path < devices[j].Path
 	})
-	enc := json.NewEncoder(os.Stdout)
-	enc.SetIndent("", " ")
-	_ = enc.Encode(map[string]any{"devices": devices})
+	writeStructuredResult(map[string]any{"devices": devices})
 }

 func remoteBuildCmd(args []string) {
@@ -116,9 +113,7 @@ func remoteBuildCmd(args []string) {
 		UpdatedAt: builtAt,
 		SizeBytes: info.Size(),
 	}
-	enc := json.NewEncoder(os.Stdout)
-	enc.SetIndent("", " ")
-	_ = enc.Encode(summary)
+	writeStructuredResult(summary)
 }

 func remoteFlashCmd(args []string) {
@@ -177,9 +172,7 @@ func remoteFlashCmd(args []string) {
 	if err != nil {
 		log.Fatalf("stat destination: %v", err)
 	}
-	enc := json.NewEncoder(os.Stdout)
-	enc.SetIndent("", " ")
-	_ = enc.Encode(map[string]any{
+	writeStructuredResult(map[string]any{
 		"node": *node,
 		"device": *device,
 		"dest_path": destPath,
@@ -187,6 +180,19 @@ func remoteFlashCmd(args []string) {
 	})
 }

+func writeStructuredResult(payload any) {
+	data, err := json.Marshal(payload)
+	if err != nil {
+		log.Fatalf("encode result: %v", err)
+	}
+	if _, err := os.Stdout.Write(append(data, '\n')); err != nil {
+		log.Fatalf("write stdout result: %v", err)
+	}
+	// Keep the result available in pod status so Metis does not depend on the
+	// kubelet log endpoint for successful worker runs.
+	_ = os.WriteFile("/dev/termination-log", data, 0o644)
+}
+
 func localFlashDevices(maxBytes int64, hostTmpDir string) ([]service.Device, error) {
 	cmd := exec.Command("lsblk", "-J", "-b", "-o", "NAME,PATH,RM,HOTPLUG,SIZE,MODEL,TRAN,TYPE")
 	out, err := cmd.Output()
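
For context on the new helper above: Kubernetes copies the contents of a container's terminationMessagePath (default /dev/termination-log) into the pod's status.containerStatuses[].state.terminated.message, which is what lets a caller read the worker's result without the kubelet log endpoint. The copy is size-capped (the Kubernetes docs cite 4096 bytes per container), so an oversized result would be truncated mid-JSON. A minimal defensive sketch, not part of this commit; writeTerminationLog and maxTerminationBytes are illustrative names:

package main

import (
	"fmt"
	"os"
)

// maxTerminationBytes mirrors the per-container cap cited in the Kubernetes
// docs; treat it as an assumption, not an API constant.
const maxTerminationBytes = 4096

// writeTerminationLog writes a result where pod status will pick it up,
// refusing payloads that would arrive truncated mid-JSON.
func writeTerminationLog(data []byte) error {
	if len(data) > maxTerminationBytes {
		return fmt.Errorf("result too large for termination log: %d bytes", len(data))
	}
	// Best effort: the path normally exists only inside a Kubernetes-managed container.
	return os.WriteFile("/dev/termination-log", data, 0o644)
}

func main() {
	if err := writeTerminationLog([]byte(`{"ok":true}` + "\n")); err != nil {
		fmt.Println("falling back to stdout logs only:", err)
	}
}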

View File

@@ -57,6 +57,7 @@ type Job struct {
 	Total      int64     `json:"total_bytes,omitempty"`
 	Error      string    `json:"error,omitempty"`
 	StartedAt  time.Time `json:"started_at"`
+	UpdatedAt  time.Time `json:"updated_at,omitempty"`
 	FinishedAt time.Time `json:"finished_at,omitempty"`
 }
@@ -314,6 +315,7 @@ func (a *App) newJob(kind, node, host, device string) *Job {
 		Status:      JobQueued,
 		ProgressPct: 0,
 		StartedAt:   time.Now().UTC(),
+		UpdatedAt:   time.Now().UTC(),
 	}
 	a.mu.Lock()
 	a.jobs[job.ID] = job
@@ -335,6 +337,7 @@ func (a *App) setJob(id string, update func(*Job)) {
 		return
 	}
 	update(job)
+	job.UpdatedAt = time.Now().UTC()
 }

 func (a *App) failJob(id string, err error) {
@@ -356,6 +359,7 @@ func (a *App) completeJob(id string, update func(*Job)) {
 	if job.Status != JobError {
 		job.Status = JobDone
 	}
+	job.UpdatedAt = time.Now().UTC()
 	job.FinishedAt = time.Now().UTC()
 }
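
UpdatedAt acts as a heartbeat: newJob seeds it, setJob and completeJob refresh it, and the remote-pod poller in the next file calls setJob with a no-op update purely to touch it. The web UI renders it as "last update … ago"; a watchdog could use the same field to flag jobs that have gone quiet. A minimal sketch under stated assumptions: the stand-in Job mirrors only the fields used here, JobRunning is assumed to exist alongside the JobQueued/JobDone/JobError values visible in this diff, and isStale is an illustrative helper:

package main

import (
	"fmt"
	"time"
)

// Stand-ins for the service types; only the fields this sketch needs.
type Job struct {
	Status    string
	StartedAt time.Time
	UpdatedAt time.Time
}

const JobRunning = "running" // assumed counterpart of JobQueued/JobDone/JobError

// isStale reports whether a running job's heartbeat is older than maxQuiet.
func isStale(job *Job, now time.Time, maxQuiet time.Duration) bool {
	if job.Status != JobRunning {
		return false
	}
	last := job.UpdatedAt
	if last.IsZero() {
		last = job.StartedAt // jobs created before this change carry no heartbeat
	}
	return now.Sub(last) > maxQuiet
}

func main() {
	job := &Job{
		Status:    JobRunning,
		StartedAt: time.Now().Add(-10 * time.Minute),
		UpdatedAt: time.Now().Add(-3 * time.Minute),
	}
	fmt.Println(isStale(job, time.Now(), 2*time.Minute)) // true: quiet for ~3 minutes
}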

View File

@@ -178,15 +178,26 @@ func (a *App) runRemotePod(jobID, podName string, podSpec map[string]any) (string, error) {
 	}
 	deadline := time.Now().Add(12 * time.Minute)
+	lastState := podState{Name: podName}
 	for time.Now().Before(deadline) {
 		state, err := a.remotePodState(kube, podName)
 		if err != nil {
 			return "", err
 		}
+		lastState = state
+		if strings.TrimSpace(jobID) != "" {
+			a.setJob(jobID, func(_ *Job) {})
+		}
 		switch state.Phase {
 		case "Succeeded":
+			if strings.TrimSpace(state.Message) != "" {
+				return strings.TrimSpace(state.Message), nil
+			}
 			return a.remotePodLogs(kube, podName)
 		case "Failed":
+			if strings.TrimSpace(state.Message) != "" {
+				return "", fmt.Errorf("remote pod %s failed: %s", podName, strings.TrimSpace(state.Message))
+			}
 			logs, _ := a.remotePodLogs(kube, podName)
 			if strings.TrimSpace(logs) != "" {
 				return "", fmt.Errorf("remote pod %s failed: %s", podName, strings.TrimSpace(logs))
@@ -195,6 +206,9 @@ func (a *App) runRemotePod(jobID, podName string, podSpec map[string]any) (string, error) {
 		}
 		time.Sleep(2 * time.Second)
 	}
+	if lastState.Phase != "" {
+		return "", fmt.Errorf("remote pod %s timed out in phase %s: %s %s", podName, lastState.Phase, strings.TrimSpace(lastState.Reason), strings.TrimSpace(lastState.Message))
+	}
 	return "", fmt.Errorf("remote pod %s timed out", podName)
 }
@@ -268,7 +282,11 @@ func (a *App) remotePodLogs(kube *kubeClient, podName string) (string, error) {
 	defer resp.Body.Close()
 	if resp.StatusCode >= 300 {
 		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
-		return "", fmt.Errorf("pod logs %s failed: %s: %s", podName, resp.Status, strings.TrimSpace(string(body)))
+		message := strings.TrimSpace(string(body))
+		if strings.Contains(message, "proxy error from 127.0.0.1:6443") || strings.Contains(message, "containerLogs") {
+			return "", fmt.Errorf("pod logs %s failed because Kubernetes could not reach the node kubelet log endpoint: %s", podName, message)
+		}
+		return "", fmt.Errorf("pod logs %s failed: %s: %s", podName, resp.Status, message)
 	}
 	body, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
 	if err != nil {
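
The poller above prefers state.Message over fetching logs. remotePodState is not shown in this diff; presumably it derives podState.Message from the pod's status, where Kubernetes surfaces the terminated container's termination-log contents under status.containerStatuses[].state.terminated.message. A minimal sketch of that extraction over raw Pod JSON; the type and function names are illustrative:

package main

import (
	"encoding/json"
	"fmt"
)

// podDoc models just the slice of the Pod object this sketch reads.
type podDoc struct {
	Status struct {
		Phase             string `json:"phase"`
		Reason            string `json:"reason"`
		Message           string `json:"message"`
		ContainerStatuses []struct {
			State struct {
				Terminated *struct {
					Reason  string `json:"reason"`
					Message string `json:"message"`
				} `json:"terminated"`
			} `json:"state"`
		} `json:"containerStatuses"`
	} `json:"status"`
}

// extractMessage returns the pod phase plus the most specific message
// available, preferring a terminated container's termination log.
func extractMessage(raw []byte) (phase, reason, message string, err error) {
	var pod podDoc
	if err = json.Unmarshal(raw, &pod); err != nil {
		return "", "", "", err
	}
	phase, reason, message = pod.Status.Phase, pod.Status.Reason, pod.Status.Message
	for _, cs := range pod.Status.ContainerStatuses {
		if t := cs.State.Terminated; t != nil && t.Message != "" {
			message = t.Message // termination log beats the generic status message
			break
		}
	}
	return phase, reason, message, nil
}

func main() {
	raw := []byte(`{"status":{"phase":"Succeeded","containerStatuses":[{"state":{"terminated":{"message":"{\"devices\":[]}"}}}]}}`)
	phase, _, msg, _ := extractMessage(raw)
	fmt.Println(phase, msg) // Succeeded {"devices":[]}
}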

View File

@@ -580,6 +580,7 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
 const boot = JSON.parse(document.getElementById('boot').textContent);
 let state = boot;
 let busy = false;
+let lastJobAlert = '';
 const nodeSelect = document.getElementById('node-select');
 const hostSelect = document.getElementById('host-select');
@@ -615,6 +616,22 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
 	return size.toFixed(size >= 10 || idx === 0 ? 0 : 1) + ' ' + units[idx];
 }
+function fmtDuration(startValue, endValue){
+	if(!startValue){ return ''; }
+	const start = new Date(startValue);
+	if(isNaN(start.getTime())){ return ''; }
+	const end = endValue ? new Date(endValue) : new Date();
+	if(isNaN(end.getTime())){ return ''; }
+	let seconds = Math.max(0, Math.round((end.getTime() - start.getTime()) / 1000));
+	const hours = Math.floor(seconds / 3600);
+	seconds -= hours * 3600;
+	const minutes = Math.floor(seconds / 60);
+	seconds -= minutes * 60;
+	if(hours){ return hours + 'h ' + minutes + 'm'; }
+	if(minutes){ return minutes + 'm ' + seconds + 's'; }
+	return seconds + 's';
+}
 function banner(kind, title, text){
 	bannerEl.className = 'banner ' + kind;
 	bannerTitleEl.textContent = title;
@@ -667,6 +684,15 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
 	const statusClass = job.status === 'error' ? 'error' : (job.status === 'done' ? 'done' : (job.status === 'running' ? 'running' : ''));
 	const title = job.kind.toUpperCase() + (job.node ? ' · ' + job.node : '');
 	const started = fmtTime(job.started_at) + (job.device ? ' · ' + job.device : '') + (job.host ? ' · ' + job.host : '');
+	const timingBits = [];
+	if(job.stage){ timingBits.push('stage: ' + job.stage); }
+	const duration = fmtDuration(job.started_at, job.finished_at);
+	if(duration){
+		timingBits.push((job.status === 'running' ? 'elapsed ' : 'duration ') + duration);
+	}
+	if(job.updated_at && job.status === 'running'){
+		timingBits.push('last update ' + fmtDuration(job.updated_at, new Date().toISOString()) + ' ago');
+	}
 	const detailBits = [];
 	if(job.written_bytes){ detailBits.push(fmtBytes(job.written_bytes) + ' / ' + fmtBytes(job.total_bytes)); }
 	if(job.artifact){ detailBits.push(job.artifact); }
@@ -678,10 +704,20 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
 		'</div>' +
 		'<div>' + (job.message || job.stage || 'queued') + '</div>' +
 		'<div class="meta">' + started + '</div>' +
+		'<div class="meta">' + timingBits.join(' · ') + '</div>' +
 		'<div class="meta">' + detailBits.join(' · ') + '</div>' +
 		'<div class="bar"><span style="width:' + Math.max(0, Math.min(100, job.progress_pct || 0)) + '%"></span></div>';
 	jobsEl.appendChild(wrap);
 	});
+	const newestError = jobs.find((job)=>job.status === 'error');
+	if(newestError){
+		const signature = [newestError.id, newestError.error || newestError.message || newestError.stage || 'error'].join(':');
+		if(signature !== lastJobAlert){
+			lastJobAlert = signature;
+			banner('error', 'Metis job failed', newestError.error || newestError.message || 'Check the live jobs panel for details.');
+		}
+	}
 }
 function renderEvents(){