service: avoid kubelet log dependency for remote workers
This commit is contained in:
parent
bd61275821
commit
801374d184
@ -40,10 +40,7 @@ func remoteDevicesCmd(args []string) {
|
||||
}
|
||||
return devices[i].Path < devices[j].Path
|
||||
})
|
||||
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
_ = enc.Encode(map[string]any{"devices": devices})
|
||||
writeStructuredResult(map[string]any{"devices": devices})
|
||||
}
|
||||
|
||||
func remoteBuildCmd(args []string) {
|
||||
@ -116,9 +113,7 @@ func remoteBuildCmd(args []string) {
|
||||
UpdatedAt: builtAt,
|
||||
SizeBytes: info.Size(),
|
||||
}
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
_ = enc.Encode(summary)
|
||||
writeStructuredResult(summary)
|
||||
}
|
||||
|
||||
func remoteFlashCmd(args []string) {
|
||||
@ -177,9 +172,7 @@ func remoteFlashCmd(args []string) {
|
||||
if err != nil {
|
||||
log.Fatalf("stat destination: %v", err)
|
||||
}
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
_ = enc.Encode(map[string]any{
|
||||
writeStructuredResult(map[string]any{
|
||||
"node": *node,
|
||||
"device": *device,
|
||||
"dest_path": destPath,
|
||||
@ -187,6 +180,19 @@ func remoteFlashCmd(args []string) {
|
||||
})
|
||||
}
|
||||
|
||||
// writeStructuredResult marshals payload as JSON, prints it on stdout with a
// trailing newline, and mirrors the raw JSON into the container termination
// log. Kubernetes surfaces that file in the pod's terminated-container
// status, so Metis can read worker results even when the kubelet log
// endpoint is unreachable. Any marshal or stdout failure aborts the process.
func writeStructuredResult(payload any) {
	encoded, err := json.Marshal(payload)
	if err != nil {
		log.Fatalf("encode result: %v", err)
	}
	line := append(encoded, '\n')
	if _, err := os.Stdout.Write(line); err != nil {
		log.Fatalf("write stdout result: %v", err)
	}
	// Best effort only: on hosts without /dev/termination-log this simply
	// fails silently, and stdout already carries the result.
	_ = os.WriteFile("/dev/termination-log", encoded, 0o644)
}
|
||||
|
||||
func localFlashDevices(maxBytes int64, hostTmpDir string) ([]service.Device, error) {
|
||||
cmd := exec.Command("lsblk", "-J", "-b", "-o", "NAME,PATH,RM,HOTPLUG,SIZE,MODEL,TRAN,TYPE")
|
||||
out, err := cmd.Output()
|
||||
|
||||
@ -57,6 +57,7 @@ type Job struct {
|
||||
Total int64 `json:"total_bytes,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
UpdatedAt time.Time `json:"updated_at,omitempty"`
|
||||
FinishedAt time.Time `json:"finished_at,omitempty"`
|
||||
}
|
||||
|
||||
@ -314,6 +315,7 @@ func (a *App) newJob(kind, node, host, device string) *Job {
|
||||
Status: JobQueued,
|
||||
ProgressPct: 0,
|
||||
StartedAt: time.Now().UTC(),
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}
|
||||
a.mu.Lock()
|
||||
a.jobs[job.ID] = job
|
||||
@ -335,6 +337,7 @@ func (a *App) setJob(id string, update func(*Job)) {
|
||||
return
|
||||
}
|
||||
update(job)
|
||||
job.UpdatedAt = time.Now().UTC()
|
||||
}
|
||||
|
||||
func (a *App) failJob(id string, err error) {
|
||||
@ -356,6 +359,7 @@ func (a *App) completeJob(id string, update func(*Job)) {
|
||||
if job.Status != JobError {
|
||||
job.Status = JobDone
|
||||
}
|
||||
job.UpdatedAt = time.Now().UTC()
|
||||
job.FinishedAt = time.Now().UTC()
|
||||
}
|
||||
|
||||
|
||||
@ -178,15 +178,26 @@ func (a *App) runRemotePod(jobID, podName string, podSpec map[string]any) (strin
|
||||
}
|
||||
|
||||
deadline := time.Now().Add(12 * time.Minute)
|
||||
lastState := podState{Name: podName}
|
||||
for time.Now().Before(deadline) {
|
||||
state, err := a.remotePodState(kube, podName)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
lastState = state
|
||||
if strings.TrimSpace(jobID) != "" {
|
||||
a.setJob(jobID, func(_ *Job) {})
|
||||
}
|
||||
switch state.Phase {
|
||||
case "Succeeded":
|
||||
if strings.TrimSpace(state.Message) != "" {
|
||||
return strings.TrimSpace(state.Message), nil
|
||||
}
|
||||
return a.remotePodLogs(kube, podName)
|
||||
case "Failed":
|
||||
if strings.TrimSpace(state.Message) != "" {
|
||||
return "", fmt.Errorf("remote pod %s failed: %s", podName, strings.TrimSpace(state.Message))
|
||||
}
|
||||
logs, _ := a.remotePodLogs(kube, podName)
|
||||
if strings.TrimSpace(logs) != "" {
|
||||
return "", fmt.Errorf("remote pod %s failed: %s", podName, strings.TrimSpace(logs))
|
||||
@ -195,6 +206,9 @@ func (a *App) runRemotePod(jobID, podName string, podSpec map[string]any) (strin
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
if lastState.Phase != "" {
|
||||
return "", fmt.Errorf("remote pod %s timed out in phase %s: %s %s", podName, lastState.Phase, strings.TrimSpace(lastState.Reason), strings.TrimSpace(lastState.Message))
|
||||
}
|
||||
return "", fmt.Errorf("remote pod %s timed out", podName)
|
||||
}
|
||||
|
||||
@ -268,7 +282,11 @@ func (a *App) remotePodLogs(kube *kubeClient, podName string) (string, error) {
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
||||
return "", fmt.Errorf("pod logs %s failed: %s: %s", podName, resp.Status, strings.TrimSpace(string(body)))
|
||||
message := strings.TrimSpace(string(body))
|
||||
if strings.Contains(message, "proxy error from 127.0.0.1:6443") || strings.Contains(message, "containerLogs") {
|
||||
return "", fmt.Errorf("pod logs %s failed because Kubernetes could not reach the node kubelet log endpoint: %s", podName, message)
|
||||
}
|
||||
return "", fmt.Errorf("pod logs %s failed: %s: %s", podName, resp.Status, message)
|
||||
}
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
|
||||
if err != nil {
|
||||
|
||||
@ -580,6 +580,7 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
|
||||
const boot = JSON.parse(document.getElementById('boot').textContent);
|
||||
let state = boot;
|
||||
let busy = false;
|
||||
let lastJobAlert = '';
|
||||
|
||||
const nodeSelect = document.getElementById('node-select');
|
||||
const hostSelect = document.getElementById('host-select');
|
||||
@ -615,6 +616,22 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
|
||||
return size.toFixed(size >= 10 || idx === 0 ? 0 : 1) + ' ' + units[idx];
|
||||
}
|
||||
|
||||
// Render the elapsed span between two timestamps as 'Nh Nm', 'Nm Ns', or
// 'Ns'. When endValue is falsy the current time is used; unparsable or
// missing start input yields ''.
function fmtDuration(startValue, endValue){
  if(!startValue){ return ''; }
  const startMs = new Date(startValue).getTime();
  if(isNaN(startMs)){ return ''; }
  const endMs = (endValue ? new Date(endValue) : new Date()).getTime();
  if(isNaN(endMs)){ return ''; }
  // Negative spans clamp to zero so clock skew never shows garbage.
  const total = Math.max(0, Math.round((endMs - startMs) / 1000));
  const hours = Math.floor(total / 3600);
  const minutes = Math.floor((total % 3600) / 60);
  const secs = total % 60;
  if(hours){ return hours + 'h ' + minutes + 'm'; }
  if(minutes){ return minutes + 'm ' + secs + 's'; }
  return secs + 's';
}
|
||||
|
||||
function banner(kind, title, text){
|
||||
bannerEl.className = 'banner ' + kind;
|
||||
bannerTitleEl.textContent = title;
|
||||
@ -667,6 +684,15 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
|
||||
const statusClass = job.status === 'error' ? 'error' : (job.status === 'done' ? 'done' : (job.status === 'running' ? 'running' : ''));
|
||||
const title = job.kind.toUpperCase() + (job.node ? ' · ' + job.node : '');
|
||||
const started = fmtTime(job.started_at) + (job.device ? ' · ' + job.device : '') + (job.host ? ' · ' + job.host : '');
|
||||
const timingBits = [];
|
||||
if(job.stage){ timingBits.push('stage: ' + job.stage); }
|
||||
const duration = fmtDuration(job.started_at, job.finished_at);
|
||||
if(duration){
|
||||
timingBits.push((job.status === 'running' ? 'elapsed ' : 'duration ') + duration);
|
||||
}
|
||||
if(job.updated_at && job.status === 'running'){
|
||||
timingBits.push('last update ' + fmtDuration(job.updated_at, new Date().toISOString()) + ' ago');
|
||||
}
|
||||
const detailBits = [];
|
||||
if(job.written_bytes){ detailBits.push(fmtBytes(job.written_bytes) + ' / ' + fmtBytes(job.total_bytes)); }
|
||||
if(job.artifact){ detailBits.push(job.artifact); }
|
||||
@ -678,10 +704,20 @@ var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
|
||||
'</div>' +
|
||||
'<div>' + (job.message || job.stage || 'queued') + '</div>' +
|
||||
'<div class="meta">' + started + '</div>' +
|
||||
'<div class="meta">' + timingBits.join(' · ') + '</div>' +
|
||||
'<div class="meta">' + detailBits.join(' · ') + '</div>' +
|
||||
'<div class="bar"><span style="width:' + Math.max(0, Math.min(100, job.progress_pct || 0)) + '%"></span></div>';
|
||||
jobsEl.appendChild(wrap);
|
||||
});
|
||||
|
||||
const newestError = jobs.find((job)=>job.status === 'error');
|
||||
if(newestError){
|
||||
const signature = [newestError.id, newestError.error || newestError.message || newestError.stage || 'error'].join(':');
|
||||
if(signature !== lastJobAlert){
|
||||
lastJobAlert = signature;
|
||||
banner('error', 'Metis job failed', newestError.error || newestError.message || 'Check the live jobs panel for details.');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function renderEvents(){
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user