package cluster import ( "encoding/json" "fmt" "os" "path/filepath" "strings" "time" "scm.bstein.dev/bstein/ananke/internal/state" ) // EstimatedShutdownSeconds runs one orchestration or CLI step. // Signature: (o *Orchestrator) EstimatedShutdownSeconds() int. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) EstimatedShutdownSeconds() int { return o.store.ShutdownP95WithMinSamples(o.cfg.Shutdown.DefaultBudgetSeconds, o.cfg.Shutdown.HistoryMinSamples) } // EstimatedEmergencyShutdownSeconds runs one orchestration or CLI step. // Signature: (o *Orchestrator) EstimatedEmergencyShutdownSeconds() int. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) EstimatedEmergencyShutdownSeconds() int { return o.store.ShutdownP95ByReasonPrefix( o.cfg.Shutdown.EmergencyBudgetSec, o.cfg.Shutdown.EmergencyMinSamples, []string{"ups-", "emergency-", "drill-emergency"}, ) } // finalizeRecord runs one orchestration or CLI step. // Signature: (o *Orchestrator) finalizeRecord(record *state.RunRecord, err *error). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) finalizeRecord(record *state.RunRecord, err *error) { record.EndedAt = time.Now().UTC() record.DurationSeconds = int(record.EndedAt.Sub(record.StartedAt).Seconds()) record.Success = *err == nil if *err != nil { record.Error = (*err).Error() } if appendErr := o.store.Append(*record); appendErr != nil { o.log.Printf("warning: append run record failed: %v", appendErr) } if artifactErr := o.writeRunRecordArtifact(*record); artifactErr != nil { o.log.Printf("warning: write run artifact failed: %v", artifactErr) } } // startupReportPath runs one orchestration or CLI step. // Signature: (o *Orchestrator) startupReportPath() string. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) startupReportPath() string { return filepath.Join(o.cfg.State.Dir, "last-startup-report.json") } // startupProgressPath runs one orchestration or CLI step. // Signature: (o *Orchestrator) startupProgressPath() string. // Why: status polling during long startup drills needs a live, on-disk snapshot // instead of waiting for final completion artifacts. func (o *Orchestrator) startupProgressPath() string { return filepath.Join(o.cfg.State.Dir, "startup-progress.json") } // lastShutdownReportPath runs one orchestration or CLI step. // Signature: (o *Orchestrator) lastShutdownReportPath() string. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) lastShutdownReportPath() string { return filepath.Join(o.cfg.State.Dir, "last-shutdown-report.json") } // reportArchiveDir runs one orchestration or CLI step. // Signature: (o *Orchestrator) reportArchiveDir() string. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) reportArchiveDir() string { dir := strings.TrimSpace(o.cfg.State.ReportsDir) if dir != "" { return dir } return filepath.Join(o.cfg.State.Dir, "reports") } // writeRunRecordArtifact runs one orchestration or CLI step. // Signature: (o *Orchestrator) writeRunRecordArtifact(record state.RunRecord) error. // Why: Every drill should leave durable timing and outcome evidence so we can // compare runs and tighten automation without relying on terminal history. func (o *Orchestrator) writeRunRecordArtifact(record state.RunRecord) error { dir := o.reportArchiveDir() if err := os.MkdirAll(dir, 0o750); err != nil { return fmt.Errorf("ensure report archive dir %s: %w", dir, err) } blob, err := json.MarshalIndent(record, "", " ") if err != nil { return fmt.Errorf("encode run record %s: %w", record.ID, err) } reportPath := filepath.Join(dir, sanitizeReportFileName(record.ID)+".json") if err := os.WriteFile(reportPath, blob, 0o640); err != nil { return fmt.Errorf("write run artifact %s: %w", reportPath, err) } if record.Action == "shutdown" { if err := os.WriteFile(o.lastShutdownReportPath(), blob, 0o640); err != nil { return fmt.Errorf("write last shutdown report: %w", err) } } return nil } // sanitizeReportFileName runs one orchestration or CLI step. // Signature: sanitizeReportFileName(value string) string. // Why: IDs can include punctuation from external inputs; sanitize to keep report // filenames portable and predictable across hosts. func sanitizeReportFileName(value string) string { value = strings.TrimSpace(value) if value == "" { return "run" } var b strings.Builder b.Grow(len(value)) for _, r := range value { switch { case r >= 'a' && r <= 'z': b.WriteRune(r) case r >= 'A' && r <= 'Z': b.WriteRune(r) case r >= '0' && r <= '9': b.WriteRune(r) case r == '-', r == '_': b.WriteRune(r) default: b.WriteRune('_') } } cleaned := strings.Trim(b.String(), "_") if cleaned == "" { return "run" } return cleaned } // cloneStartupReport runs one orchestration or CLI step. // Signature: cloneStartupReport(report *startupReport) *startupReport. // Why: a status poll can occur while startup is active; clone protects readers // from observing partially-mutated map/slice state. func cloneStartupReport(report *startupReport) *startupReport { if report == nil { return nil } cloned := *report cloned.Checks = make(map[string]startupCheckRecord, len(report.Checks)) for name, check := range report.Checks { cloned.Checks[name] = check } cloned.AutoHeals = append([]string{}, report.AutoHeals...) return &cloned } // writeStartupReportFile runs one orchestration or CLI step. // Signature: (o *Orchestrator) writeStartupReportFile(path string, report *startupReport) error. // Why: startup status needs one shared serializer so progress and final reports // stay consistent for both humans and automation. func (o *Orchestrator) writeStartupReportFile(path string, report *startupReport) error { if report == nil { return nil } if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil { return fmt.Errorf("ensure startup report dir %s: %w", filepath.Dir(path), err) } b, err := json.MarshalIndent(report, "", " ") if err != nil { return fmt.Errorf("encode startup report: %w", err) } if err := os.WriteFile(path, b, 0o640); err != nil { return fmt.Errorf("write startup report %s: %w", path, err) } return nil } // persistStartupProgress runs one orchestration or CLI step. // Signature: (o *Orchestrator) persistStartupProgress(report *startupReport). // Why: status polling must reflect in-flight checklist changes instead of only // a terminal success/failure snapshot. func (o *Orchestrator) persistStartupProgress(report *startupReport) { if report == nil { return } if err := o.writeStartupReportFile(o.startupProgressPath(), report); err != nil { o.log.Printf("warning: persist startup progress failed: %v", err) } } // setStartupPhase runs one orchestration or CLI step. // Signature: (o *Orchestrator) setStartupPhase(phase string, detail string). // Why: operators need to see where startup is paused without reading controller // logs line-by-line during a drill. func (o *Orchestrator) setStartupPhase(phase string, detail string) { phase = strings.TrimSpace(phase) detail = strings.TrimSpace(detail) o.startupReportMu.Lock() if o.activeStartupReport == nil { o.startupReportMu.Unlock() return } if phase != "" { o.activeStartupReport.Phase = phase } now := time.Now().UTC() o.activeStartupReport.LastUpdated = now if detail != "" { o.activeStartupReport.Checks["phase"] = startupCheckRecord{ Status: "running", Detail: detail, UpdatedAt: now, } } snapshot := cloneStartupReport(o.activeStartupReport) o.startupReportMu.Unlock() o.persistStartupProgress(snapshot) } // beginStartupReport runs one orchestration or CLI step. // Signature: (o *Orchestrator) beginStartupReport(reason string). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) beginStartupReport(reason string) { host, _ := os.Hostname() o.startupReportMu.Lock() now := time.Now().UTC() o.activeStartupReport = &startupReport{ StartedAt: now, Reason: strings.TrimSpace(reason), Status: "running", Phase: "startup-initializing", Checks: map[string]startupCheckRecord{}, AutoHeals: []string{}, SourceHost: strings.TrimSpace(host), LastUpdated: now, } snapshot := cloneStartupReport(o.activeStartupReport) o.startupReportMu.Unlock() o.persistStartupProgress(snapshot) } // noteStartupCheckState runs one orchestration or CLI step. // Signature: (o *Orchestrator) noteStartupCheckState(name, status, detail string). // Why: startup checks can be running, passed, or failed; explicit state improves // checklist introspection from the CLI while startup is still executing. func (o *Orchestrator) noteStartupCheckState(name, status, detail string) { name = strings.TrimSpace(name) status = strings.TrimSpace(status) detail = strings.TrimSpace(detail) if name == "" { return } switch status { case "passed", "failed", "running": default: status = "running" } o.startupReportMu.Lock() if o.activeStartupReport == nil { o.startupReportMu.Unlock() return } now := time.Now().UTC() o.activeStartupReport.LastUpdated = now o.activeStartupReport.Checks[name] = startupCheckRecord{ Status: status, Detail: detail, UpdatedAt: now, } snapshot := cloneStartupReport(o.activeStartupReport) o.startupReportMu.Unlock() o.persistStartupProgress(snapshot) } // noteStartupCheck runs one orchestration or CLI step. // Signature: (o *Orchestrator) noteStartupCheck(name string, success bool, detail string). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) noteStartupCheck(name string, success bool, detail string) { status := "failed" if success { status = "passed" } o.noteStartupCheckState(name, status, detail) } // noteStartupAutoHeal runs one orchestration or CLI step. // Signature: (o *Orchestrator) noteStartupAutoHeal(detail string). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) noteStartupAutoHeal(detail string) { o.startupReportMu.Lock() if o.activeStartupReport == nil { o.startupReportMu.Unlock() return } detail = strings.TrimSpace(detail) if detail == "" { o.startupReportMu.Unlock() return } o.activeStartupReport.AutoHeals = append(o.activeStartupReport.AutoHeals, detail) o.activeStartupReport.LastUpdated = time.Now().UTC() snapshot := cloneStartupReport(o.activeStartupReport) o.startupReportMu.Unlock() o.persistStartupProgress(snapshot) } // finalizeStartupReportSnapshot runs one orchestration or CLI step. // Signature: (o *Orchestrator) finalizeStartupReportSnapshot(report *startupReport) *startupReport. // Why: completion metadata should be consistent across both final and progress // report files so operators can trust whichever file they read. func (o *Orchestrator) finalizeStartupReportSnapshot(report *startupReport, runErr error) *startupReport { report = cloneStartupReport(report) if report == nil { return nil } report.Completed = time.Now().UTC() report.Success = runErr == nil report.Status = "success" report.Phase = "complete" report.LastUpdated = report.Completed if runErr != nil { report.Status = "failed" report.Phase = "failed" report.Error = strings.TrimSpace(runErr.Error()) } return report } // finalizeStartupReport runs one orchestration or CLI step. // Signature: (o *Orchestrator) finalizeStartupReport(runErr error). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) finalizeStartupReport(runErr error) { o.startupReportMu.Lock() report := o.activeStartupReport o.activeStartupReport = nil o.startupReportMu.Unlock() if report == nil { return } report = o.finalizeStartupReportSnapshot(report, runErr) path := o.startupReportPath() if writeErr := o.writeStartupReportFile(path, report); writeErr != nil { o.log.Printf("warning: %v", writeErr) return } o.persistStartupProgress(report) archivePath := filepath.Join( o.reportArchiveDir(), fmt.Sprintf("startup-report-%d.json", report.StartedAt.UnixNano()), ) if writeErr := o.writeStartupReportFile(archivePath, report); writeErr != nil { o.log.Printf("warning: %v", writeErr) } o.log.Printf("startup report written: %s", path) }