// Source: ananke/internal/cluster/orchestrator_report.go (Go, 356 lines, 12 KiB)

package cluster
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// EstimatedShutdownSeconds returns the shutdown duration estimate, in seconds,
// that the run-history store computes from the configured default budget and
// minimum sample count.
// Signature: (o *Orchestrator) EstimatedShutdownSeconds() int.
// Why: callers get a single entry point for the estimate instead of combining
// store and shutdown config fields themselves.
// NOTE(review): presumably the store falls back to the default budget when it
// holds fewer than HistoryMinSamples runs — confirm in the state package.
func (o *Orchestrator) EstimatedShutdownSeconds() int {
	defaultBudget := o.cfg.Shutdown.DefaultBudgetSeconds
	minSamples := o.cfg.Shutdown.HistoryMinSamples
	return o.store.ShutdownP95WithMinSamples(defaultBudget, minSamples)
}
// EstimatedEmergencyShutdownSeconds returns the shutdown duration estimate, in
// seconds, that the run-history store computes for runs whose reason starts
// with one of the emergency prefixes ("ups-", "emergency-", "drill-emergency").
// Signature: (o *Orchestrator) EstimatedEmergencyShutdownSeconds() int.
// Why: emergency shutdowns are budgeted separately from routine ones, so their
// estimate uses its own budget and minimum sample count from the config.
func (o *Orchestrator) EstimatedEmergencyShutdownSeconds() int {
	emergencyPrefixes := []string{"ups-", "emergency-", "drill-emergency"}
	budget := o.cfg.Shutdown.EmergencyBudgetSec
	minSamples := o.cfg.Shutdown.EmergencyMinSamples
	return o.store.ShutdownP95ByReasonPrefix(budget, minSamples, emergencyPrefixes)
}
// finalizeRecord stamps the end time, duration, and outcome onto record, then
// appends it to the run-history store and writes it out as a report artifact.
// Signature: (o *Orchestrator) finalizeRecord(record *state.RunRecord, err *error).
// Why: err is a pointer so the outcome is read at call time, not when the
// call was set up; persistence failures are only logged so they never mask the
// run's own result.
// Both record and err must be non-nil; they are dereferenced unconditionally.
func (o *Orchestrator) finalizeRecord(record *state.RunRecord, err *error) {
	finished := time.Now().UTC()
	record.EndedAt = finished
	// Whole seconds only: the fractional part is truncated by the int conversion.
	record.DurationSeconds = int(finished.Sub(record.StartedAt).Seconds())

	runErr := *err
	record.Success = runErr == nil
	if runErr != nil {
		record.Error = runErr.Error()
	}

	// Both persistence steps are best-effort and independent of each other.
	if storeErr := o.store.Append(*record); storeErr != nil {
		o.log.Printf("warning: append run record failed: %v", storeErr)
	}
	if fileErr := o.writeRunRecordArtifact(*record); fileErr != nil {
		o.log.Printf("warning: write run artifact failed: %v", fileErr)
	}
}
// startupReportPath returns the location of the final startup report,
// "last-startup-report.json" inside the configured state directory.
// Signature: (o *Orchestrator) startupReportPath() string.
// Why: one definition of the path keeps the writer and any reader in agreement.
func (o *Orchestrator) startupReportPath() string {
	const fileName = "last-startup-report.json"
	return filepath.Join(o.cfg.State.Dir, fileName)
}
// startupProgressPath returns the location of the in-flight startup snapshot,
// "startup-progress.json" inside the configured state directory.
// Signature: (o *Orchestrator) startupProgressPath() string.
// Why: status polling during long startup drills needs a live, on-disk snapshot
// instead of waiting for final completion artifacts.
func (o *Orchestrator) startupProgressPath() string {
	const fileName = "startup-progress.json"
	return filepath.Join(o.cfg.State.Dir, fileName)
}
// lastShutdownReportPath returns the location of the most recent shutdown
// report, "last-shutdown-report.json" inside the configured state directory.
// Signature: (o *Orchestrator) lastShutdownReportPath() string.
// Why: one definition of the path keeps the writer and any reader in agreement.
func (o *Orchestrator) lastShutdownReportPath() string {
	const fileName = "last-shutdown-report.json"
	return filepath.Join(o.cfg.State.Dir, fileName)
}
// reportArchiveDir returns the directory that holds archived run reports: the
// configured ReportsDir (whitespace-trimmed) when it is non-empty, otherwise
// the "reports" subdirectory of the state directory.
// Signature: (o *Orchestrator) reportArchiveDir() string.
// Why: operators may redirect the archive elsewhere, but an unset or blank
// value must still resolve to a usable location.
func (o *Orchestrator) reportArchiveDir() string {
	if configured := strings.TrimSpace(o.cfg.State.ReportsDir); configured != "" {
		return configured
	}
	return filepath.Join(o.cfg.State.Dir, "reports")
}
// writeRunRecordArtifact serializes record as indented JSON into the report
// archive directory, in a file named after the sanitized record ID. When the
// record's Action is "shutdown", the same bytes are also written to the
// last-shutdown report path.
// Signature: (o *Orchestrator) writeRunRecordArtifact(record state.RunRecord) error.
// Why: Every drill should leave durable timing and outcome evidence so we can
// compare runs and tighten automation without relying on terminal history.
// Returns a wrapped error from the first directory-creation, encoding, or
// write step that fails; later steps are not attempted.
func (o *Orchestrator) writeRunRecordArtifact(record state.RunRecord) error {
	dir := o.reportArchiveDir()
	if err := os.MkdirAll(dir, 0o750); err != nil {
		return fmt.Errorf("ensure report archive dir %s: %w", dir, err)
	}
	blob, err := json.MarshalIndent(record, "", " ")
	if err != nil {
		return fmt.Errorf("encode run record %s: %w", record.ID, err)
	}
	reportPath := filepath.Join(dir, sanitizeReportFileName(record.ID)+".json")
	if err := os.WriteFile(reportPath, blob, 0o640); err != nil {
		return fmt.Errorf("write run artifact %s: %w", reportPath, err)
	}
	if record.Action != "shutdown" {
		return nil
	}
	// The last-shutdown report lives in the state directory, which differs from
	// the archive directory whenever ReportsDir is configured, so its parent
	// has to be ensured separately before writing.
	lastPath := o.lastShutdownReportPath()
	lastDir := filepath.Dir(lastPath)
	if err := os.MkdirAll(lastDir, 0o750); err != nil {
		return fmt.Errorf("ensure last shutdown report dir %s: %w", lastDir, err)
	}
	if err := os.WriteFile(lastPath, blob, 0o640); err != nil {
		return fmt.Errorf("write last shutdown report: %w", err)
	}
	return nil
}
// sanitizeReportFileName converts value into a string that is safe to use as a
// file name: surrounding whitespace is trimmed, every rune other than ASCII
// letters, ASCII digits, '-' and '_' becomes '_', and leading/trailing
// underscores are then stripped. An empty result at either stage yields "run".
// Signature: sanitizeReportFileName(value string) string.
// Why: IDs can include punctuation from external inputs; sanitize to keep report
// filenames portable and predictable across hosts.
func sanitizeReportFileName(value string) string {
	const fallback = "run"

	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return fallback
	}

	// keepOrMask passes the allowed ASCII set through and masks everything
	// else, including non-ASCII runes and invalid UTF-8 bytes, with '_'.
	keepOrMask := func(r rune) rune {
		isLetter := (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
		isDigit := r >= '0' && r <= '9'
		if isLetter || isDigit || r == '-' || r == '_' {
			return r
		}
		return '_'
	}

	if cleaned := strings.Trim(strings.Map(keepOrMask, trimmed), "_"); cleaned != "" {
		return cleaned
	}
	return fallback
}
// cloneStartupReport returns a copy of report whose Checks map and AutoHeals
// slice are freshly allocated, so later changes to the original's map or slice
// are not visible through the copy. All other fields are copied by plain
// struct assignment. A nil report yields nil.
// Signature: cloneStartupReport(report *startupReport) *startupReport.
// Why: a status poll can occur while startup is active; clone protects readers
// from observing partially-mutated map/slice state.
func cloneStartupReport(report *startupReport) *startupReport {
	if report == nil {
		return nil
	}

	dup := new(startupReport)
	*dup = *report

	// Always allocate, even for empty inputs, so the copy never carries a nil
	// map or nil slice.
	checks := make(map[string]startupCheckRecord, len(report.Checks))
	for key, rec := range report.Checks {
		checks[key] = rec
	}
	dup.Checks = checks

	heals := make([]string, len(report.AutoHeals))
	copy(heals, report.AutoHeals)
	dup.AutoHeals = heals

	return dup
}
// writeStartupReportFile encodes report as indented JSON and installs it at
// path, creating the parent directory if needed. A nil report is a no-op that
// returns nil.
// Signature: (o *Orchestrator) writeStartupReportFile(path string, report *startupReport) error.
// Why: startup status needs one shared serializer so progress and final reports
// stay consistent for both humans and automation.
// The payload is staged in a temporary file in the same directory and renamed
// over path, so a reader polling the file sees either the previous complete
// document or the new one, never a truncated or interleaved write.
// Returns a wrapped error if the directory, encoding, staging, or rename step
// fails; the staged temp file is removed on failure (best-effort).
func (o *Orchestrator) writeStartupReportFile(path string, report *startupReport) error {
	if report == nil {
		return nil
	}
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o750); err != nil {
		return fmt.Errorf("ensure startup report dir %s: %w", dir, err)
	}
	b, err := json.MarshalIndent(report, "", " ")
	if err != nil {
		return fmt.Errorf("encode startup report: %w", err)
	}

	// Stage next to the target so the rename stays on one filesystem.
	tmp, err := os.CreateTemp(dir, "."+filepath.Base(path)+".tmp-*")
	if err != nil {
		return fmt.Errorf("write startup report %s: %w", path, err)
	}
	tmpPath := tmp.Name()

	// stage writes the payload, applies the report file mode, and closes the
	// temp file, closing it on every failure path as well.
	stage := func() error {
		if _, werr := tmp.Write(b); werr != nil {
			_ = tmp.Close() // the write error is the one worth reporting
			return werr
		}
		// CreateTemp uses 0600; set the intended report mode explicitly.
		if cerr := tmp.Chmod(0o640); cerr != nil {
			_ = tmp.Close() // the chmod error is the one worth reporting
			return cerr
		}
		return tmp.Close()
	}
	if err := stage(); err != nil {
		_ = os.Remove(tmpPath) // best-effort cleanup of the staged file
		return fmt.Errorf("write startup report %s: %w", path, err)
	}
	if err := os.Rename(tmpPath, path); err != nil {
		_ = os.Remove(tmpPath) // best-effort cleanup of the staged file
		return fmt.Errorf("write startup report %s: %w", path, err)
	}
	return nil
}
// persistStartupProgress writes report to the startup progress file. A nil
// report is ignored, and a write failure is logged as a warning rather than
// returned.
// Signature: (o *Orchestrator) persistStartupProgress(report *startupReport).
// Why: status polling must reflect in-flight checklist changes instead of only
// a terminal success/failure snapshot.
func (o *Orchestrator) persistStartupProgress(report *startupReport) {
	if report == nil {
		return
	}
	err := o.writeStartupReportFile(o.startupProgressPath(), report)
	if err == nil {
		return
	}
	o.log.Printf("warning: persist startup progress failed: %v", err)
}
// setStartupPhase records the current phase on the active startup report and
// persists a snapshot of it. Both arguments are whitespace-trimmed first. A
// non-empty phase replaces the report's Phase; a non-empty detail is stored as
// a "running" check under the key "phase". LastUpdated is refreshed either way.
// Nothing happens when no startup report is active.
// Signature: (o *Orchestrator) setStartupPhase(phase string, detail string).
// Why: operators need to see where startup is paused without reading controller
// logs line-by-line during a drill.
func (o *Orchestrator) setStartupPhase(phase string, detail string) {
	phase, detail = strings.TrimSpace(phase), strings.TrimSpace(detail)

	// Mutate and clone under the lock; file I/O happens after it is released.
	snapshot := func() *startupReport {
		o.startupReportMu.Lock()
		defer o.startupReportMu.Unlock()

		active := o.activeStartupReport
		if active == nil {
			return nil
		}
		if phase != "" {
			active.Phase = phase
		}
		stamp := time.Now().UTC()
		active.LastUpdated = stamp
		if detail != "" {
			active.Checks["phase"] = startupCheckRecord{
				Status:    "running",
				Detail:    detail,
				UpdatedAt: stamp,
			}
		}
		return cloneStartupReport(active)
	}()
	if snapshot == nil {
		return
	}
	o.persistStartupProgress(snapshot)
}
// beginStartupReport replaces the active startup report with a fresh one in
// the "running" status and "startup-initializing" phase, with empty Checks and
// AutoHeals, then persists a snapshot of it to the progress file.
// Signature: (o *Orchestrator) beginStartupReport(reason string).
// Why: every startup run gets a clean report whose start time, reason, and
// originating host are captured before any checks are recorded.
func (o *Orchestrator) beginStartupReport(reason string) {
	// A hostname lookup failure is deliberately ignored: SourceHost is purely
	// informational and simply carries whatever os.Hostname returned.
	hostname, _ := os.Hostname()

	// Install and clone under the lock; file I/O happens after it is released.
	snapshot := func() *startupReport {
		o.startupReportMu.Lock()
		defer o.startupReportMu.Unlock()

		startedAt := time.Now().UTC()
		fresh := &startupReport{
			StartedAt:   startedAt,
			Reason:      strings.TrimSpace(reason),
			Status:      "running",
			Phase:       "startup-initializing",
			Checks:      map[string]startupCheckRecord{},
			AutoHeals:   []string{},
			SourceHost:  strings.TrimSpace(hostname),
			LastUpdated: startedAt,
		}
		o.activeStartupReport = fresh
		return cloneStartupReport(fresh)
	}()
	o.persistStartupProgress(snapshot)
}
// noteStartupCheckState stores the state of one named check on the active
// startup report and persists a snapshot. All arguments are whitespace-trimmed;
// an empty name is ignored, and any status other than "passed", "failed", or
// "running" is recorded as "running". Nothing happens when no startup report
// is active.
// Signature: (o *Orchestrator) noteStartupCheckState(name, status, detail string).
// Why: startup checks can be running, passed, or failed; explicit state improves
// checklist introspection from the CLI while startup is still executing.
func (o *Orchestrator) noteStartupCheckState(name, status, detail string) {
	name = strings.TrimSpace(name)
	if name == "" {
		return
	}
	status = strings.TrimSpace(status)
	detail = strings.TrimSpace(detail)
	if status != "passed" && status != "failed" && status != "running" {
		status = "running"
	}

	// Mutate and clone under the lock; file I/O happens after it is released.
	snapshot := func() *startupReport {
		o.startupReportMu.Lock()
		defer o.startupReportMu.Unlock()

		active := o.activeStartupReport
		if active == nil {
			return nil
		}
		stamp := time.Now().UTC()
		active.LastUpdated = stamp
		active.Checks[name] = startupCheckRecord{
			Status:    status,
			Detail:    detail,
			UpdatedAt: stamp,
		}
		return cloneStartupReport(active)
	}()
	if snapshot == nil {
		return
	}
	o.persistStartupProgress(snapshot)
}
// noteStartupCheck records a finished check on the active startup report,
// mapping success to the "passed" status and anything else to "failed".
// Signature: (o *Orchestrator) noteStartupCheck(name string, success bool, detail string).
// Why: most call sites only know pass/fail, so this spares them from spelling
// out status strings.
func (o *Orchestrator) noteStartupCheck(name string, success bool, detail string) {
	if success {
		o.noteStartupCheckState(name, "passed", detail)
		return
	}
	o.noteStartupCheckState(name, "failed", detail)
}
// noteStartupAutoHeal appends a whitespace-trimmed detail to the active startup
// report's AutoHeals list, refreshes LastUpdated, and persists a snapshot. An
// empty detail or the absence of an active report makes the call a no-op.
// Signature: (o *Orchestrator) noteStartupAutoHeal(detail string).
// Why: automatic repairs performed during startup should be visible in the
// report rather than only in logs.
func (o *Orchestrator) noteStartupAutoHeal(detail string) {
	entry := strings.TrimSpace(detail)
	if entry == "" {
		return
	}

	// Mutate and clone under the lock; file I/O happens after it is released.
	snapshot := func() *startupReport {
		o.startupReportMu.Lock()
		defer o.startupReportMu.Unlock()

		active := o.activeStartupReport
		if active == nil {
			return nil
		}
		active.AutoHeals = append(active.AutoHeals, entry)
		active.LastUpdated = time.Now().UTC()
		return cloneStartupReport(active)
	}()
	if snapshot == nil {
		return
	}
	o.persistStartupProgress(snapshot)
}
// finalizeStartupReportSnapshot returns a clone of report stamped with
// completion metadata; the input report is not modified. Completed and
// LastUpdated are set to the current UTC time. With a nil runErr the clone is
// marked Success with status "success" and phase "complete"; otherwise status
// and phase are both "failed" and Error holds the trimmed error text. A nil
// report yields nil.
// Signature: (o *Orchestrator) finalizeStartupReportSnapshot(report *startupReport, runErr error) *startupReport.
// Why: completion metadata should be consistent across both final and progress
// report files so operators can trust whichever file they read.
func (o *Orchestrator) finalizeStartupReportSnapshot(report *startupReport, runErr error) *startupReport {
	final := cloneStartupReport(report)
	if final == nil {
		return nil
	}

	finishedAt := time.Now().UTC()
	final.Completed = finishedAt
	final.LastUpdated = finishedAt
	final.Success = runErr == nil

	if runErr == nil {
		final.Status, final.Phase = "success", "complete"
		return final
	}
	final.Status, final.Phase = "failed", "failed"
	final.Error = strings.TrimSpace(runErr.Error())
	return final
}
// finalizeStartupReport detaches the active startup report, stamps it with the
// outcome of runErr, and writes it to the final report path, the progress
// file, and a timestamped file in the report archive directory. It does
// nothing when no report is active.
// Signature: (o *Orchestrator) finalizeStartupReport(runErr error).
// Why: the end of a startup run must leave a final report, a matching progress
// file, and an archived copy for later comparison.
// If the final report cannot be written, the failure is logged and the
// progress and archive writes are skipped. An archive write failure is logged
// but does not suppress the closing "startup report written" log line.
func (o *Orchestrator) finalizeStartupReport(runErr error) {
	// Detach under the lock so later note* calls see no active report.
	detached := func() *startupReport {
		o.startupReportMu.Lock()
		defer o.startupReportMu.Unlock()
		current := o.activeStartupReport
		o.activeStartupReport = nil
		return current
	}()
	if detached == nil {
		return
	}

	final := o.finalizeStartupReportSnapshot(detached, runErr)
	finalPath := o.startupReportPath()
	if err := o.writeStartupReportFile(finalPath, final); err != nil {
		o.log.Printf("warning: %v", err)
		return
	}
	o.persistStartupProgress(final)

	// Archive name is keyed by start time in nanoseconds.
	archiveName := fmt.Sprintf("startup-report-%d.json", final.StartedAt.UnixNano())
	archivePath := filepath.Join(o.reportArchiveDir(), archiveName)
	if err := o.writeStartupReportFile(archivePath, final); err != nil {
		o.log.Printf("warning: %v", err)
	}
	o.log.Printf("startup report written: %s", finalPath)
}