356 lines
12 KiB
Go
356 lines
12 KiB
Go
package cluster
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
|
)
|
|
|
|
// EstimatedShutdownSeconds runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) EstimatedShutdownSeconds() int.
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) EstimatedShutdownSeconds() int {
|
|
return o.store.ShutdownP95WithMinSamples(o.cfg.Shutdown.DefaultBudgetSeconds, o.cfg.Shutdown.HistoryMinSamples)
|
|
}
|
|
|
|
// EstimatedEmergencyShutdownSeconds runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) EstimatedEmergencyShutdownSeconds() int.
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) EstimatedEmergencyShutdownSeconds() int {
|
|
return o.store.ShutdownP95ByReasonPrefix(
|
|
o.cfg.Shutdown.EmergencyBudgetSec,
|
|
o.cfg.Shutdown.EmergencyMinSamples,
|
|
[]string{"ups-", "emergency-", "drill-emergency"},
|
|
)
|
|
}
|
|
|
|
// finalizeRecord runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) finalizeRecord(record *state.RunRecord, err *error).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) finalizeRecord(record *state.RunRecord, err *error) {
|
|
record.EndedAt = time.Now().UTC()
|
|
record.DurationSeconds = int(record.EndedAt.Sub(record.StartedAt).Seconds())
|
|
record.Success = *err == nil
|
|
if *err != nil {
|
|
record.Error = (*err).Error()
|
|
}
|
|
if appendErr := o.store.Append(*record); appendErr != nil {
|
|
o.log.Printf("warning: append run record failed: %v", appendErr)
|
|
}
|
|
if artifactErr := o.writeRunRecordArtifact(*record); artifactErr != nil {
|
|
o.log.Printf("warning: write run artifact failed: %v", artifactErr)
|
|
}
|
|
}
|
|
|
|
// startupReportPath runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) startupReportPath() string.
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) startupReportPath() string {
|
|
return filepath.Join(o.cfg.State.Dir, "last-startup-report.json")
|
|
}
|
|
|
|
// startupProgressPath runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) startupProgressPath() string.
|
|
// Why: status polling during long startup drills needs a live, on-disk snapshot
|
|
// instead of waiting for final completion artifacts.
|
|
func (o *Orchestrator) startupProgressPath() string {
|
|
return filepath.Join(o.cfg.State.Dir, "startup-progress.json")
|
|
}
|
|
|
|
// lastShutdownReportPath runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) lastShutdownReportPath() string.
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) lastShutdownReportPath() string {
|
|
return filepath.Join(o.cfg.State.Dir, "last-shutdown-report.json")
|
|
}
|
|
|
|
// reportArchiveDir runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) reportArchiveDir() string.
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) reportArchiveDir() string {
|
|
dir := strings.TrimSpace(o.cfg.State.ReportsDir)
|
|
if dir != "" {
|
|
return dir
|
|
}
|
|
return filepath.Join(o.cfg.State.Dir, "reports")
|
|
}
|
|
|
|
// writeRunRecordArtifact runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) writeRunRecordArtifact(record state.RunRecord) error.
|
|
// Why: Every drill should leave durable timing and outcome evidence so we can
|
|
// compare runs and tighten automation without relying on terminal history.
|
|
func (o *Orchestrator) writeRunRecordArtifact(record state.RunRecord) error {
|
|
dir := o.reportArchiveDir()
|
|
if err := os.MkdirAll(dir, 0o750); err != nil {
|
|
return fmt.Errorf("ensure report archive dir %s: %w", dir, err)
|
|
}
|
|
blob, err := json.MarshalIndent(record, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("encode run record %s: %w", record.ID, err)
|
|
}
|
|
reportPath := filepath.Join(dir, sanitizeReportFileName(record.ID)+".json")
|
|
if err := os.WriteFile(reportPath, blob, 0o640); err != nil {
|
|
return fmt.Errorf("write run artifact %s: %w", reportPath, err)
|
|
}
|
|
if record.Action == "shutdown" {
|
|
if err := os.WriteFile(o.lastShutdownReportPath(), blob, 0o640); err != nil {
|
|
return fmt.Errorf("write last shutdown report: %w", err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// sanitizeReportFileName converts an arbitrary ID into a safe file-name stem.
// Signature: sanitizeReportFileName(value string) string.
// Why: IDs can include punctuation from external inputs; sanitize to keep report
// filenames portable and predictable across hosts.
//
// Surrounding whitespace is trimmed, every rune outside [A-Za-z0-9_-] becomes
// '_', and leading/trailing underscores are stripped. If nothing usable
// remains, the fixed name "run" is returned.
func sanitizeReportFileName(value string) string {
	const fallback = "run"

	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return fallback
	}

	mapped := strings.Map(func(r rune) rune {
		isLetter := ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
		isDigit := '0' <= r && r <= '9'
		if isLetter || isDigit || r == '-' || r == '_' {
			return r
		}
		return '_'
	}, trimmed)

	if name := strings.Trim(mapped, "_"); name != "" {
		return name
	}
	return fallback
}
|
|
|
|
// cloneStartupReport runs one orchestration or CLI step.
|
|
// Signature: cloneStartupReport(report *startupReport) *startupReport.
|
|
// Why: a status poll can occur while startup is active; clone protects readers
|
|
// from observing partially-mutated map/slice state.
|
|
func cloneStartupReport(report *startupReport) *startupReport {
|
|
if report == nil {
|
|
return nil
|
|
}
|
|
cloned := *report
|
|
cloned.Checks = make(map[string]startupCheckRecord, len(report.Checks))
|
|
for name, check := range report.Checks {
|
|
cloned.Checks[name] = check
|
|
}
|
|
cloned.AutoHeals = append([]string{}, report.AutoHeals...)
|
|
return &cloned
|
|
}
|
|
|
|
// writeStartupReportFile runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) writeStartupReportFile(path string, report *startupReport) error.
|
|
// Why: startup status needs one shared serializer so progress and final reports
|
|
// stay consistent for both humans and automation.
|
|
func (o *Orchestrator) writeStartupReportFile(path string, report *startupReport) error {
|
|
if report == nil {
|
|
return nil
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
|
|
return fmt.Errorf("ensure startup report dir %s: %w", filepath.Dir(path), err)
|
|
}
|
|
b, err := json.MarshalIndent(report, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("encode startup report: %w", err)
|
|
}
|
|
if err := os.WriteFile(path, b, 0o640); err != nil {
|
|
return fmt.Errorf("write startup report %s: %w", path, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// persistStartupProgress runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) persistStartupProgress(report *startupReport).
|
|
// Why: status polling must reflect in-flight checklist changes instead of only
|
|
// a terminal success/failure snapshot.
|
|
func (o *Orchestrator) persistStartupProgress(report *startupReport) {
|
|
if report == nil {
|
|
return
|
|
}
|
|
if err := o.writeStartupReportFile(o.startupProgressPath(), report); err != nil {
|
|
o.log.Printf("warning: persist startup progress failed: %v", err)
|
|
}
|
|
}
|
|
|
|
// setStartupPhase runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) setStartupPhase(phase string, detail string).
|
|
// Why: operators need to see where startup is paused without reading controller
|
|
// logs line-by-line during a drill.
|
|
func (o *Orchestrator) setStartupPhase(phase string, detail string) {
|
|
phase = strings.TrimSpace(phase)
|
|
detail = strings.TrimSpace(detail)
|
|
o.startupReportMu.Lock()
|
|
if o.activeStartupReport == nil {
|
|
o.startupReportMu.Unlock()
|
|
return
|
|
}
|
|
if phase != "" {
|
|
o.activeStartupReport.Phase = phase
|
|
}
|
|
now := time.Now().UTC()
|
|
o.activeStartupReport.LastUpdated = now
|
|
if detail != "" {
|
|
o.activeStartupReport.Checks["phase"] = startupCheckRecord{
|
|
Status: "running",
|
|
Detail: detail,
|
|
UpdatedAt: now,
|
|
}
|
|
}
|
|
snapshot := cloneStartupReport(o.activeStartupReport)
|
|
o.startupReportMu.Unlock()
|
|
o.persistStartupProgress(snapshot)
|
|
}
|
|
|
|
// beginStartupReport runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) beginStartupReport(reason string).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) beginStartupReport(reason string) {
|
|
host, _ := os.Hostname()
|
|
o.startupReportMu.Lock()
|
|
now := time.Now().UTC()
|
|
o.activeStartupReport = &startupReport{
|
|
StartedAt: now,
|
|
Reason: strings.TrimSpace(reason),
|
|
Status: "running",
|
|
Phase: "startup-initializing",
|
|
Checks: map[string]startupCheckRecord{},
|
|
AutoHeals: []string{},
|
|
SourceHost: strings.TrimSpace(host),
|
|
LastUpdated: now,
|
|
}
|
|
snapshot := cloneStartupReport(o.activeStartupReport)
|
|
o.startupReportMu.Unlock()
|
|
o.persistStartupProgress(snapshot)
|
|
}
|
|
|
|
// noteStartupCheckState runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) noteStartupCheckState(name, status, detail string).
|
|
// Why: startup checks can be running, passed, or failed; explicit state improves
|
|
// checklist introspection from the CLI while startup is still executing.
|
|
func (o *Orchestrator) noteStartupCheckState(name, status, detail string) {
|
|
name = strings.TrimSpace(name)
|
|
status = strings.TrimSpace(status)
|
|
detail = strings.TrimSpace(detail)
|
|
if name == "" {
|
|
return
|
|
}
|
|
switch status {
|
|
case "passed", "failed", "running":
|
|
default:
|
|
status = "running"
|
|
}
|
|
o.startupReportMu.Lock()
|
|
if o.activeStartupReport == nil {
|
|
o.startupReportMu.Unlock()
|
|
return
|
|
}
|
|
now := time.Now().UTC()
|
|
o.activeStartupReport.LastUpdated = now
|
|
o.activeStartupReport.Checks[name] = startupCheckRecord{
|
|
Status: status,
|
|
Detail: detail,
|
|
UpdatedAt: now,
|
|
}
|
|
snapshot := cloneStartupReport(o.activeStartupReport)
|
|
o.startupReportMu.Unlock()
|
|
o.persistStartupProgress(snapshot)
|
|
}
|
|
|
|
// noteStartupCheck runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) noteStartupCheck(name string, success bool, detail string).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) noteStartupCheck(name string, success bool, detail string) {
|
|
status := "failed"
|
|
if success {
|
|
status = "passed"
|
|
}
|
|
o.noteStartupCheckState(name, status, detail)
|
|
}
|
|
|
|
// noteStartupAutoHeal runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) noteStartupAutoHeal(detail string).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) noteStartupAutoHeal(detail string) {
|
|
o.startupReportMu.Lock()
|
|
if o.activeStartupReport == nil {
|
|
o.startupReportMu.Unlock()
|
|
return
|
|
}
|
|
detail = strings.TrimSpace(detail)
|
|
if detail == "" {
|
|
o.startupReportMu.Unlock()
|
|
return
|
|
}
|
|
o.activeStartupReport.AutoHeals = append(o.activeStartupReport.AutoHeals, detail)
|
|
o.activeStartupReport.LastUpdated = time.Now().UTC()
|
|
snapshot := cloneStartupReport(o.activeStartupReport)
|
|
o.startupReportMu.Unlock()
|
|
o.persistStartupProgress(snapshot)
|
|
}
|
|
|
|
// finalizeStartupReportSnapshot runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) finalizeStartupReportSnapshot(report *startupReport) *startupReport.
|
|
// Why: completion metadata should be consistent across both final and progress
|
|
// report files so operators can trust whichever file they read.
|
|
func (o *Orchestrator) finalizeStartupReportSnapshot(report *startupReport, runErr error) *startupReport {
|
|
report = cloneStartupReport(report)
|
|
if report == nil {
|
|
return nil
|
|
}
|
|
report.Completed = time.Now().UTC()
|
|
report.Success = runErr == nil
|
|
report.Status = "success"
|
|
report.Phase = "complete"
|
|
report.LastUpdated = report.Completed
|
|
if runErr != nil {
|
|
report.Status = "failed"
|
|
report.Phase = "failed"
|
|
report.Error = strings.TrimSpace(runErr.Error())
|
|
}
|
|
return report
|
|
}
|
|
|
|
// finalizeStartupReport runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) finalizeStartupReport(runErr error).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (o *Orchestrator) finalizeStartupReport(runErr error) {
|
|
o.startupReportMu.Lock()
|
|
report := o.activeStartupReport
|
|
o.activeStartupReport = nil
|
|
o.startupReportMu.Unlock()
|
|
if report == nil {
|
|
return
|
|
}
|
|
report = o.finalizeStartupReportSnapshot(report, runErr)
|
|
path := o.startupReportPath()
|
|
if writeErr := o.writeStartupReportFile(path, report); writeErr != nil {
|
|
o.log.Printf("warning: %v", writeErr)
|
|
return
|
|
}
|
|
o.persistStartupProgress(report)
|
|
archivePath := filepath.Join(
|
|
o.reportArchiveDir(),
|
|
fmt.Sprintf("startup-report-%d.json", report.StartedAt.UnixNano()),
|
|
)
|
|
if writeErr := o.writeStartupReportFile(archivePath, report); writeErr != nil {
|
|
o.log.Printf("warning: %v", writeErr)
|
|
}
|
|
o.log.Printf("startup report written: %s", path)
|
|
}
|