// Package state persists a bounded JSON run history on disk and provides a
// pid-based lock file for coordinating processes.
package state

import (
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"
)

// RunRecord is one entry of the run history. Store.Append serializes a slice
// of these as a JSON array, so the struct tags below define the on-disk format.
type RunRecord struct {
	ID              string    `json:"id"`
	Action          string    `json:"action"`
	Reason          string    `json:"reason,omitempty"`
	DryRun          bool      `json:"dry_run,omitempty"`
	StartedAt       time.Time `json:"started_at"`
	EndedAt         time.Time `json:"ended_at"`
	DurationSeconds int       `json:"duration_seconds"`
	Success         bool      `json:"success"`
	Error           string    `json:"error,omitempty"`
}

// Store reads and writes the run-history file at path. mu serializes Append
// and Load within this process; it offers no protection against other
// processes touching the same file.
type Store struct {
	path string
	mu   sync.Mutex
}

// New returns a Store bound to path. It does not touch the filesystem.
func New(path string) *Store {
	return &Store{path: path}
}

// EnsureDir creates dir, including any missing parents, with mode 0o750.
// It returns an error when dir is empty or the directory cannot be created.
func EnsureDir(dir string) error {
	if dir == "" {
		return fmt.Errorf("state dir must not be empty")
	}
	return os.MkdirAll(dir, 0o750)
}

// AcquireLock creates an exclusive lock file at path, creating the parent
// directory first if needed, and returns a function that removes the file
// again. The file records the owner as "pid=<pid> started=<RFC 3339 time>".
//
// If the file already exists, staleLock decides whether it may be reclaimed:
// a stale lock is removed and re-created, while a lock whose owner still
// appears to be running makes AcquireLock return an error.
func AcquireLock(path string) (func(), error) { if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil { return nil, err } create := func() (func(), error) { f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return nil, err } _, _ = f.WriteString(fmt.Sprintf("pid=%d started=%s\n", os.Getpid(), time.Now().Format(time.RFC3339))) _ = f.Close() return func() { _ = os.Remove(path) }, nil } unlock, err := create() if err == nil { return unlock, nil } if !errors.Is(err, os.ErrExist) { return nil, fmt.Errorf("acquire lock %s: %w", path, err) } stale, staleErr := staleLock(path) if staleErr != nil { return nil, fmt.Errorf("acquire lock %s: existing lock check failed: %w", path, staleErr) } if !stale { return nil, fmt.Errorf("acquire lock %s: lock is held by active process", path) } if rmErr := os.Remove(path); rmErr != nil { return nil, fmt.Errorf("acquire lock %s: remove stale lock: %w", path, rmErr) } unlock, err = create() if err != nil { return nil, fmt.Errorf("acquire lock %s: recreate after stale lock removal: %w", path, err) } return unlock, nil } // staleLock runs one orchestration or CLI step. // Signature: staleLock(path string) (bool, error). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func staleLock(path string) (bool, error) { b, err := os.ReadFile(path) if err != nil { if os.IsNotExist(err) { return true, nil } return false, err } lines := strings.Split(string(b), "\n") var pid int for _, line := range lines { line = strings.TrimSpace(line) if strings.HasPrefix(line, "pid=") { v := strings.TrimPrefix(line, "pid=") if fields := strings.Fields(v); len(fields) > 0 { v = fields[0] } parsed, parseErr := strconv.Atoi(v) if parseErr != nil { return true, nil } pid = parsed break } } if pid <= 0 { return true, nil } if err := syscall.Kill(pid, 0); err != nil { if errors.Is(err, syscall.ESRCH) { return true, nil } } return false, nil } // Append runs one orchestration or CLI step. // Signature: (s *Store) Append(record RunRecord) error. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (s *Store) Append(record RunRecord) error { s.mu.Lock() defer s.mu.Unlock() records, err := s.loadUnlocked() if err != nil { return err } records = append(records, record) if len(records) > 200 { records = records[len(records)-200:] } if err := os.MkdirAll(filepath.Dir(s.path), 0o750); err != nil { return err } b, _ := json.MarshalIndent(records, "", " ") return os.WriteFile(s.path, b, 0o640) } // Load runs one orchestration or CLI step. // Signature: (s *Store) Load() ([]RunRecord, error). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (s *Store) Load() ([]RunRecord, error) { s.mu.Lock() defer s.mu.Unlock() return s.loadUnlocked() } // loadUnlocked runs one orchestration or CLI step. // Signature: (s *Store) loadUnlocked() ([]RunRecord, error). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func (s *Store) loadUnlocked() ([]RunRecord, error) { b, err := os.ReadFile(s.path) if err != nil { if os.IsNotExist(err) { return nil, nil } return nil, err } if len(b) == 0 { return nil, nil } var records []RunRecord if err := json.Unmarshal(b, &records); err != nil { if healErr := quarantineCorruptFile(s.path, b, []byte("[]\n"), 0o640); healErr != nil { return nil, fmt.Errorf("decode run history: %w (auto-heal failed: %v)", err, healErr) } return nil, nil } return records, nil } // ShutdownP95 runs one orchestration or CLI step. // Signature: (s *Store) ShutdownP95(defaultSeconds int) int. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (s *Store) ShutdownP95(defaultSeconds int) int { return s.shutdownP95(defaultSeconds, 1, nil) } // ShutdownP95WithMinSamples runs one orchestration or CLI step. // Signature: (s *Store) ShutdownP95WithMinSamples(defaultSeconds int, minSamples int) int. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (s *Store) ShutdownP95WithMinSamples(defaultSeconds int, minSamples int) int { return s.shutdownP95(defaultSeconds, minSamples, nil) } // ShutdownP95ByReasonPrefix runs one orchestration or CLI step. // Signature: (s *Store) ShutdownP95ByReasonPrefix(defaultSeconds int, minSamples int, reasonPrefixes []string) int. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (s *Store) ShutdownP95ByReasonPrefix(defaultSeconds int, minSamples int, reasonPrefixes []string) int { return s.shutdownP95(defaultSeconds, minSamples, reasonPrefixes) } // shutdownP95 runs one orchestration or CLI step. // Signature: (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes []string) int. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes []string) int { if minSamples <= 0 { minSamples = 1 } records, err := s.Load() if err != nil { return defaultSeconds } var d []int filteredPrefixes := make([]string, 0, len(reasonPrefixes)) for _, prefix := range reasonPrefixes { p := strings.ToLower(strings.TrimSpace(prefix)) if p != "" { filteredPrefixes = append(filteredPrefixes, p) } } matchesPrefix := func(reason string) bool { if len(filteredPrefixes) == 0 { return true } r := strings.ToLower(strings.TrimSpace(reason)) for _, prefix := range filteredPrefixes { if strings.HasPrefix(r, prefix) { return true } } return false } for _, r := range records { if r.Action == "shutdown" && !r.DryRun && r.Success && r.DurationSeconds > 0 && matchesPrefix(r.Reason) { d = append(d, r.DurationSeconds) } } if len(d) < minSamples { return defaultSeconds } sort.Ints(d) idx := int(math.Ceil(0.95*float64(len(d)))) - 1 return d[idx] }