ananke/internal/state/store.go

255 lines
7.5 KiB
Go

package state
import (
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"syscall"
"time"
)
// RunRecord is one entry of the run history that Store persists as a JSON
// array. The struct tags define the on-disk field names.
type RunRecord struct {
ID string `json:"id"`
// Action names the operation that ran; shutdownP95 only samples records
// whose Action is exactly "shutdown".
Action string `json:"action"`
// Reason is optional free text; shutdownP95 matches it case-insensitively
// against caller-supplied prefixes.
Reason string `json:"reason,omitempty"`
// DryRun records are excluded from the shutdown percentile.
DryRun bool `json:"dry_run,omitempty"`
StartedAt time.Time `json:"started_at"`
EndedAt time.Time `json:"ended_at"`
// DurationSeconds must be > 0 for the record to count as a percentile sample.
// NOTE(review): presumably EndedAt-StartedAt in whole seconds — confirm with the writer.
DurationSeconds int `json:"duration_seconds"`
// Success must be true for the record to count as a percentile sample.
Success bool `json:"success"`
// Error is omitted from the JSON when empty.
Error string `json:"error,omitempty"`
}
// Store reads and writes the run history file.
type Store struct {
// path is the history file that Append writes and loadUnlocked reads.
path string
// mu is held by Append and Load for the duration of their file access.
mu sync.Mutex
}
// New returns a Store backed by the history file at path.
// It performs no I/O; the file is only touched by later Append/Load calls.
func New(path string) *Store {
return &Store{path: path}
}
// EnsureDir creates dir, including any missing parents, with mode 0o750.
// An empty dir is rejected with an error; an already existing directory is
// not an error.
func EnsureDir(dir string) error {
	if len(dir) == 0 {
		return fmt.Errorf("state dir must not be empty")
	}
	if err := os.MkdirAll(dir, 0o750); err != nil {
		return err
	}
	return nil
}
// AcquireLock takes a file-based lock at path and returns a function that
// releases it by removing the file.
//
// The lock file is created with O_CREATE|O_EXCL, so only one caller can create
// it, and is stamped with the owner's pid and start time. If the file already
// exists, staleLock decides whether its owner is gone; a stale file is removed
// and creation is retried once. An error is returned when the lock is held by
// a live process, when the stale check fails, or when the file cannot be
// created, written, or removed.
//
// NOTE(review): staleLock treats a file without a pid line as stale, so a
// concurrent caller that reads the file between its creation and the pid write
// below could still remove it. Closing that window needs a change to staleLock.
func AcquireLock(path string) (func(), error) {
	if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
		return nil, err
	}
	create := func() (func(), error) {
		f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
		if err != nil {
			return nil, err
		}
		_, writeErr := fmt.Fprintf(f, "pid=%d started=%s\n", os.Getpid(), time.Now().Format(time.RFC3339))
		// Always close, but report the write error first if both fail.
		if closeErr := f.Close(); writeErr == nil {
			writeErr = closeErr
		}
		if writeErr != nil {
			// A lock file without a readable pid is considered stale by
			// staleLock, so it would not protect anything. Drop it (best
			// effort) and fail instead of returning a lock nobody honours.
			_ = os.Remove(path)
			return nil, fmt.Errorf("write lock owner: %w", writeErr)
		}
		return func() {
			// Best-effort release; the caller has no way to act on a failure.
			_ = os.Remove(path)
		}, nil
	}
	unlock, err := create()
	if err == nil {
		return unlock, nil
	}
	if !errors.Is(err, os.ErrExist) {
		return nil, fmt.Errorf("acquire lock %s: %w", path, err)
	}
	// The file already exists: only take over if its owner is gone.
	stale, staleErr := staleLock(path)
	if staleErr != nil {
		return nil, fmt.Errorf("acquire lock %s: existing lock check failed: %w", path, staleErr)
	}
	if !stale {
		return nil, fmt.Errorf("acquire lock %s: lock is held by active process", path)
	}
	if rmErr := os.Remove(path); rmErr != nil {
		return nil, fmt.Errorf("acquire lock %s: remove stale lock: %w", path, rmErr)
	}
	unlock, err = create()
	if err != nil {
		return nil, fmt.Errorf("acquire lock %s: recreate after stale lock removal: %w", path, err)
	}
	return unlock, nil
}
// staleLock reports whether the lock file at path can be taken over.
//
// The lock is stale (true) when the file does not exist, when it has no
// "pid=" line, when the pid on the first such line does not parse or is not
// positive, or when signalling that pid with signal 0 fails with ESRCH.
// Any other probe outcome, including success, means the lock is not stale.
// A read error other than "not exist" is returned with false.
func staleLock(path string) (bool, error) {
	raw, readErr := os.ReadFile(path)
	switch {
	case readErr == nil:
		// fall through to parsing
	case os.IsNotExist(readErr):
		return true, nil
	default:
		return false, readErr
	}

	ownerPID := 0
	for _, entry := range strings.Split(string(raw), "\n") {
		entry = strings.TrimSpace(entry)
		if !strings.HasPrefix(entry, "pid=") {
			continue
		}
		// Only the first whitespace-separated token after "pid=" is the pid;
		// the rest of the line carries other metadata.
		value := strings.TrimPrefix(entry, "pid=")
		if tokens := strings.Fields(value); len(tokens) > 0 {
			value = tokens[0]
		}
		n, convErr := strconv.Atoi(value)
		if convErr != nil {
			return true, nil
		}
		ownerPID = n
		break
	}
	// Non-positive pids are never probed: they are not a single process id.
	if ownerPID <= 0 {
		return true, nil
	}

	// Signal 0 performs the existence check without delivering a signal.
	// Only ESRCH marks the lock stale.
	probeErr := syscall.Kill(ownerPID, 0)
	return errors.Is(probeErr, syscall.ESRCH), nil
}
// Append adds record to the end of the persisted run history and rewrites the
// history file, keeping only the newest 200 records.
//
// It holds s.mu for the whole read-modify-write cycle. It returns an error if
// the existing history cannot be loaded, if the records cannot be encoded as
// JSON, or if the parent directory or file cannot be written. On an encoding
// failure the existing file is left untouched.
func (s *Store) Append(record RunRecord) error {
	// maxRecords bounds the history so the file does not grow without limit.
	const maxRecords = 200

	s.mu.Lock()
	defer s.mu.Unlock()

	records, err := s.loadUnlocked()
	if err != nil {
		return err
	}
	records = append(records, record)
	if len(records) > maxRecords {
		records = records[len(records)-maxRecords:]
	}

	// Encode before touching the filesystem. Marshalling can fail (for example
	// a time.Time whose year is out of range), and writing the resulting nil
	// buffer would truncate the history to an empty file.
	b, err := json.MarshalIndent(records, "", " ")
	if err != nil {
		return fmt.Errorf("encode run history: %w", err)
	}
	if err := os.MkdirAll(filepath.Dir(s.path), 0o750); err != nil {
		return err
	}
	return os.WriteFile(s.path, b, 0o640)
}
// Load returns the persisted run history.
// It holds s.mu while delegating to loadUnlocked, so it must not be called
// by code that already holds the lock.
func (s *Store) Load() ([]RunRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
return s.loadUnlocked()
}
// loadUnlocked reads and decodes the history file without taking s.mu; both
// callers in this file (Append and Load) hold the lock around it.
//
// A missing or zero-length file yields (nil, nil). When the contents are not
// valid JSON, the raw bytes are handed to quarantineCorruptFile together with
// an empty-array replacement ("[]\n"); if that succeeds the history is
// reported as empty, otherwise the decode error is returned with the heal
// failure appended to its message.
func (s *Store) loadUnlocked() ([]RunRecord, error) {
	raw, readErr := os.ReadFile(s.path)
	switch {
	case readErr == nil:
		// proceed to decoding
	case os.IsNotExist(readErr):
		return nil, nil
	default:
		return nil, readErr
	}
	if len(raw) == 0 {
		return nil, nil
	}

	var history []RunRecord
	decodeErr := json.Unmarshal(raw, &history)
	if decodeErr == nil {
		return history, nil
	}

	// Undecodable history: try to self-heal rather than fail every later call.
	if healErr := quarantineCorruptFile(s.path, raw, []byte("[]\n"), 0o640); healErr != nil {
		return nil, fmt.Errorf("decode run history: %w (auto-heal failed: %v)", decodeErr, healErr)
	}
	return nil, nil
}
// ShutdownP95 returns the 95th-percentile duration, in seconds, of the
// recorded shutdown runs, or defaultSeconds when there is no qualifying
// sample or the history cannot be loaded.
// It calls shutdownP95 with a minimum sample count of 1 and no reason filter.
func (s *Store) ShutdownP95(defaultSeconds int) int {
return s.shutdownP95(defaultSeconds, 1, nil)
}
// ShutdownP95WithMinSamples is like ShutdownP95 but returns defaultSeconds
// unless at least minSamples qualifying shutdown records exist.
// shutdownP95 treats a minSamples of zero or less as 1.
func (s *Store) ShutdownP95WithMinSamples(defaultSeconds int, minSamples int) int {
return s.shutdownP95(defaultSeconds, minSamples, nil)
}
// ShutdownP95ByReasonPrefix is like ShutdownP95WithMinSamples but only counts
// records whose Reason starts with one of reasonPrefixes.
// shutdownP95 compares prefixes case-insensitively after trimming whitespace;
// a nil or all-blank prefix list disables the filter.
func (s *Store) ShutdownP95ByReasonPrefix(defaultSeconds int, minSamples int, reasonPrefixes []string) int {
return s.shutdownP95(defaultSeconds, minSamples, reasonPrefixes)
}
// shutdownP95 computes the 95th-percentile duration, in seconds, over the
// shutdown records in the run history.
//
// A record is a sample when its Action is "shutdown", it is not a dry run, it
// succeeded, its DurationSeconds is positive, and (if any non-blank prefixes
// are given) its Reason starts with one of reasonPrefixes, compared
// case-insensitively after trimming whitespace.
//
// defaultSeconds is returned when the history cannot be loaded or when fewer
// than minSamples samples exist; minSamples below 1 is treated as 1.
func (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes []string) int {
	if minSamples < 1 {
		minSamples = 1
	}

	history, err := s.Load()
	if err != nil {
		return defaultSeconds
	}

	// Normalise the prefixes once; blank entries are dropped.
	wanted := make([]string, 0, len(reasonPrefixes))
	for _, raw := range reasonPrefixes {
		if norm := strings.ToLower(strings.TrimSpace(raw)); norm != "" {
			wanted = append(wanted, norm)
		}
	}

	var durations []int
	for _, rec := range history {
		if rec.Action != "shutdown" || rec.DryRun || !rec.Success || rec.DurationSeconds <= 0 {
			continue
		}
		// With no usable prefixes every qualifying record is accepted.
		accepted := len(wanted) == 0
		if !accepted {
			reason := strings.ToLower(strings.TrimSpace(rec.Reason))
			for _, prefix := range wanted {
				if strings.HasPrefix(reason, prefix) {
					accepted = true
					break
				}
			}
		}
		if accepted {
			durations = append(durations, rec.DurationSeconds)
		}
	}

	sampleCount := len(durations)
	if sampleCount < minSamples {
		return defaultSeconds
	}

	// Take the ceil(0.95*n)-th smallest sample. sampleCount is at least 1
	// here, so the rank is at least 1 and the index is in range.
	sort.Ints(durations)
	rank := int(math.Ceil(0.95 * float64(sampleCount)))
	return durations[rank-1]
}