2026-01-31 03:34:34 -03:00
|
|
|
package server
|
|
|
|
|
|
|
|
|
|
import (
|
2026-04-12 11:09:49 -03:00
|
|
|
"context"
|
|
|
|
|
"log"
|
2026-01-31 03:34:34 -03:00
|
|
|
"net/http"
|
2026-02-06 18:25:19 -03:00
|
|
|
"strings"
|
2026-04-12 14:32:39 -03:00
|
|
|
"sync"
|
2026-02-06 18:25:19 -03:00
|
|
|
"time"
|
2026-01-31 03:34:34 -03:00
|
|
|
|
|
|
|
|
"scm.bstein.dev/bstein/soteria/internal/api"
|
|
|
|
|
"scm.bstein.dev/bstein/soteria/internal/config"
|
|
|
|
|
"scm.bstein.dev/bstein/soteria/internal/k8s"
|
2026-02-06 18:25:19 -03:00
|
|
|
"scm.bstein.dev/bstein/soteria/internal/longhorn"
|
2026-04-12 11:09:49 -03:00
|
|
|
|
|
|
|
|
corev1 "k8s.io/api/core/v1"
|
2026-01-31 03:34:34 -03:00
|
|
|
)
|
|
|
|
|
|
2026-04-12 11:09:49 -03:00
|
|
|
type kubeClient interface {
|
|
|
|
|
ResolvePVCVolume(ctx context.Context, namespace, pvcName string) (string, *corev1.PersistentVolumeClaim, *corev1.PersistentVolume, error)
|
|
|
|
|
CreateBackupJob(ctx context.Context, cfg *config.Config, req api.BackupRequest) (string, string, error)
|
|
|
|
|
CreateRestoreJob(ctx context.Context, cfg *config.Config, req api.RestoreTestRequest) (string, string, error)
|
2026-04-13 12:03:14 -03:00
|
|
|
ListBackupJobs(ctx context.Context, namespace string) ([]k8s.BackupJobSummary, error)
|
2026-04-13 02:14:30 -03:00
|
|
|
ListBackupJobsForPVC(ctx context.Context, namespace, pvc string) ([]k8s.BackupJobSummary, error)
|
2026-04-13 12:51:19 -03:00
|
|
|
ReadBackupJobLog(ctx context.Context, namespace, jobName string) (string, error)
|
2026-04-12 11:09:49 -03:00
|
|
|
ListBoundPVCs(ctx context.Context) ([]k8s.PVCSummary, error)
|
|
|
|
|
PersistentVolumeClaimExists(ctx context.Context, namespace, pvcName string) (bool, error)
|
2026-04-12 14:32:39 -03:00
|
|
|
LoadSecretData(ctx context.Context, namespace, secretName, key string) ([]byte, error)
|
|
|
|
|
SaveSecretData(ctx context.Context, namespace, secretName, key string, value []byte, labels map[string]string) error
|
2026-04-12 11:09:49 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
type longhornClient interface {
|
2026-04-13 00:35:36 -03:00
|
|
|
CreateSnapshot(ctx context.Context, volume, name string, labels map[string]string) error
|
2026-04-12 11:09:49 -03:00
|
|
|
SnapshotBackup(ctx context.Context, volume, name string, labels map[string]string, backupMode string) (*longhorn.Volume, error)
|
|
|
|
|
GetVolume(ctx context.Context, volume string) (*longhorn.Volume, error)
|
|
|
|
|
CreateVolumeFromBackup(ctx context.Context, name, size string, replicas int, backupURL string) (*longhorn.Volume, error)
|
|
|
|
|
CreatePVC(ctx context.Context, volumeName, namespace, pvcName string) error
|
|
|
|
|
DeleteVolume(ctx context.Context, volumeName string) error
|
|
|
|
|
FindBackup(ctx context.Context, volumeName, snapshot string) (*longhorn.Backup, error)
|
|
|
|
|
ListBackups(ctx context.Context, volumeName string) ([]longhorn.Backup, error)
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-21 06:45:04 -03:00
|
|
|
// Server owns HTTP routing, policy state, telemetry, and the UI renderer.
|
2026-01-31 03:34:34 -03:00
|
|
|
type Server struct {
|
2026-04-13 12:51:19 -03:00
|
|
|
cfg *config.Config
|
|
|
|
|
client kubeClient
|
|
|
|
|
longhorn longhornClient
|
|
|
|
|
metrics *telemetry
|
|
|
|
|
handler http.Handler
|
|
|
|
|
ui *uiRenderer
|
|
|
|
|
policyMu sync.RWMutex
|
|
|
|
|
policies map[string]api.BackupPolicy
|
|
|
|
|
runMu sync.Mutex
|
|
|
|
|
running bool
|
|
|
|
|
b2Mu sync.RWMutex
|
|
|
|
|
b2Usage api.B2UsageResponse
|
|
|
|
|
jobUsage map[string]resticJobUsageCacheEntry
|
|
|
|
|
jobUsageMu sync.RWMutex
|
2026-04-13 14:21:29 -03:00
|
|
|
usageMu sync.RWMutex
|
|
|
|
|
usageStore map[string]resticPersistedUsageEntry
|
2026-01-31 03:34:34 -03:00
|
|
|
}
|
|
|
|
|
|
2026-04-12 11:09:49 -03:00
|
|
|
// authIdentity describes the caller of a request. route stores the value
// returned by authorize in the request context under authContextKey, and
// handleWhoAmI copies these fields into its response.
type authIdentity struct {
	Authenticated bool     // copied to the whoami response as Authenticated
	User          string   // copied to the whoami response as User
	Email         string   // copied to the whoami response as Email
	Groups        []string // copied to the whoami response as Groups
}
|
|
|
|
|
|
|
|
|
|
// ctxKey is a dedicated type for context keys, so keys from this package
// cannot collide with plain string keys set by other packages.
type ctxKey string

// authContextKey is the context key under which route stores the caller's
// identity.
const authContextKey = ctxKey("soteria-auth")
|
|
|
|
|
|
2026-04-12 14:32:39 -03:00
|
|
|
// Package-level tuning values and persistence keys.
const (
	// Names of the stored policy and usage documents.
	// NOTE(review): presumably passed as the key argument of
	// LoadSecretData/SaveSecretData — confirm at call sites.
	policySecretKey = "policies.json"
	usageSecretKey  = "restic-job-usage.json"

	// Policy limits.
	// NOTE(review): the names suggest an interval in hours and a count of
	// retained backups — confirm where policies are validated.
	defaultPolicyHours   = 24.0
	maxPolicyIntervalHrs = 24 * 365
	maxPolicyKeepLast    = 1000

	// NOTE(review): looks like a cap on jobs sampled for usage — confirm.
	maxUsageSampleJobs = 20

	// NOTE(review): looks like a string prefix marking a restic selector —
	// confirm at use sites.
	resticSelectorPrefix = "restic-latest:"
)
|
|
|
|
|
|
|
|
|
|
// resticJobUsageCacheEntry is the value type of Server.jobUsage.
type resticJobUsageCacheEntry struct {
	// NOTE(review): presumably reports whether Bytes holds a real
	// measurement — confirm where entries are written.
	Known bool
	// Size in bytes, kept as a float64.
	Bytes float64
	// NOTE(review): presumably the time of the last lookup — confirm.
	CheckedAt time.Time
}
|
|
|
|
|
|
2026-04-13 14:21:29 -03:00
|
|
|
// resticPersistedUsageEntry is the value type of Server.usageStore. Its JSON
// form has a "bytes" number and an optional "updated_at" string.
type resticPersistedUsageEntry struct {
	// Size in bytes.
	Bytes float64 `json:"bytes"`
	// Timestamp kept as text; left out of the JSON when empty.
	// NOTE(review): the time layout is not visible here — confirm at the writer.
	UpdatedAt string `json:"updated_at,omitempty"`
}
|
|
|
|
|
|
|
|
|
|
// resticPersistedUsageDocument is the JSON envelope for persisted restic
// usage: a "jobs" array whose items carry a key alongside the same fields as
// resticPersistedUsageEntry.
type resticPersistedUsageDocument struct {
	// Jobs keeps an anonymous element type so existing literals of that
	// shape remain assignable.
	Jobs []struct {
		Key       string  `json:"key"`
		Bytes     float64 `json:"bytes"`
		UpdatedAt string  `json:"updated_at,omitempty"`
	} `json:"jobs"`
}
|
|
|
|
|
|
2026-04-21 06:45:04 -03:00
|
|
|
// New constructs a server with fresh telemetry and in-memory policy state.
|
2026-02-06 18:25:19 -03:00
|
|
|
func New(cfg *config.Config, client *k8s.Client, lh *longhorn.Client) *Server {
|
2026-01-31 03:34:34 -03:00
|
|
|
s := &Server{
|
2026-04-13 14:21:29 -03:00
|
|
|
cfg: cfg,
|
|
|
|
|
client: client,
|
|
|
|
|
longhorn: lh,
|
|
|
|
|
metrics: newTelemetry(),
|
|
|
|
|
ui: newUIRenderer(),
|
|
|
|
|
policies: map[string]api.BackupPolicy{},
|
|
|
|
|
jobUsage: map[string]resticJobUsageCacheEntry{},
|
|
|
|
|
usageStore: map[string]resticPersistedUsageEntry{},
|
2026-01-31 03:34:34 -03:00
|
|
|
}
|
2026-04-12 11:09:49 -03:00
|
|
|
s.handler = http.HandlerFunc(s.route)
|
|
|
|
|
return s
|
|
|
|
|
}
|
2026-01-31 03:34:34 -03:00
|
|
|
|
2026-04-21 06:45:04 -03:00
|
|
|
// Start launches telemetry and policy refresh loops for the active server.
|
2026-04-12 11:09:49 -03:00
|
|
|
func (s *Server) Start(ctx context.Context) {
|
2026-04-12 14:32:39 -03:00
|
|
|
if err := s.loadPolicies(ctx); err != nil {
|
|
|
|
|
log.Printf("policy load failed: %v", err)
|
|
|
|
|
}
|
2026-04-13 14:21:29 -03:00
|
|
|
if err := s.loadResticUsage(ctx); err != nil {
|
|
|
|
|
log.Printf("restic usage load failed: %v", err)
|
|
|
|
|
}
|
2026-04-12 14:32:39 -03:00
|
|
|
|
2026-04-12 11:09:49 -03:00
|
|
|
s.refreshTelemetry(ctx)
|
2026-04-12 19:45:23 -03:00
|
|
|
s.refreshB2Usage(ctx)
|
2026-04-12 14:32:39 -03:00
|
|
|
s.runPolicyCycle(ctx)
|
2026-01-31 03:34:34 -03:00
|
|
|
|
2026-04-12 14:32:39 -03:00
|
|
|
metricsTicker := time.NewTicker(s.cfg.MetricsRefreshInterval)
|
|
|
|
|
policyTicker := time.NewTicker(s.cfg.PolicyEvalInterval)
|
2026-04-12 19:45:23 -03:00
|
|
|
var b2Ticker *time.Ticker
|
|
|
|
|
var b2Tick <-chan time.Time
|
|
|
|
|
if s.cfg.B2Enabled {
|
|
|
|
|
b2Ticker = time.NewTicker(s.cfg.B2ScanInterval)
|
|
|
|
|
b2Tick = b2Ticker.C
|
|
|
|
|
}
|
2026-04-12 11:09:49 -03:00
|
|
|
go func() {
|
2026-04-12 14:32:39 -03:00
|
|
|
defer metricsTicker.Stop()
|
|
|
|
|
defer policyTicker.Stop()
|
2026-04-12 19:45:23 -03:00
|
|
|
if b2Ticker != nil {
|
|
|
|
|
defer b2Ticker.Stop()
|
|
|
|
|
}
|
2026-04-12 11:09:49 -03:00
|
|
|
for {
|
|
|
|
|
select {
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
return
|
2026-04-12 14:32:39 -03:00
|
|
|
case <-metricsTicker.C:
|
2026-04-12 11:09:49 -03:00
|
|
|
s.refreshTelemetry(ctx)
|
2026-04-12 14:32:39 -03:00
|
|
|
case <-policyTicker.C:
|
|
|
|
|
s.runPolicyCycle(ctx)
|
2026-04-12 19:45:23 -03:00
|
|
|
case <-b2Tick:
|
|
|
|
|
s.refreshB2Usage(ctx)
|
2026-04-12 11:09:49 -03:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}()
|
2026-01-31 03:34:34 -03:00
|
|
|
}
|
|
|
|
|
|
2026-04-21 06:45:04 -03:00
|
|
|
// Handler returns the HTTP handler used by the embedded server.
|
2026-01-31 03:34:34 -03:00
|
|
|
func (s *Server) Handler() http.Handler {
|
2026-04-12 11:09:49 -03:00
|
|
|
return s.handler
|
2026-01-31 03:34:34 -03:00
|
|
|
}
|
|
|
|
|
|
2026-04-12 11:09:49 -03:00
|
|
|
func (s *Server) route(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
switch r.URL.Path {
|
|
|
|
|
case "/healthz":
|
|
|
|
|
s.handleHealth(w, r)
|
|
|
|
|
return
|
|
|
|
|
case "/readyz":
|
|
|
|
|
s.handleReady(w, r)
|
|
|
|
|
return
|
|
|
|
|
case "/metrics":
|
|
|
|
|
s.metrics.Handler().ServeHTTP(w, r)
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
identity, status, err := s.authorize(r)
|
|
|
|
|
if err != nil {
|
|
|
|
|
s.metrics.RecordAuthzDenied(authzReason(status, err))
|
|
|
|
|
writeError(w, status, err.Error())
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
r = r.WithContext(context.WithValue(r.Context(), authContextKey, identity))
|
|
|
|
|
|
|
|
|
|
switch r.URL.Path {
|
|
|
|
|
case "/":
|
|
|
|
|
s.handleUI(w, r)
|
2026-04-12 19:45:23 -03:00
|
|
|
case "/v1/b2":
|
|
|
|
|
s.handleB2Usage(w, r)
|
2026-04-12 11:09:49 -03:00
|
|
|
case "/v1/whoami":
|
|
|
|
|
s.handleWhoAmI(w, r)
|
|
|
|
|
case "/v1/inventory":
|
|
|
|
|
s.handleInventory(w, r)
|
|
|
|
|
case "/v1/backups":
|
|
|
|
|
s.handleBackups(w, r)
|
|
|
|
|
case "/v1/backup":
|
|
|
|
|
s.handleBackup(w, r)
|
2026-04-12 14:32:39 -03:00
|
|
|
case "/v1/backup/namespace":
|
|
|
|
|
s.handleNamespaceBackup(w, r)
|
2026-04-12 11:09:49 -03:00
|
|
|
case "/v1/restores", "/v1/restore-test":
|
|
|
|
|
s.handleRestore(w, r)
|
2026-04-12 14:32:39 -03:00
|
|
|
case "/v1/restores/namespace":
|
|
|
|
|
s.handleNamespaceRestore(w, r)
|
|
|
|
|
case "/v1/policies":
|
|
|
|
|
s.handlePolicies(w, r)
|
2026-04-12 11:09:49 -03:00
|
|
|
default:
|
2026-04-12 19:45:23 -03:00
|
|
|
if s.ui != nil && s.ui.ServeAsset(w, r) {
|
|
|
|
|
return
|
|
|
|
|
}
|
2026-04-12 14:32:39 -03:00
|
|
|
if strings.HasPrefix(r.URL.Path, "/v1/policies/") {
|
|
|
|
|
s.handlePolicyByID(w, r)
|
|
|
|
|
return
|
|
|
|
|
}
|
2026-04-12 19:45:23 -03:00
|
|
|
// Serve SPA index for deep links (for example /backup) while preserving
|
|
|
|
|
// explicit API and asset 404 behavior.
|
|
|
|
|
if r.Method == http.MethodGet && !strings.HasPrefix(r.URL.Path, "/v1/") && !strings.Contains(r.URL.Path, ".") {
|
|
|
|
|
s.handleUI(w, r)
|
|
|
|
|
return
|
|
|
|
|
}
|
2026-04-12 11:09:49 -03:00
|
|
|
writeError(w, http.StatusNotFound, "not found")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
|
2026-01-31 03:34:34 -03:00
|
|
|
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-12 11:09:49 -03:00
|
|
|
func (s *Server) handleReady(w http.ResponseWriter, _ *http.Request) {
|
2026-01-31 03:34:34 -03:00
|
|
|
writeJSON(w, http.StatusOK, map[string]string{"status": "ready"})
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-12 11:09:49 -03:00
|
|
|
func (s *Server) handleUI(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
if r.Method != http.MethodGet {
|
|
|
|
|
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
|
|
|
|
return
|
|
|
|
|
}
|
2026-04-12 19:45:23 -03:00
|
|
|
if s.ui == nil {
|
|
|
|
|
writeError(w, http.StatusInternalServerError, "UI renderer is unavailable")
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if err := s.ui.ServeIndex(w, r); err != nil {
|
|
|
|
|
writeError(w, http.StatusInternalServerError, err.Error())
|
|
|
|
|
}
|
2026-04-12 11:09:49 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Server) handleWhoAmI(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
if r.Method != http.MethodGet {
|
|
|
|
|
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
identity := requesterFromContext(r.Context())
|
|
|
|
|
writeJSON(w, http.StatusOK, api.AuthInfoResponse{
|
|
|
|
|
Authenticated: identity.Authenticated,
|
|
|
|
|
User: identity.User,
|
|
|
|
|
Email: identity.Email,
|
|
|
|
|
Groups: identity.Groups,
|
2026-04-13 12:03:14 -03:00
|
|
|
AllowedGroups: s.cfg.AllowedGroups,
|
2026-04-12 11:09:49 -03:00
|
|
|
})
|
|
|
|
|
}
|