feat: add metis service and autonomous recovery path
This commit is contained in:
parent
26eb9af430
commit
b8f26ecf41
10
.dockerignore
Normal file
10
.dockerignore
Normal file
@ -0,0 +1,10 @@
|
||||
.git
|
||||
.gitignore
|
||||
AGENTS.md
|
||||
artifacts/
|
||||
build/
|
||||
tmp/
|
||||
*.img
|
||||
*.img.xz
|
||||
*.qcow2
|
||||
*.iso
|
||||
45
Dockerfile
Normal file
45
Dockerfile
Normal file
@ -0,0 +1,45 @@
|
||||
# syntax=docker/dockerfile:1.7
|
||||
|
||||
FROM golang:1.22-bookworm AS build
|
||||
|
||||
ARG TARGETOS=linux
|
||||
ARG TARGETARCH=arm64
|
||||
|
||||
WORKDIR /src
|
||||
COPY go.mod go.sum ./
|
||||
RUN go mod download
|
||||
COPY . .
|
||||
RUN --mount=type=cache,target=/root/.cache/go-build \
|
||||
--mount=type=cache,target=/go/pkg/mod \
|
||||
CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o /out/metis ./cmd/metis && \
|
||||
CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o /out/metis-sentinel ./cmd/metis-sentinel
|
||||
|
||||
FROM debian:bookworm-slim AS runtime-base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends ca-certificates e2fsprogs util-linux openssh-client xz-utils \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY --from=build /out/metis /usr/local/bin/metis
|
||||
COPY --from=build /out/metis-sentinel /usr/local/bin/metis-sentinel
|
||||
COPY inventory.example.yaml /app/inventory.example.yaml
|
||||
COPY inventory.titan-rpi4.yaml /app/inventory.titan-rpi4.yaml
|
||||
COPY overlays /app/overlays
|
||||
|
||||
FROM runtime-base AS runtime
|
||||
|
||||
EXPOSE 8080
|
||||
|
||||
ENTRYPOINT ["metis"]
|
||||
CMD ["serve"]
|
||||
|
||||
FROM debian:bookworm-slim AS sentinel
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends ca-certificates util-linux \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY --from=build /out/metis-sentinel /usr/local/bin/metis-sentinel
|
||||
|
||||
ENTRYPOINT ["metis-sentinel"]
|
||||
236
Jenkinsfile
vendored
Normal file
236
Jenkinsfile
vendored
Normal file
@ -0,0 +1,236 @@
|
||||
pipeline {
|
||||
agent {
|
||||
kubernetes {
|
||||
label 'metis'
|
||||
defaultContainer 'builder'
|
||||
yaml """
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
labels:
|
||||
app: metis
|
||||
spec:
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
imagePullSecrets:
|
||||
- name: harbor-robot-pipeline
|
||||
containers:
|
||||
- name: dind
|
||||
image: docker:27-dind
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: DOCKER_TLS_CERTDIR
|
||||
value: ""
|
||||
args:
|
||||
- --mtu=1400
|
||||
- --host=unix:///var/run/docker.sock
|
||||
- --host=tcp://0.0.0.0:2375
|
||||
volumeMounts:
|
||||
- name: dind-storage
|
||||
mountPath: /var/lib/docker
|
||||
- name: builder
|
||||
image: docker:27
|
||||
command: ["cat"]
|
||||
tty: true
|
||||
env:
|
||||
- name: DOCKER_HOST
|
||||
value: tcp://localhost:2375
|
||||
- name: DOCKER_TLS_CERTDIR
|
||||
value: ""
|
||||
- name: DOCKER_CONFIG
|
||||
value: /root/.docker
|
||||
volumeMounts:
|
||||
- name: workspace-volume
|
||||
mountPath: /home/jenkins/agent
|
||||
- name: docker-config-writable
|
||||
mountPath: /root/.docker
|
||||
- name: harbor-config
|
||||
mountPath: /docker-config
|
||||
- name: tester
|
||||
image: golang:1.22-bookworm
|
||||
command: ["cat"]
|
||||
tty: true
|
||||
volumeMounts:
|
||||
- name: workspace-volume
|
||||
mountPath: /home/jenkins/agent
|
||||
- name: publisher
|
||||
image: python:3.12-slim
|
||||
command: ["cat"]
|
||||
tty: true
|
||||
volumeMounts:
|
||||
- name: workspace-volume
|
||||
mountPath: /home/jenkins/agent
|
||||
volumes:
|
||||
- name: workspace-volume
|
||||
emptyDir: {}
|
||||
- name: docker-config-writable
|
||||
emptyDir: {}
|
||||
- name: dind-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: jenkins-dind-cache
|
||||
- name: harbor-config
|
||||
secret:
|
||||
secretName: harbor-robot-pipeline
|
||||
items:
|
||||
- key: .dockerconfigjson
|
||||
path: config.json
|
||||
"""
|
||||
}
|
||||
}
|
||||
environment {
|
||||
REGISTRY = 'registry.bstein.dev/bstein'
|
||||
IMAGE = "${REGISTRY}/metis"
|
||||
SENTINEL_IMAGE = "${REGISTRY}/metis-sentinel"
|
||||
VERSION_TAG = 'dev'
|
||||
SEMVER = 'dev'
|
||||
COVERAGE_JSON = 'build/coverage.json'
|
||||
JUNIT_XML = 'build/junit.xml'
|
||||
METRICS_PREFIX = 'ariadne_ci'
|
||||
VM_IMPORT_URL = 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428/api/v1/import/prometheus'
|
||||
REPO_NAME = 'metis'
|
||||
}
|
||||
options {
|
||||
disableConcurrentBuilds()
|
||||
}
|
||||
triggers {
|
||||
pollSCM('H/5 * * * *')
|
||||
}
|
||||
stages {
|
||||
stage('Checkout') {
|
||||
steps {
|
||||
checkout scm
|
||||
}
|
||||
}
|
||||
|
||||
stage('Unit tests') {
|
||||
steps {
|
||||
container('tester') {
|
||||
sh '''
|
||||
bash -lc '
|
||||
set -euo pipefail
|
||||
apt-get update >/dev/null
|
||||
apt-get install -y --no-install-recommends xz-utils >/dev/null
|
||||
mkdir -p build
|
||||
go install github.com/jstemmer/go-junit-report/v2@latest
|
||||
set +e
|
||||
go test -coverprofile=build/coverage.out ./... 2>&1 | tee build/test.out
|
||||
test_rc=${PIPESTATUS[0]}
|
||||
set -e
|
||||
/root/go/bin/go-junit-report < build/test.out > "${JUNIT_XML}"
|
||||
coverage="0"
|
||||
if [ -f build/coverage.out ]; then
|
||||
coverage="$(go tool cover -func=build/coverage.out | awk '/^total:/ {gsub("%","",$3); print $3}')"
|
||||
fi
|
||||
export GO_COVERAGE="${coverage}"
|
||||
python3 - <<'"'"'PY'"'"'
|
||||
import json, os
|
||||
coverage = float(os.environ.get("GO_COVERAGE", "0") or "0")
|
||||
with open("build/coverage.json", "w", encoding="utf-8") as handle:
|
||||
json.dump({"summary": {"percent_covered": coverage}}, handle)
|
||||
PY
|
||||
exit ${test_rc}
|
||||
'
|
||||
'''
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stage('Publish test metrics') {
|
||||
steps {
|
||||
container('publisher') {
|
||||
sh '''
|
||||
set -euo pipefail
|
||||
python scripts/publish_test_metrics.py
|
||||
'''
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stage('Prep toolchain') {
|
||||
steps {
|
||||
container('builder') {
|
||||
sh '''
|
||||
set -euo pipefail
|
||||
mkdir -p /root/.docker
|
||||
cp /docker-config/config.json /root/.docker/config.json
|
||||
'''
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stage('Compute version') {
|
||||
steps {
|
||||
container('builder') {
|
||||
script {
|
||||
sh '''
|
||||
set -euo pipefail
|
||||
SEMVER="0.1.0-${BUILD_NUMBER}"
|
||||
echo "SEMVER=${SEMVER}" > build.env
|
||||
'''
|
||||
def props = readProperties file: 'build.env'
|
||||
env.SEMVER = props['SEMVER'] ?: "0.1.0-${env.BUILD_NUMBER}"
|
||||
env.VERSION_TAG = env.SEMVER
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stage('Buildx setup') {
|
||||
steps {
|
||||
container('builder') {
|
||||
sh '''
|
||||
set -euo pipefail
|
||||
for i in $(seq 1 10); do
|
||||
if docker info >/dev/null 2>&1; then
|
||||
break
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
docker buildx use default || docker buildx create --name default --driver docker --use
|
||||
'''
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stage('Build & push image') {
|
||||
steps {
|
||||
container('builder') {
|
||||
sh '''
|
||||
set -euo pipefail
|
||||
VERSION_TAG="$(cut -d= -f2 build.env)"
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--tag "${IMAGE}:${VERSION_TAG}" \
|
||||
--tag "${IMAGE}:latest" \
|
||||
--target runtime \
|
||||
--push \
|
||||
.
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--tag "${SENTINEL_IMAGE}:${VERSION_TAG}" \
|
||||
--tag "${SENTINEL_IMAGE}:latest" \
|
||||
--target sentinel \
|
||||
--push \
|
||||
.
|
||||
'''
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
post {
|
||||
always {
|
||||
script {
|
||||
if (fileExists('build/junit.xml')) {
|
||||
try {
|
||||
junit allowEmptyResults: true, testResults: 'build/junit.xml'
|
||||
} catch (Throwable err) {
|
||||
echo "junit step unavailable: ${err.class.simpleName}"
|
||||
}
|
||||
}
|
||||
}
|
||||
archiveArtifacts artifacts: 'build/junit.xml,build/coverage.json,build/coverage.out', allowEmptyArchive: true, fingerprint: true
|
||||
}
|
||||
}
|
||||
}
|
||||
32
README.md
32
README.md
@ -26,3 +26,35 @@ Metis produces fully configured recovery SD cards for any node in the lab (RPi 4
|
||||
- `pkg/` – shared lib (inventory, imaging, injectors, platform abstraction)
|
||||
- `docs/` – user/operator docs (this will stay light; working notes live in AGENTS.md untracked)
|
||||
- `AGENTS.md` – local, untracked working notes (do not add to git)
|
||||
|
||||
## Current modes
|
||||
- `metis plan --inventory inv.yaml --node titan-13 --device /dev/sdz --cache /tmp/metis-cache` prints the burn plan (respects `--boot/--root` or `METIS_*` envs for injection steps).
|
||||
- `metis burn ... --yes` downloads/verifies the golden image, writes it (dd for `/dev/*`, file copy otherwise), and injects node config when mounts are provided.
|
||||
- Pass `--boot /mnt/boot --root /mnt/root` (or set `METIS_BOOT_PATH`/`METIS_ROOT_PATH`) to drop hostname, k3s config, ssh keys, NoCloud user-data, and a debug `etc/metis/node.json` into the mounted card. If unset, injection is skipped (write-only).
|
||||
- `--auto-mount` attempts to mount `/dev/*` partitions (or loop images) automatically for injection on Linux (requires privileges).
|
||||
- `metis image --inventory inv.yaml --node titan-13 --output artifacts/titan-13.img` produces a fully injected raw image artifact without writing to removable media.
|
||||
- `metis serve` runs the operator-facing Metis service:
|
||||
- web UI for build/flash workflows
|
||||
- Prometheus metrics on `/metrics`
|
||||
- internal sentinel snapshot + watch endpoints
|
||||
- Container images are split for gentler cluster operation:
|
||||
- `metis` carries the flash/build toolchain and is intended to run on `titan-22`
|
||||
- `metis-sentinel` stays slim for the DaemonSet that samples node facts
|
||||
- Class overlays: define `boot_overlay`/`root_overlay` on a class to merge static files into boot/root at burn time (e.g., cloud-init/netplan drop-ins, GPU driver configs). Per-node config still injects hostname/IP/k3s/SSH/Longhorn.
|
||||
- Linux loop-mount helper (losetup/mount) exists for automation; wiring into CLI burn is next. Windows writer/GUI stub forthcoming.
|
||||
- Vault: Metis can read per-node secrets from `secret/data/nodes/<hostname>` using VAULT_ADDR plus either VAULT_TOKEN or AppRole (VAULT_ROLE_ID/VAULT_SECRET_ID). Expected fields: ssh_password, k3s_token, cloud_init, extra map.
|
||||
- Sentinel: `metis-sentinel` collects host facts and can either print them, write local history, or push them into the Metis service. The intended deployment shape is a DaemonSet on cluster nodes plus an Ariadne-triggered Metis watch that recomputes recommended class targets and drift history.
|
||||
- Facts aggregation: `metis facts --inventory inv.yaml --snapshots ./snapshots` reads sentinel snapshot JSON files and prints per-class drift summary (kernels, containerd, k3s, package samples). Use exported ConfigMaps or `METIS_SENTINEL_OUT` history as input.
|
||||
- `metis config --inventory inv.yaml --node titan-13` prints the merged node config (hostname/IP/k3s labels/taints/Longhorn UUIDs).
|
||||
|
||||
## Service direction
|
||||
- Deployed UI protected by Atlas SSO headers (`admin` / `maintainer`)
|
||||
- Default flash host support for `titan-22`
|
||||
- Recent build / flash / sentinel change history
|
||||
- Ariadne-driven sentinel watch cadence
|
||||
- Prometheus/Grafana visibility for Metis runs and tests
|
||||
- CI test metrics share the `ariadne_ci_*` series and are distinguished by `repo="metis"`
|
||||
|
||||
Current deployment note: the service can fetch and verify the rpi4 base image from an official URL via `METIS_IMAGE_RPI4_ARMBIAN_LONGHORN` and `METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256`, then cache it locally on the flash host. A mirrored Harbor-backed base image is still preferable long term, but it is no longer a prerequisite for Texas-side builds.
|
||||
|
||||
Next steps: publish the service images, add the SCM remote/repo for Metis, and broaden inventory coverage beyond the current Titan recovery classes.
|
||||
|
||||
89
cmd/metis-sentinel/main.go
Normal file
89
cmd/metis-sentinel/main.go
Normal file
@ -0,0 +1,89 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"metis/pkg/sentinel"
|
||||
)
|
||||
|
||||
func main() {
|
||||
interval := time.Duration(getenvInt("METIS_SENTINEL_INTERVAL_SEC", 300)) * time.Second
|
||||
pushURL := os.Getenv("METIS_SENTINEL_PUSH_URL")
|
||||
runOnce := os.Getenv("METIS_SENTINEL_RUN_ONCE") == "1"
|
||||
|
||||
for {
|
||||
snap := sentinel.Collect()
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
if err := enc.Encode(snap); err != nil {
|
||||
log.Fatalf("encode: %v", err)
|
||||
}
|
||||
if out := os.Getenv("METIS_SENTINEL_OUT"); out != "" {
|
||||
writeHistory(out, snap)
|
||||
}
|
||||
if pushURL != "" {
|
||||
if err := pushSnapshot(pushURL, snap); err != nil {
|
||||
log.Printf("push snapshot failed: %v", err)
|
||||
}
|
||||
}
|
||||
if runOnce || pushURL == "" {
|
||||
break
|
||||
}
|
||||
time.Sleep(interval)
|
||||
}
|
||||
}
|
||||
|
||||
func writeHistory(path string, snap *sentinel.Snapshot) {
|
||||
if path == "" {
|
||||
return
|
||||
}
|
||||
if err := os.MkdirAll(path, 0o755); err != nil {
|
||||
return
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102T150405Z")
|
||||
b, _ := json.MarshalIndent(snap, "", " ")
|
||||
_ = os.WriteFile(filepath.Join(path, "snapshot-"+ts+".json"), b, 0o644)
|
||||
}
|
||||
|
||||
func pushSnapshot(url string, snap *sentinel.Snapshot) error {
|
||||
payload := map[string]any{
|
||||
"node": snap.Hostname,
|
||||
"collected_at": time.Now().UTC(),
|
||||
"snapshot": snap,
|
||||
}
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
return fmt.Errorf("push snapshot: %s", resp.Status)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func getenvInt(key string, fallback int) int {
|
||||
if raw := os.Getenv(key); raw != "" {
|
||||
if value, err := strconv.Atoi(raw); err == nil {
|
||||
return value
|
||||
}
|
||||
}
|
||||
return fallback
|
||||
}
|
||||
33
cmd/metis/facts_cmd.go
Normal file
33
cmd/metis/facts_cmd.go
Normal file
@ -0,0 +1,33 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
|
||||
"metis/pkg/facts"
|
||||
"metis/pkg/inventory"
|
||||
)
|
||||
|
||||
func factsCmd(args []string) {
|
||||
fs := flag.NewFlagSet("facts", flag.ExitOnError)
|
||||
invPath := fs.String("inventory", "inventory.yaml", "inventory file")
|
||||
dir := fs.String("snapshots", "snapshots", "directory of sentinel snapshot json files")
|
||||
fs.Parse(args)
|
||||
inv, err := inventory.Load(*invPath)
|
||||
if err != nil {
|
||||
log.Fatalf("load inventory: %v", err)
|
||||
}
|
||||
snaps, err := facts.LoadDir(*dir)
|
||||
if err != nil {
|
||||
log.Fatalf("load snapshots: %v", err)
|
||||
}
|
||||
sum := facts.Aggregate(inv, snaps)
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
if err := enc.Encode(sum); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "encode: %v\n", err)
|
||||
}
|
||||
}
|
||||
38
cmd/metis/image_cmd.go
Normal file
38
cmd/metis/image_cmd.go
Normal file
@ -0,0 +1,38 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"metis/pkg/plan"
|
||||
)
|
||||
|
||||
func imageCmd(args []string) {
|
||||
fs := flag.NewFlagSet("image", flag.ExitOnError)
|
||||
invPath := fs.String("inventory", "inventory.yaml", "inventory file")
|
||||
node := fs.String("node", "", "target node")
|
||||
output := fs.String("output", "", "output raw image path")
|
||||
cache := fs.String("cache", filepath.Join(os.TempDir(), "metis-cache"), "image cache dir")
|
||||
fs.Parse(args)
|
||||
if *node == "" {
|
||||
log.Fatalf("--node is required")
|
||||
}
|
||||
|
||||
inv := loadInventory(*invPath)
|
||||
targetOutput := *output
|
||||
if targetOutput == "" {
|
||||
targetOutput = filepath.Join("artifacts", fmt.Sprintf("%s.img", *node))
|
||||
}
|
||||
|
||||
if err := plan.BuildImageFile(context.Background(), inv, *node, *cache, targetOutput); err != nil {
|
||||
log.Fatalf("build image: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Wrote %s\n", targetOutput)
|
||||
fmt.Println("Injected rootfs recovery config and overlays.")
|
||||
fmt.Println("Boot-partition NoCloud files are intentionally skipped for this Armbian rpi4 recovery flow.")
|
||||
}
|
||||
27
cmd/metis/inject_cmd.go
Normal file
27
cmd/metis/inject_cmd.go
Normal file
@ -0,0 +1,27 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
|
||||
"metis/pkg/plan"
|
||||
)
|
||||
|
||||
func injectCmd(args []string) {
|
||||
fs := flag.NewFlagSet("inject", flag.ExitOnError)
|
||||
invPath := fs.String("inventory", "inventory.yaml", "inventory file")
|
||||
node := fs.String("node", "", "target node")
|
||||
boot := fs.String("boot", "", "mounted boot path")
|
||||
root := fs.String("root", "", "mounted root path")
|
||||
fs.Parse(args)
|
||||
if *node == "" {
|
||||
log.Fatalf("--node is required")
|
||||
}
|
||||
if *boot == "" && *root == "" {
|
||||
log.Fatalf("--boot or --root is required")
|
||||
}
|
||||
inv := loadInventory(*invPath)
|
||||
if err := plan.Inject(inv, *node, *boot, *root); err != nil {
|
||||
log.Fatalf("inject: %v", err)
|
||||
}
|
||||
}
|
||||
@ -22,8 +22,16 @@ func main() {
|
||||
planCmd(os.Args[2:])
|
||||
case "burn":
|
||||
burnCmd(os.Args[2:])
|
||||
case "image":
|
||||
imageCmd(os.Args[2:])
|
||||
case "serve":
|
||||
serveCmd(os.Args[2:])
|
||||
case "inject":
|
||||
injectCmd(os.Args[2:])
|
||||
case "config":
|
||||
configCmd(os.Args[2:])
|
||||
case "facts":
|
||||
factsCmd(os.Args[2:])
|
||||
default:
|
||||
usage()
|
||||
os.Exit(1)
|
||||
@ -31,7 +39,7 @@ func main() {
|
||||
}
|
||||
|
||||
func usage() {
|
||||
fmt.Fprintf(os.Stderr, "Usage: metis <plan|burn> [options]\n")
|
||||
fmt.Fprintf(os.Stderr, "Usage: metis <plan|burn|image|serve|inject|config|facts> [options]\n")
|
||||
}
|
||||
|
||||
func loadInventory(path string) *inventory.Inventory {
|
||||
@ -48,11 +56,19 @@ func planCmd(args []string) {
|
||||
node := fs.String("node", "", "target node")
|
||||
device := fs.String("device", "/dev/sdX", "target block device")
|
||||
cache := fs.String("cache", filepath.Join(os.TempDir(), "metis-cache"), "image cache dir")
|
||||
boot := fs.String("boot", "", "mounted boot path for injection (optional)")
|
||||
root := fs.String("root", "", "mounted root path for injection (optional)")
|
||||
fs.Parse(args)
|
||||
if *node == "" {
|
||||
log.Fatalf("--node is required")
|
||||
}
|
||||
inv := loadInventory(*invPath)
|
||||
if *boot != "" {
|
||||
os.Setenv("METIS_BOOT_PATH", *boot)
|
||||
}
|
||||
if *root != "" {
|
||||
os.Setenv("METIS_ROOT_PATH", *root)
|
||||
}
|
||||
p, err := plan.Build(inv, *node, *device, *cache)
|
||||
if err != nil {
|
||||
log.Fatalf("build plan: %v", err)
|
||||
@ -68,12 +84,24 @@ func burnCmd(args []string) {
|
||||
node := fs.String("node", "", "target node")
|
||||
device := fs.String("device", "", "target block device (e.g. /dev/sdX)")
|
||||
cache := fs.String("cache", filepath.Join(os.TempDir(), "metis-cache"), "image cache dir")
|
||||
boot := fs.String("boot", "", "mounted boot path for injection (optional)")
|
||||
root := fs.String("root", "", "mounted root path for injection (optional)")
|
||||
autoMount := fs.Bool("auto-mount", false, "auto-mount boot/root for injection (linux, requires privileges)")
|
||||
confirm := fs.Bool("yes", false, "actually write to device")
|
||||
fs.Parse(args)
|
||||
if *node == "" || *device == "" {
|
||||
log.Fatalf("--node and --device are required")
|
||||
}
|
||||
inv := loadInventory(*invPath)
|
||||
if *boot != "" {
|
||||
os.Setenv("METIS_BOOT_PATH", *boot)
|
||||
}
|
||||
if *root != "" {
|
||||
os.Setenv("METIS_ROOT_PATH", *root)
|
||||
}
|
||||
if *autoMount {
|
||||
os.Setenv("METIS_AUTO_MOUNT", "1")
|
||||
}
|
||||
p, err := plan.Execute(inv, *node, *device, *cache, *confirm)
|
||||
if err != nil {
|
||||
log.Fatalf("burn: %v", err)
|
||||
|
||||
28
cmd/metis/serve_cmd.go
Normal file
28
cmd/metis/serve_cmd.go
Normal file
@ -0,0 +1,28 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
"net/http"
|
||||
|
||||
"metis/pkg/service"
|
||||
)
|
||||
|
||||
func serveCmd(args []string) {
|
||||
fs := flag.NewFlagSet("serve", flag.ExitOnError)
|
||||
bindAddr := fs.String("bind", "", "override bind address")
|
||||
fs.Parse(args)
|
||||
|
||||
settings := service.FromEnv()
|
||||
if *bindAddr != "" {
|
||||
settings.BindAddr = *bindAddr
|
||||
}
|
||||
app, err := service.NewApp(settings)
|
||||
if err != nil {
|
||||
log.Fatalf("init service: %v", err)
|
||||
}
|
||||
log.Printf("metis listening on %s", settings.BindAddr)
|
||||
if err := http.ListenAndServe(settings.BindAddr, app.Handler()); err != nil {
|
||||
log.Fatalf("serve: %v", err)
|
||||
}
|
||||
}
|
||||
89
docs/titan-rpi4-recovery.md
Normal file
89
docs/titan-rpi4-recovery.md
Normal file
@ -0,0 +1,89 @@
|
||||
# Titan rpi4 Longhorn Recovery
|
||||
|
||||
This flow is for `titan-13`, `titan-15`, `titan-17`, and `titan-19`.
|
||||
|
||||
## Why this works
|
||||
|
||||
- The replacement card is burned from a plain Armbian rpi4 image.
|
||||
- Metis injects the original node identity, k3s config, SSH key, and Longhorn disk UUIDs.
|
||||
- The image also carries a static NetworkManager profile for the node IP plus local `k3s` and `open-iscsi` payloads sourced from a healthy rpi4 Longhorn node.
|
||||
- An Armbian first-boot hook finishes the host bootstrap automatically:
|
||||
- enables SSH on port `2277`
|
||||
- mounts `/mnt/astreae` and `/mnt/asteria`
|
||||
- ensures the iSCSI initiator identity exists
|
||||
- starts `open-iscsi`
|
||||
- starts `k3s-agent`
|
||||
- For this Armbian flow, the important recovery files live on the root partition; boot NoCloud files are optional and not required for node recovery.
|
||||
|
||||
## Before burning
|
||||
|
||||
For a same-name replacement, remove the old node object first so k3s can re-register the node cleanly.
|
||||
|
||||
```bash
|
||||
kubectl delete node titan-13
|
||||
kubectl delete node titan-19
|
||||
```
|
||||
|
||||
Then export the live cluster join token:
|
||||
|
||||
```bash
|
||||
export METIS_K3S_TOKEN="$(ssh titan-0a 'sudo cat /var/lib/rancher/k3s/server/node-token')"
|
||||
export METIS_IMAGE_RPI4_ARMBIAN_LONGHORN="file://${HOME}/Downloads/Armbian_25.8.1_Rpi4b_noble_current_6.12.41.img"
|
||||
```
|
||||
|
||||
## Burn commands
|
||||
|
||||
Inspect the merged config first:
|
||||
|
||||
```bash
|
||||
go run ./cmd/metis config --inventory inventory.titan-rpi4.yaml --node titan-13
|
||||
go run ./cmd/metis config --inventory inventory.titan-rpi4.yaml --node titan-19
|
||||
```
|
||||
|
||||
If you want ready-to-flash artifacts before inserting SD cards, build them first:
|
||||
|
||||
```bash
|
||||
go run ./cmd/metis image \
|
||||
--inventory inventory.titan-rpi4.yaml \
|
||||
--node titan-13 \
|
||||
--cache "${HOME}/.cache/metis" \
|
||||
--output artifacts/titan-13.img
|
||||
|
||||
go run ./cmd/metis image \
|
||||
--inventory inventory.titan-rpi4.yaml \
|
||||
--node titan-19 \
|
||||
--cache "${HOME}/.cache/metis" \
|
||||
--output artifacts/titan-19.img
|
||||
```
|
||||
|
||||
Burn the cards:
|
||||
|
||||
```bash
|
||||
sudo -E go run ./cmd/metis burn \
|
||||
--inventory inventory.titan-rpi4.yaml \
|
||||
--node titan-13 \
|
||||
--device /dev/sdX \
|
||||
--cache "${HOME}/.cache/metis" \
|
||||
--auto-mount \
|
||||
--yes
|
||||
|
||||
sudo -E go run ./cmd/metis burn \
|
||||
--inventory inventory.titan-rpi4.yaml \
|
||||
--node titan-19 \
|
||||
--device /dev/sdY \
|
||||
--cache "${HOME}/.cache/metis" \
|
||||
--auto-mount \
|
||||
--yes
|
||||
```
|
||||
|
||||
## After boot
|
||||
|
||||
Because the hardware stays the same, the Pi should keep the same MAC address and reclaim the same DHCP reservation.
|
||||
|
||||
Validate:
|
||||
|
||||
```bash
|
||||
kubectl get nodes | grep 'titan-13\|titan-19'
|
||||
kubectl -n longhorn-system get nodes.longhorn.io
|
||||
kubectl -n longhorn-system get replicas.longhorn.io -o wide | grep 'titan-13\|titan-19'
|
||||
```
|
||||
113
docs/titan-rpi4-remote-replacement.md
Normal file
113
docs/titan-rpi4-remote-replacement.md
Normal file
@ -0,0 +1,113 @@
|
||||
# Titan rpi4 Remote Replacement
|
||||
|
||||
This is the low-touch replacement flow for `titan-13` and `titan-19` when the
|
||||
person onsite can only:
|
||||
|
||||
1. insert an SD card into the flashing machine
|
||||
2. swap the card into the Pi
|
||||
3. power-cycle the Pi
|
||||
|
||||
The remote operator does everything else.
|
||||
|
||||
## What the image does by itself
|
||||
|
||||
After the stale Kubernetes node object is deleted and the replacement image is
|
||||
flashed, the booted Pi is expected to do the rest automatically:
|
||||
|
||||
- bring up SSH on port `2277`
|
||||
- set the node hostname
|
||||
- bring up the node's static `192.168.22.x` address on `end0`
|
||||
- mount `/mnt/astreae` and `/mnt/asteria`
|
||||
- start `open-iscsi`
|
||||
- start `k3s-agent`
|
||||
- rejoin the cluster with the baked-in node token and server URL
|
||||
|
||||
## Version clarification
|
||||
|
||||
As of **March 31, 2026**, the live cluster reports:
|
||||
|
||||
- control plane: `k3s v1.33.3+k3s1`
|
||||
- healthy rpi4 Longhorn workers (`titan-15`, `titan-17`): `k3s v1.31.5+k3s1`
|
||||
|
||||
The `6.6.63` and `6.12.41` numbers are Linux kernel versions, not Kubernetes
|
||||
versions.
|
||||
|
||||
Kubernetes' official version skew policy says a `kubelet` may be up to three
|
||||
minor versions older than the `kube-apiserver`, so `1.31` workers against a
|
||||
`1.33` control plane are supported today:
|
||||
|
||||
- https://kubernetes.io/releases/version-skew-policy/
|
||||
|
||||
The replacement images intentionally keep the rpi4 worker `k3s` version aligned
|
||||
with the healthy HDD-backed rpi4 workers to avoid introducing a Kubernetes minor
|
||||
change during node recovery.
|
||||
|
||||
## Remote flashing flow
|
||||
|
||||
Run these commands from the machine that has the `metis` repo and your SSH
|
||||
access.
|
||||
|
||||
### 1. Build the image and delete the stale node object
|
||||
|
||||
```bash
|
||||
cd ~/Development/metis
|
||||
./scripts/prepare_titan_rpi4_replacement.sh titan-13 titan-22
|
||||
./scripts/prepare_titan_rpi4_replacement.sh titan-19 titan-22
|
||||
```
|
||||
|
||||
This does all of the following:
|
||||
|
||||
- fetches the current cluster node token from `titan-0a`
|
||||
- deletes the stale Kubernetes `Node` object
|
||||
- builds the replacement image under `artifacts/`
|
||||
- copies it to `titan-22:/tmp/metis-images/`
|
||||
|
||||
### 2. Ask the onsite helper to insert the SD card into `titan-22`
|
||||
|
||||
When the card is inserted, identify the target device:
|
||||
|
||||
```bash
|
||||
./scripts/remote_sd_candidates.sh titan-22
|
||||
```
|
||||
|
||||
### 3. Flash the card remotely
|
||||
|
||||
```bash
|
||||
./scripts/remote_flash_titan_image.sh titan-22 titan-13 /dev/sdX
|
||||
./scripts/remote_flash_titan_image.sh titan-22 titan-19 /dev/sdY
|
||||
```
|
||||
|
||||
The remote machine will ask for its `sudo` password during the flash.
|
||||
|
||||
### 4. Ask the onsite helper to swap the card and power-cycle the Pi
|
||||
|
||||
That should be the end of the onsite work.
|
||||
|
||||
### 5. Validate remotely
|
||||
|
||||
```bash
|
||||
kubectl get nodes -w
|
||||
kubectl -n longhorn-system get nodes.longhorn.io
|
||||
kubectl -n longhorn-system get replicas.longhorn.io -o wide | grep 'titan-13\|titan-19'
|
||||
ssh titan-13
|
||||
ssh titan-19
|
||||
```
|
||||
|
||||
## USB boot
|
||||
|
||||
Raspberry Pi 4 supports USB mass storage boot via its EEPROM bootloader:
|
||||
|
||||
- https://www.raspberrypi.com/documentation/computers/raspberry-pi.html#usb-mass-storage-boot
|
||||
|
||||
That means the same general recovery image approach can be used on a USB device
|
||||
instead of an SD card.
|
||||
|
||||
For this cluster, the safer rollout is:
|
||||
|
||||
1. first recover `titan-13` and `titan-19` to known-good SD cards
|
||||
2. pilot USB boot on one non-critical rpi4
|
||||
3. only then migrate the Longhorn HDD-backed rpi4s
|
||||
|
||||
USB boot is attractive for wear reduction, but it adds EEPROM boot-order,
|
||||
adapter, and power-delivery variables. The emergency replacement process above
|
||||
should stay SD-based until the USB path has been tested on your actual hardware.
|
||||
@ -5,6 +5,8 @@ classes:
|
||||
os: ubuntu-24.04
|
||||
image: https://harbor.bstein.dev/library/rpi5-ubuntu-worker.img
|
||||
checksum: sha256:REPLACE_ME
|
||||
boot_overlay: overlays/rpi5-boot
|
||||
root_overlay: overlays/rpi5-root
|
||||
default_labels:
|
||||
hardware: rpi5
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
@ -14,6 +16,8 @@ classes:
|
||||
os: armbian-6.6
|
||||
image: https://harbor.bstein.dev/library/rpi4-armbian-longhorn.img
|
||||
checksum: sha256:REPLACE_ME
|
||||
boot_overlay: overlays/rpi4-boot
|
||||
root_overlay: overlays/rpi4-root
|
||||
default_labels:
|
||||
hardware: rpi4
|
||||
longhorn: "true"
|
||||
@ -24,10 +28,24 @@ classes:
|
||||
os: ubuntu-24.04
|
||||
image: https://harbor.bstein.dev/library/rpi5-ubuntu-control.img
|
||||
checksum: sha256:REPLACE_ME
|
||||
boot_overlay: overlays/cp-boot
|
||||
root_overlay: overlays/cp-root
|
||||
default_labels:
|
||||
node-role.kubernetes.io/control-plane: "true"
|
||||
default_taints:
|
||||
- node-role.kubernetes.io/control-plane:NoSchedule
|
||||
- name: jetson-accelerator
|
||||
arch: arm64
|
||||
os: ubuntu-20.04-tegra
|
||||
image: https://harbor.bstein.dev/library/jetson-accelerator.img
|
||||
checksum: sha256:REPLACE_ME
|
||||
boot_overlay: overlays/jetson-boot
|
||||
root_overlay: overlays/jetson-root
|
||||
default_labels:
|
||||
accelerator: nvidia
|
||||
jetson: "true"
|
||||
node-role.kubernetes.io/accelerator: ""
|
||||
default_taints: []
|
||||
|
||||
nodes:
|
||||
- name: titan-04
|
||||
@ -54,3 +72,21 @@ nodes:
|
||||
uuid: cbd4989d-62b5-4741-8b2a-28fdae259cae
|
||||
fs: ext4
|
||||
ssh_user: root
|
||||
- name: titan-20
|
||||
class: jetson-accelerator
|
||||
hostname: titan-20
|
||||
ip: 192.168.22.20
|
||||
k3s_role: agent
|
||||
labels:
|
||||
accelerator: nvidia
|
||||
jetson: "true"
|
||||
ssh_user: ubuntu
|
||||
- name: titan-21
|
||||
class: jetson-accelerator
|
||||
hostname: titan-21
|
||||
ip: 192.168.22.21
|
||||
k3s_role: agent
|
||||
labels:
|
||||
accelerator: nvidia
|
||||
jetson: "true"
|
||||
ssh_user: ubuntu
|
||||
|
||||
81
inventory.titan-rpi4.yaml
Normal file
81
inventory.titan-rpi4.yaml
Normal file
@ -0,0 +1,81 @@
|
||||
classes:
|
||||
- name: rpi4-armbian-longhorn
|
||||
arch: arm64
|
||||
os: armbian-noble
|
||||
image: ${METIS_IMAGE_RPI4_ARMBIAN_LONGHORN}
|
||||
checksum: ${METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256}
|
||||
k3s_version: v1.31.5+k3s1
|
||||
default_labels:
|
||||
hardware: rpi4
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
root_overlay: overlays/rpi4-armbian-longhorn-root
|
||||
|
||||
nodes:
|
||||
- name: titan-13
|
||||
class: rpi4-armbian-longhorn
|
||||
hostname: titan-13
|
||||
ip: 192.168.22.41
|
||||
k3s_role: agent
|
||||
k3s_url: https://192.168.22.7:6443
|
||||
k3s_token: ${METIS_K3S_TOKEN}
|
||||
ssh_user: atlas
|
||||
ssh_authorized_keys:
|
||||
- ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOb8oMX6u0z3sH/p/WBGlvPXXdbGETCKzWYwR/dd6fZb titan-bastion
|
||||
longhorn_disks:
|
||||
- mountpoint: /mnt/astreae
|
||||
uuid: 6031fa8b-f28c-45c3-b7bc-6133300e07c6
|
||||
fs: ext4
|
||||
- mountpoint: /mnt/asteria
|
||||
uuid: cbd4989d-62b5-4741-8b2a-28fdae259cae
|
||||
fs: ext4
|
||||
- name: titan-15
|
||||
class: rpi4-armbian-longhorn
|
||||
hostname: titan-15
|
||||
ip: 192.168.22.43
|
||||
k3s_role: agent
|
||||
k3s_url: https://192.168.22.7:6443
|
||||
k3s_token: ${METIS_K3S_TOKEN}
|
||||
ssh_user: atlas
|
||||
ssh_authorized_keys:
|
||||
- ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOb8oMX6u0z3sH/p/WBGlvPXXdbGETCKzWYwR/dd6fZb titan-bastion
|
||||
longhorn_disks:
|
||||
- mountpoint: /mnt/astreae
|
||||
uuid: f3362f14-5822-449f-944b-ac570b5cd615
|
||||
fs: ext4
|
||||
- mountpoint: /mnt/asteria
|
||||
uuid: 9c5316e6-f847-4884-b502-11f2d0d15d6f
|
||||
fs: ext4
|
||||
- name: titan-17
|
||||
class: rpi4-armbian-longhorn
|
||||
hostname: titan-17
|
||||
ip: 192.168.22.45
|
||||
k3s_role: agent
|
||||
k3s_url: https://192.168.22.7:6443
|
||||
k3s_token: ${METIS_K3S_TOKEN}
|
||||
ssh_user: atlas
|
||||
ssh_authorized_keys:
|
||||
- ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOb8oMX6u0z3sH/p/WBGlvPXXdbGETCKzWYwR/dd6fZb titan-bastion
|
||||
longhorn_disks:
|
||||
- mountpoint: /mnt/astreae
|
||||
uuid: 1fecdade-08b0-49cb-9ae3-be6c188b0a96
|
||||
fs: ext4
|
||||
- mountpoint: /mnt/asteria
|
||||
uuid: 2fe9f613-d372-47ca-b84f-82084e4edda0
|
||||
fs: ext4
|
||||
- name: titan-19
|
||||
class: rpi4-armbian-longhorn
|
||||
hostname: titan-19
|
||||
ip: 192.168.22.47
|
||||
k3s_role: agent
|
||||
k3s_url: https://192.168.22.7:6443
|
||||
k3s_token: ${METIS_K3S_TOKEN}
|
||||
ssh_user: atlas
|
||||
ssh_authorized_keys:
|
||||
- ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOb8oMX6u0z3sH/p/WBGlvPXXdbGETCKzWYwR/dd6fZb titan-bastion
|
||||
longhorn_disks:
|
||||
- mountpoint: /mnt/astreae
|
||||
uuid: 4890abb9-dda2-4f4f-9c0f-081ee82849cf
|
||||
fs: ext4
|
||||
- mountpoint: /mnt/asteria
|
||||
uuid: 2b4ea28d-b0e6-4fa3-841b-cd7067ae9153
|
||||
fs: ext4
|
||||
67
overlays/rpi4-armbian-longhorn-root/etc/default/open-iscsi
Normal file
67
overlays/rpi4-armbian-longhorn-root/etc/default/open-iscsi
Normal file
@ -0,0 +1,67 @@
|
||||
# List of LVMed iSCSI Volume Groups.
|
||||
# Multiple Volume Groups can be specified with spaces
|
||||
#
|
||||
# This list defines the Volume Groups that should be activated at boot
|
||||
# after iSCSI has been activated. If you use dynamic activation of LVM
|
||||
# volumes (lvmetad), you can (and should) leave this empty.
|
||||
#
|
||||
# On shutdown, this setting typically has no effect, since open-iscsi
|
||||
# tries to determine all active VGs on iSCSI and deactivate them.
|
||||
# However, if you have a really complicated stacking setup that isn't
|
||||
# automatically detected, volume groups defined here will also be
|
||||
# deactivated.
|
||||
#
|
||||
# To see whether open-iscsi is able to properly detect your setup for
|
||||
# shutdown, execute the following on a running system:
|
||||
# /lib/open-iscsi/umountiscsi.sh --dry-run
|
||||
# This will tell you what steps will be taken at shutdown before logging
|
||||
# out of the iSCSI session.
|
||||
LVMGROUPS=""
|
||||
|
||||
|
||||
# Handle _netdev devices
|
||||
# You can specify your iSCSI (LVMed or Multipathed or DM Encrypted)
|
||||
# devices with the _netdev mount option and open-iscsi will treat them
|
||||
# accordingly.
|
||||
#
|
||||
# Note: however, handling _netdev devices comes with the caveat that
|
||||
# other _netdev mounts, like an NFS share, also get pulled in with it.
|
||||
#
|
||||
# If this option is set to 0, no iSCSI mounts in /etc/fstab will be
|
||||
# automatically mounted on systems running sysvinit. This setting is
|
||||
# not necessary when using systemd as init system (Debian's default).
|
||||
HANDLE_NETDEV=1
|
||||
|
||||
|
||||
# Additional mounts to exclude at shutdown.
|
||||
#
|
||||
# If you have additional mounts on iSCSI that shouldn't be umounted at
|
||||
# shutdown by open-iscsi (by default, open-iscsi excludes / and on
|
||||
# systemd systems also /usr), place them here. iSCSI sessions that carry
|
||||
# these mounts will also be kept open.
|
||||
#
|
||||
# If any of these mountpoints contain spaces, please use the same
|
||||
# escaping as in /etc/fstab, i.e. replace the spaces with \040.
|
||||
EXCLUDE_MOUNTS_AT_SHUTDOWN=""
|
||||
|
||||
|
||||
|
||||
# Don't logout from ANY iSCSI session on shutdown
|
||||
#
|
||||
# When shutting down, if the root filesystem is on iSCSI, open-iscsi
|
||||
# tries to determine which sessions are still required for the root
|
||||
# filesystem. By default, the host will still logout from all other
|
||||
# sessions.
|
||||
#
|
||||
# If you are running a very complicated setup of your root filesystem
|
||||
# (multiple mapping levels stacked on top of each other), it may be the
|
||||
# case that the autodetection logic doesn't work properly. You may then
|
||||
# enable this setting to keep around all iSCSI sessions.
|
||||
#
|
||||
# Note that /etc/iscsi/iscsi.initramfs must exist for this option to
|
||||
# have any effect at all.
|
||||
#
|
||||
# This was the default behavior in previous versions of this package
|
||||
# up to the version that shipped with Debian 8 (Jessie).
|
||||
#
|
||||
ISCSI_ROOT_KEEP_ALL_SESSIONS_AT_SHUTDOWN=0
|
||||
361
overlays/rpi4-armbian-longhorn-root/etc/iscsi/iscsid.conf
Normal file
361
overlays/rpi4-armbian-longhorn-root/etc/iscsi/iscsid.conf
Normal file
@ -0,0 +1,361 @@
|
||||
#
|
||||
# Open-iSCSI default configuration.
|
||||
#
|
||||
# Note: To set any of these values for a specific node/session run
|
||||
# the iscsiadm --mode node --op command for the value. See the README
|
||||
# and man page for iscsiadm for details on the --op command.
|
||||
#
|
||||
|
||||
######################
|
||||
# iscsid daemon config
|
||||
######################
|
||||
#
|
||||
# If you want iscsid to start the first time an iscsi tool
|
||||
# needs to access it, instead of starting it when the init
|
||||
# scripts run, set the iscsid startup command here. This
|
||||
# should normally only need to be done by distro package
|
||||
# maintainers. If you leave the iscsid daemon running all
|
||||
# the time then leave this attribute commented out.
|
||||
#
|
||||
# Default for Fedora and RHEL. Uncomment to activate.
|
||||
# iscsid.startup = /bin/systemctl start iscsid.socket iscsiuio.socket
|
||||
#
|
||||
# Default for Debian and Ubuntu. Uncomment to activate.
|
||||
iscsid.startup = /bin/systemctl start iscsid.socket
|
||||
#
|
||||
# Default if you are not using systemd. Uncomment to activate.
|
||||
# iscsid.startup = /usr/bin/service start iscsid
|
||||
|
||||
# Check for active mounts on devices reachable through a session
|
||||
# and refuse to logout if there are any. Defaults to "No".
|
||||
# iscsid.safe_logout = Yes
|
||||
|
||||
# Only require UID auth for MGMT IPCs, and not username.
|
||||
# Checking username is a legacy security practice, and is on the path
|
||||
# to deprecation.
|
||||
# Set to "No" for legacy compatibility.
|
||||
# Defaults to "Yes".
|
||||
# iscsid.ipc_auth_uid = No
|
||||
|
||||
#############################
|
||||
# NIC/HBA and driver settings
|
||||
#############################
|
||||
# open-iscsi can create a session and bind it to a NIC/HBA.
|
||||
# To set this up see the example iface config file.
|
||||
|
||||
#*****************
|
||||
# Startup settings
|
||||
#*****************
|
||||
|
||||
# To request that the iscsi service scripts startup a session, use "automatic":
|
||||
# node.startup = automatic
|
||||
#
|
||||
# To manually startup the session, use "manual". The default is manual.
|
||||
node.startup = manual
|
||||
|
||||
# For "automatic" startup nodes, setting this to "Yes" will try logins on each
|
||||
# available iface until one succeeds, and then stop. The default "No" will try
|
||||
# logins on all available ifaces simultaneously.
|
||||
node.leading_login = No
|
||||
|
||||
# *************
|
||||
# CHAP Settings
|
||||
# *************
|
||||
|
||||
# To enable CHAP authentication set node.session.auth.authmethod
|
||||
# to CHAP. The default is None.
|
||||
#node.session.auth.authmethod = CHAP
|
||||
|
||||
# To configure which CHAP algorithms to enable, set
|
||||
# node.session.auth.chap_algs to a comma separated list.
|
||||
# The algorithms should be listed in order of decreasing
|
||||
# preference — in particular, with the most preferred algorithm first.
|
||||
# Valid values are MD5, SHA1, SHA256, and SHA3-256.
|
||||
# The default is MD5.
|
||||
#node.session.auth.chap_algs = SHA3-256,SHA256,SHA1,MD5
|
||||
|
||||
# To set a CHAP username and password for initiator
|
||||
# authentication by the target(s), uncomment the following lines:
|
||||
#node.session.auth.username = username
|
||||
#node.session.auth.password = password
|
||||
|
||||
# To set a CHAP username and password for target(s)
|
||||
# authentication by the initiator, uncomment the following lines:
|
||||
#node.session.auth.username_in = username_in
|
||||
#node.session.auth.password_in = password_in
|
||||
|
||||
# To enable CHAP authentication for a discovery session to the target,
|
||||
# set discovery.sendtargets.auth.authmethod to CHAP. The default is None.
|
||||
#discovery.sendtargets.auth.authmethod = CHAP
|
||||
|
||||
# To set a discovery session CHAP username and password for the initiator
|
||||
# authentication by the target(s), uncomment the following lines:
|
||||
#discovery.sendtargets.auth.username = username
|
||||
#discovery.sendtargets.auth.password = password
|
||||
|
||||
# To set a discovery session CHAP username and password for target(s)
|
||||
# authentication by the initiator, uncomment the following lines:
|
||||
#discovery.sendtargets.auth.username_in = username_in
|
||||
#discovery.sendtargets.auth.password_in = password_in
|
||||
|
||||
# ********
|
||||
# Timeouts
|
||||
# ********
|
||||
#
|
||||
# See the iSCSI README's Advanced Configuration section for tips
|
||||
# on setting timeouts when using multipath or doing root over iSCSI.
|
||||
#
|
||||
# To specify the length of time to wait for session re-establishment
|
||||
# before failing SCSI commands back to the application when running
|
||||
# the Linux SCSI Layer error handler, edit the line.
|
||||
# The value is in seconds and the default is 120 seconds.
|
||||
# Special values:
|
||||
# - If the value is 0, IO will be failed immediately.
|
||||
# - If the value is less than 0, IO will remain queued until the session
|
||||
# is logged back in, or until the user runs the logout command.
|
||||
node.session.timeo.replacement_timeout = 120
|
||||
|
||||
# To specify the time to wait for login to complete, edit the line.
|
||||
# The value is in seconds and the default is 15 seconds.
|
||||
node.conn[0].timeo.login_timeout = 15
|
||||
|
||||
# To specify the time to wait for logout to complete, edit the line.
|
||||
# The value is in seconds and the default is 15 seconds.
|
||||
node.conn[0].timeo.logout_timeout = 15
|
||||
|
||||
# Time interval to wait for on connection before sending a ping.
|
||||
# The value is in seconds and the default is 5 seconds.
|
||||
node.conn[0].timeo.noop_out_interval = 5
|
||||
|
||||
# To specify the time to wait for a Nop-out response before failing
|
||||
# the connection, edit this line. Failing the connection will
|
||||
# cause IO to be failed back to the SCSI layer. If using dm-multipath
|
||||
# this will cause the IO to be failed to the multipath layer.
|
||||
# The value is in seconds and the default is 5 seconds.
|
||||
node.conn[0].timeo.noop_out_timeout = 5
|
||||
|
||||
# To specify the time to wait for an abort response before
|
||||
# failing the operation and trying a logical unit reset, edit the line.
|
||||
# The value is in seconds and the default is 15 seconds.
|
||||
node.session.err_timeo.abort_timeout = 15
|
||||
|
||||
# To specify the time to wait for a logical unit response
|
||||
# before failing the operation and trying session re-establishment,
|
||||
# edit the line.
|
||||
# The value is in seconds and the default is 30 seconds.
|
||||
node.session.err_timeo.lu_reset_timeout = 30
|
||||
|
||||
# To specify the time to wait for a target response
|
||||
# before failing the operation and trying session re-establishment,
|
||||
# edit the line.
|
||||
# The value is in seconds and the default is 30 seconds.
|
||||
node.session.err_timeo.tgt_reset_timeout = 30
|
||||
|
||||
# The value is in seconds and the default is 60 seconds.
|
||||
node.session.err_timeo.host_reset_timeout = 60
|
||||
|
||||
|
||||
#******
|
||||
# Retry
|
||||
#******
|
||||
|
||||
# To specify the number of times iscsid should retry a login
|
||||
# if the login attempt fails due to the node.conn[0].timeo.login_timeout
|
||||
# expiring, modify the following line. Note that if the login fails
|
||||
# quickly (before node.conn[0].timeo.login_timeout fires) because the network
|
||||
# layer or the target returns an error, iscsid may retry the login more than
|
||||
# node.session.initial_login_retry_max times.
|
||||
#
|
||||
# This retry count along with node.conn[0].timeo.login_timeout
|
||||
# determines the maximum amount of time iscsid will try to
|
||||
# establish the initial login. node.session.initial_login_retry_max is
|
||||
# multiplied by the node.conn[0].timeo.login_timeout to determine the
|
||||
# maximum amount.
|
||||
#
|
||||
# The default node.session.initial_login_retry_max is 8 and
|
||||
# node.conn[0].timeo.login_timeout is 15 so we have:
|
||||
#
|
||||
# node.conn[0].timeo.login_timeout * node.session.initial_login_retry_max = 120s
|
||||
#
|
||||
# Valid values are any integer value. This only
|
||||
# affects the initial login. Setting it to a high value can slow
|
||||
# down the iscsi service startup. Setting it to a low value can
|
||||
# cause a session to not get logged into, if there are disruptions
|
||||
# during startup or if the network is not ready at that time.
|
||||
node.session.initial_login_retry_max = 8
|
||||
|
||||
################################
|
||||
# session and device queue depth
|
||||
################################
|
||||
|
||||
# To control how many commands the session will queue, set
|
||||
# node.session.cmds_max to an integer between 2 and 2048 that is also
|
||||
# a power of 2. The default is 128.
|
||||
node.session.cmds_max = 128
|
||||
|
||||
# To control the device's queue depth, set node.session.queue_depth
|
||||
# to a value between 1 and 1024. The default is 32.
|
||||
node.session.queue_depth = 32
|
||||
|
||||
##################################
|
||||
# MISC SYSTEM PERFORMANCE SETTINGS
|
||||
##################################
|
||||
|
||||
# For software iscsi (iscsi_tcp) and iser (ib_iser), each session
|
||||
# has a thread used to transmit or queue data to the hardware. For
|
||||
# cxgb3i, you will get a thread per host.
|
||||
#
|
||||
# Setting the thread's priority to a lower value can lead to higher throughput
|
||||
# and lower latencies. The lowest value is -20. Setting the priority to
|
||||
# a higher value, can lead to reduced IO performance, but if you are seeing
|
||||
# the iscsi or scsi threads dominate the use of the CPU then you may want
|
||||
# to set this value higher.
|
||||
#
|
||||
# Note: For cxgb3i, you must set all sessions to the same value.
|
||||
# Otherwise the behavior is not defined.
|
||||
#
|
||||
# This is done by scanning /proc/PID/stat, and this doesn't work in
|
||||
# newer kernels (6.* on), as the workqueue transmit thread can be
|
||||
# passive, and not show up in the process table when not actively
|
||||
# doing work. If the proper workqueue process is found, and the
|
||||
# priority value is non-zero, then the priority of that process will
|
||||
# be modified when a session is created.
|
||||
#
|
||||
# Note: as mentioned above, the default value is now zero, which means
|
||||
# that we don't do anything to the transmit workqueue process priority,
|
||||
# by default. If you wish to get the previous behavior, set this value
|
||||
# to -20. In the future, this functionality will be removed, once this
|
||||
# functionality is no longer needed or works.
|
||||
#
|
||||
# The default value is 0. The setting must be between -20 and 20.
|
||||
# node.session.xmit_thread_priority = 0
|
||||
|
||||
|
||||
#***************
|
||||
# iSCSI settings
|
||||
#***************
|
||||
|
||||
# To enable R2T flow control (i.e., the initiator must wait for an R2T
|
||||
# command before sending any data), uncomment the following line:
|
||||
#
|
||||
#node.session.iscsi.InitialR2T = Yes
|
||||
#
|
||||
# To disable R2T flow control (i.e., the initiator has an implied
|
||||
# initial R2T of "FirstBurstLength" at offset 0), uncomment the following line:
|
||||
#
|
||||
# The defaults is No.
|
||||
node.session.iscsi.InitialR2T = No
|
||||
|
||||
#
|
||||
# To disable immediate data (i.e., the initiator does not send
|
||||
# unsolicited data with the iSCSI command PDU), uncomment the following line:
|
||||
#
|
||||
#node.session.iscsi.ImmediateData = No
|
||||
#
|
||||
# To enable immediate data (i.e., the initiator sends unsolicited data
|
||||
# with the iSCSI command packet), uncomment the following line:
|
||||
#
|
||||
# The default is Yes.
|
||||
node.session.iscsi.ImmediateData = Yes
|
||||
|
||||
# To specify the maximum number of unsolicited data bytes the initiator
|
||||
# can send in an iSCSI PDU to a target, edit the following line.
|
||||
#
|
||||
# The value is the number of bytes in the range of 512 to (2^24-1) and
|
||||
# the default is 262144.
|
||||
node.session.iscsi.FirstBurstLength = 262144
|
||||
|
||||
# To specify the maximum SCSI payload that the initiator will negotiate
|
||||
# with the target for, edit the following line.
|
||||
#
|
||||
# The value is the number of bytes in the range of 512 to (2^24-1) and
|
||||
# the default is 16776192.
|
||||
node.session.iscsi.MaxBurstLength = 16776192
|
||||
|
||||
# To specify the maximum number of data bytes the initiator can receive
|
||||
# in an iSCSI PDU from a target, edit the following line.
|
||||
#
|
||||
# The value is the number of bytes in the range of 512 to (2^24-1) and
|
||||
# the default is 262144.
|
||||
node.conn[0].iscsi.MaxRecvDataSegmentLength = 262144
|
||||
|
||||
# To specify the maximum number of data bytes the initiator will send
|
||||
# in an iSCSI PDU to the target, edit the following line.
|
||||
#
|
||||
# The value is the number of bytes in the range of 512 to (2^24-1).
|
||||
# Zero is a special case. If set to zero, the initiator will use
|
||||
# the target's MaxRecvDataSegmentLength for the MaxXmitDataSegmentLength.
|
||||
# The default is 0.
|
||||
node.conn[0].iscsi.MaxXmitDataSegmentLength = 0
|
||||
|
||||
# To specify the maximum number of data bytes the initiator can receive
|
||||
# in an iSCSI PDU from a target during a discovery session, edit the
|
||||
# following line.
|
||||
#
|
||||
# The value is the number of bytes in the range of 512 to (2^24-1) and
|
||||
# the default is 32768.
|
||||
discovery.sendtargets.iscsi.MaxRecvDataSegmentLength = 32768
|
||||
|
||||
# To allow the targets to control the setting of the digest checking,
|
||||
# with the initiator requesting a preference of enabling the checking,
|
||||
# uncomment one or both of the following lines:
|
||||
#node.conn[0].iscsi.HeaderDigest = CRC32C,None
|
||||
#node.conn[0].iscsi.DataDigest = CRC32C,None
|
||||
#
|
||||
# To allow the targets to control the setting of the digest checking,
|
||||
# with the initiator requesting a preference of disabling the checking,
|
||||
# uncomment one or both of the following lines:
|
||||
#node.conn[0].iscsi.HeaderDigest = None,CRC32C
|
||||
#node.conn[0].iscsi.DataDigest = None,CRC32C
|
||||
#
|
||||
# To enable CRC32C digest checking for the header and/or data part of
|
||||
# iSCSI PDUs, uncomment one or both of the following lines:
|
||||
#node.conn[0].iscsi.HeaderDigest = CRC32C
|
||||
#node.conn[0].iscsi.DataDigest = CRC32C
|
||||
#
|
||||
# To disable digest checking for the header and/or data part of
|
||||
# iSCSI PDUs, uncomment one or both of the following lines:
|
||||
#node.conn[0].iscsi.HeaderDigest = None
|
||||
#node.conn[0].iscsi.DataDigest = None
|
||||
#
|
||||
# The default is to never use DataDigests or HeaderDigests.
|
||||
#
|
||||
|
||||
# For multipath configurations, you may want more than one session to be
|
||||
# created on each iface record. If node.session.nr_sessions is greater
|
||||
# than 1, performing a 'login' for that node will ensure that the
|
||||
# appropriate number of sessions is created.
|
||||
node.session.nr_sessions = 1
|
||||
|
||||
# When iscsid starts up, it recovers existing sessions (if possible).
|
||||
# If the target for a session has gone away when this occurs, the
|
||||
# iscsid daemon normally tries to reestablish each session,
|
||||
# in succession, in the background, by trying again every two
|
||||
# seconds until all sessions are restored. This configuration
|
||||
# variable can limit the number of retries for each session.
|
||||
# For example, setting reopen_max=150 would mean that each session
|
||||
# recovery was limited to about five minutes.
|
||||
node.session.reopen_max = 0
|
||||
|
||||
#************
|
||||
# Workarounds
|
||||
#************
|
||||
|
||||
# Some targets like IET prefer that an initiator does not respond to PDUs like
|
||||
# R2Ts after it has sent a task management function like an ABORT TASK or a
|
||||
# LOGICAL UNIT RESET. To adopt this behavior, uncomment the following line.
|
||||
# The default is Yes.
|
||||
node.session.iscsi.FastAbort = Yes
|
||||
|
||||
# Some targets like Equalogic prefer that an initiator continue to respond to
|
||||
# R2Ts after it has sent a task management function like an ABORT TASK or a
|
||||
# LOGICAL UNIT RESET. To adopt this behavior, uncomment the following line.
|
||||
# node.session.iscsi.FastAbort = No
|
||||
|
||||
# To prevent doing automatic scans that would add unwanted luns to the system,
|
||||
# we can disable them and have sessions only do manually requested scans.
|
||||
# Automatic scans are performed on startup, on login, and on AEN/AER reception
|
||||
# on devices supporting it. For HW drivers, all sessions will use the value
|
||||
# defined in the configuration file. This configuration option is independent
|
||||
# of the scsi_mod.scan parameter. The default is auto.
|
||||
node.session.scan = auto
|
||||
@ -0,0 +1,3 @@
|
||||
Port 2277
|
||||
PasswordAuthentication no
|
||||
PermitRootLogin prohibit-password
|
||||
@ -0,0 +1,2 @@
|
||||
[Service]
|
||||
ExecStartPost=/usr/local/sbin/metis-rpi4-longhorn-firstboot.sh
|
||||
@ -0,0 +1,27 @@
|
||||
[Unit]
|
||||
Description=Lightweight Kubernetes
|
||||
Documentation=https://k3s.io
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
EnvironmentFile=-/etc/default/%N
|
||||
EnvironmentFile=-/etc/sysconfig/%N
|
||||
EnvironmentFile=-/etc/systemd/system/k3s-agent.service.env
|
||||
KillMode=process
|
||||
Delegate=yes
|
||||
LimitNOFILE=1048576
|
||||
LimitNPROC=infinity
|
||||
LimitCORE=infinity
|
||||
TasksMax=infinity
|
||||
TimeoutStartSec=0
|
||||
Restart=always
|
||||
RestartSec=5s
|
||||
ExecStartPre=/bin/sh -xc '! /usr/bin/systemctl is-enabled --quiet nm-cloud-setup.service 2>/dev/null'
|
||||
ExecStartPre=-/sbin/modprobe br_netfilter
|
||||
ExecStartPre=-/sbin/modprobe overlay
|
||||
ExecStart=/usr/local/bin/k3s agent
|
||||
@ -0,0 +1,4 @@
|
||||
[Service]
|
||||
Environment="K3S_KUBELET_ARG=image-gc-high-threshold=65"
|
||||
Environment="K3S_KUBELET_ARG=image-gc-low-threshold=50"
|
||||
Environment="K3S_KUBELET_ARG=image-gc-minimum-available=8Gi"
|
||||
@ -0,0 +1,3 @@
|
||||
[Service]
|
||||
Environment="K3S_KUBELET_ARG=container-log-max-size=10Mi"
|
||||
Environment="K3S_KUBELET_ARG=container-log-max-files=2"
|
||||
@ -0,0 +1,2 @@
|
||||
[Service]
|
||||
LimitNOFILE=1048576
|
||||
Binary file not shown.
@ -0,0 +1 @@
|
||||
libopeniscsiusr.so.0.2.0
|
||||
Binary file not shown.
80
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/activate-storage.sh
Executable file
80
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/activate-storage.sh
Executable file
@ -0,0 +1,80 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# This script activates storage at boot after the iSCSI login. It can
|
||||
# be called from both the init script as well as the native systemd
|
||||
# service.
|
||||
#
|
||||
|
||||
PATH=/usr/sbin:/sbin:/usr/bin:/bin
|
||||
|
||||
MULTIPATH=/sbin/multipath
|
||||
VGCHANGE=/sbin/vgchange
|
||||
|
||||
if [ -f /etc/default/open-iscsi ]; then
|
||||
. /etc/default/open-iscsi
|
||||
fi
|
||||
|
||||
# See if we need to handle LVM
|
||||
if [ ! -x $VGCHANGE ] && [ -n "$LVMGROUPS" ]; then
|
||||
echo "Warning: LVM2 tools are not installed, not honouring LVMGROUPS." >&2
|
||||
LVMGROUPS=""
|
||||
fi
|
||||
|
||||
# If we don't have to activate any VGs and are running systemd, we
|
||||
# don't have to activate anything, so doing udevadm settle here and
|
||||
# potentially sleeping (if multipath is used) will not be productive,
|
||||
# because after waiting for both of these things, we will do nothing.
|
||||
# Therefore just drop out early if that is the case.
|
||||
if [ -d /run/systemd/system ] && [ -z "$LVMGROUPS" ] ; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Make sure we pick up all devices
|
||||
udevadm settle || true
|
||||
|
||||
# Work around race condition here: after udevadm settle it is
|
||||
# guaranteed that all iSCSI disks have now properly appeared, but
|
||||
# other dependent devices may not have. This can include multipath
|
||||
# mappings of iSCSI devices (multipathd will race against udev for
|
||||
# locking the underlying source block devices when it comes to
|
||||
# creating the mappings, and it will retry the lock only once per
|
||||
# second, and typically succeed only on second try), but also
|
||||
# partitions on the given disks (which the kernel scans
|
||||
# asynchronously).
|
||||
#
|
||||
# The proper way of handling this is to have LVM activation and/or
|
||||
# mounting of file systems be handled in a completely event-driven
|
||||
# manner, but that requires configuration by the sysadmin in the
|
||||
# case of LVM, and for mounting it only works with systemd at the
|
||||
# moment. For compatibility with how the package handled this
|
||||
# previously, we will work around this race for a while longer.
|
||||
|
||||
if [ -x $MULTIPATH ] ; then
|
||||
# 1 second is too short for multipath devices to appear,
|
||||
# because multipathd takes more than 1s to activate them
|
||||
# after udevadm settle is done.
|
||||
sleep 3
|
||||
else
|
||||
sleep 1
|
||||
fi
|
||||
udevadm settle || true
|
||||
|
||||
# Handle LVM
|
||||
if [ -n "$LVMGROUPS" ] ; then
|
||||
if ! $VGCHANGE -ay $LVMGROUPS ; then
|
||||
echo "Warning: could not activate all LVM groups." >&2
|
||||
fi
|
||||
# Make sure we pick up all LVM devices
|
||||
udevadm settle || true
|
||||
fi
|
||||
|
||||
# Mount all network filesystems
|
||||
# (systemd takes care of it directly, so don't do it there)
|
||||
if ! [ -d /run/systemd/system ] ; then
|
||||
if [ $HANDLE_NETDEV -eq 1 ] ; then
|
||||
mount -a -O _netdev >/dev/null 2>&1 || true
|
||||
# FIXME: should we really support swap on iSCSI?
|
||||
# If so, we should update umountiscsi.sh!
|
||||
swapon -a -e >/dev/null 2>&1 || true
|
||||
fi
|
||||
fi
|
||||
68
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/logout-all.sh
Executable file
68
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/logout-all.sh
Executable file
@ -0,0 +1,68 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# This script logs out from all active iSCSI sessions, excluding those
|
||||
# listed in /run/open-iscsi/shutdown-keep-sessions. That file is
|
||||
# generated by umountiscsi.sh and determines which sessions should not
|
||||
# be terminated.
|
||||
#
|
||||
|
||||
ISCSIADM=/usr/sbin/iscsiadm
|
||||
PIDFILE=/run/iscsid.pid
|
||||
|
||||
ISCSI_ROOT_KEEP_ALL_SESSIONS_AT_SHUTDOWN=0
|
||||
if [ -f /etc/default/open-iscsi ]; then
|
||||
. /etc/default/open-iscsi
|
||||
fi
|
||||
|
||||
if [ -f /etc/iscsi/iscsi.initramfs ] && [ $ISCSI_ROOT_KEEP_ALL_SESSIONS_AT_SHUTDOWN -eq 1 ]; then
|
||||
# Don't logout from any sessions if root is on initramfs and the
|
||||
# administrator wanted it that way.
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ ! -s $PIDFILE ] || ! kill -0 `sed -n 1p $PIDFILE` >/dev/null 2>/dev/null ; then
|
||||
# Don't logout from iSCSI sessions if daemon isn't running
|
||||
echo "iSCSI initiator daemon not running, not logging out from targets." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
EXCLUDED_SESSIONS=""
|
||||
if [ -f /run/open-iscsi/shutdown-keep-sessions ] ; then
|
||||
_EXCLUDED_SESSIONS=$(cat /run/open-iscsi/shutdown-keep-sessions)
|
||||
for s in ${_EXCLUDED_SESSIONS} ; do
|
||||
EXCLUDED_SESSIONS="${EXCLUDED_SESSIONS:+$EXCLUDED_SESSIONS }${s}"
|
||||
done
|
||||
fi
|
||||
|
||||
# trivial case
|
||||
if [ -z "$EXCLUDED_SESSIONS" ] ; then
|
||||
$ISCSIADM -m node --logoutall=all
|
||||
exit $?
|
||||
fi
|
||||
|
||||
in_set() {
|
||||
eval _set=\$$1
|
||||
case "${_set}" in
|
||||
("$2"|*" $2"|"$2 "*|*" $2 "*) return 0 ;;
|
||||
(*) return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
# go through all iSCSI sessions, but exclude those where we don't want
|
||||
# to logout from
|
||||
RC=0
|
||||
for host_dir in /sys/devices/platform/host* ; do
|
||||
[ -d "$host_dir"/iscsi_host* ] || continue
|
||||
for session_dir in "$host_dir"/session* ; do
|
||||
if in_set EXCLUDED_SESSIONS "$session_dir" ; then
|
||||
continue
|
||||
fi
|
||||
$ISCSIADM -m session -r "$session_dir" --logout
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ] ; then
|
||||
RC=1
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
exit $RC
|
||||
80
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/net-interface-handler
Executable file
80
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/net-interface-handler
Executable file
@ -0,0 +1,80 @@
|
||||
#!/bin/sh -e
|
||||
# suppress configuration of network interface used
|
||||
# by iSCSI root device
|
||||
#
|
||||
# If the root filesystem is on iSCSI, then we must take care to avoid
|
||||
# changing the state of its network interface. To this end, the initramfs
|
||||
# leaves a note for us which interface was used, and we mangle
|
||||
# /run/network/ifstate manually to stop it being brought up or down
|
||||
# automatically. This is a slight layering violation, but, unfortunately,
|
||||
# ifupdown appears to have no way to do this without also running
|
||||
# /etc/network/*.d/ scripts.
|
||||
|
||||
# assert_interface
# udev exports INTERFACE as the name of the NIC currently being
# processed.  Succeed when it is set and non-empty; otherwise print a
# diagnostic on stderr and fail.
assert_interface() {
	if [ -z "$INTERFACE" ]; then
		echo "environment variable INTERFACE not set." 1>&2
		return 1
	fi
	return 0
}
|
||||
|
||||
# start: invoked by udev when interface $INTERFACE appears.
# If this NIC is the one the initramfs used to reach the iSCSI root
# device, pre-mark it "up" in ifupdown's state file so ifup/ifdown never
# reconfigure it, and register the initramfs-provided DNS settings with
# resolvconf.
start() {
	# CR holds a literal newline; used below to join resolv.conf lines.
	CR="
"
	assert_interface || return
	ifile=/run/initramfs/open-iscsi.interface

	# No note from the initramfs => root is not on iSCSI; nothing to do.
	[ -f "$ifile" ] && read iface < "$ifile" || return 0
	# Only act on the specific interface the initramfs recorded.
	[ "$INTERFACE" = "$iface" ] || return

	if ! grep -qs "^$iface=" /run/network/ifstate; then
		# Mangle ifupdown's state file directly so the interface is
		# treated as already configured (layering violation, see the
		# header comment of this script).
		mkdir -p /run/network
		echo "$iface=$iface" >>/run/network/ifstate

		# Pick up the initramfs-written network config (IPv4 file
		# first, then IPv6) to recover the DNS information.
		if [ -f /run/net-$iface.conf ]; then
			conf=/run/net-$iface.conf
		elif [ -f /run/net6-$iface.conf ]; then
			conf=/run/net6-$iface.conf
		else
			conf=""
		fi
		if command -v resolvconf >/dev/null &&
		   [ -n "$conf" ]; then
			. "$conf"
			# Build a resolv.conf fragment from the variables the
			# sourced config file just defined.
			R=""
			[ -n "$DOMAINSEARCH" ] && R="$R${CR}search $DOMAINSEARCH"
			[ -n "$IPV6DOMAINSEARCH" ] && R="$R${CR}search $IPV6DOMAINSEARCH"
			for ns in "$IPV4DNS0" "$IPV4DNS1" "$IPV6DNS0" "$IPV6DNS1"; do
				[ -n "$ns" -a "$ns" != "0.0.0.0" ] && R="$R${CR}nameserver $ns"
			done
			if [ -n "$R" ]; then
				# create the dir in case resolvconf did not start yet
				mkdir -p /run/resolvconf/interface
				resolvconf -a $iface.iscsi-network <<EOF
${R#${CR}}
EOF
			fi
		fi
	fi
}
|
||||
|
||||
# stop: invoked by udev when interface $INTERFACE disappears.
# Undo what start() did: drop the interface's entry from ifupdown's
# state file and withdraw its resolvconf record.
stop() {
	assert_interface || return
	ifile=/run/initramfs/open-iscsi.interface
	[ -f "$ifile" ] && read iface < "$ifile" || return 0
	[ "$INTERFACE" = "$iface" ] || return

	# Nothing to undo unless start() marked this interface configured.
	grep -qs "^$iface=" /run/network/ifstate || return 0

	# Rewrite the state file without this interface's entry.
	grep -v "^$iface=" /run/network/ifstate >/run/network/.ifstate.tmp || true
	mv /run/network/.ifstate.tmp /run/network/ifstate

	if command -v resolvconf >/dev/null; then
		resolvconf -d $iface.iscsi-network
	fi
}
|
||||
|
||||
# Dispatch on the single mode argument supplied by the udev rule.
if [ "$1" = "start" ]; then
	start
elif [ "$1" = "stop" ]; then
	stop
else
	echo "ERROR: must be called with 'start' or 'stop'" >&2
	exit 1
fi
|
||||
59
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/startup-checks.sh
Executable file
59
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/startup-checks.sh
Executable file
@ -0,0 +1,59 @@
|
||||
#!/bin/sh
#
# This script does the required startup checks before the iSCSI
# daemon should be started.  It also generates a unique InitiatorName
# if that hadn't been done before.
#
# Exits non-zero (so iscsid will not be started) when the config file
# or name file is missing, or when name generation fails.

PATH=/usr/sbin:/sbin:/usr/bin:/bin

NAMEFILE=/etc/iscsi/initiatorname.iscsi
CONFIGFILE=/etc/iscsi/iscsid.conf

# Refuse to start without the daemon configuration file.
if [ ! -e "$CONFIGFILE" ]; then
	echo >&2
	echo "Error: configuration file $CONFIGFILE is missing!" >&2
	echo "The iSCSI driver has not been correctly installed and cannot start." >&2
	echo >&2
	exit 1
fi

# Refuse to start without the InitiatorName file.
if [ ! -f "$NAMEFILE" ]; then
	echo >&2
	echo "Error: InitiatorName file $NAMEFILE is missing!" >&2
	echo "The iSCSI driver has not been correctly installed and cannot start." >&2
	echo >&2
	exit 1
fi

# see if we need to generate a unique iSCSI InitiatorName
if grep -q "^GenerateName=yes" "$NAMEFILE" ; then
	if [ ! -x /usr/sbin/iscsi-iname ] ; then
		echo "Error: /usr/sbin/iscsi-iname does not exist, driver was not successfully installed" >&2
		exit 1
	fi
	# Generate a unique InitiatorName and save it.
	# BUGFIX: invoke the same path that the -x test above checked
	# (the test used /usr/sbin while the call used /sbin), so the
	# check actually guards the invocation on systems without a
	# merged /usr.
	INAME=`/usr/sbin/iscsi-iname -p iqn.2004-10.com.ubuntu:01`
	if [ "$INAME" != "" ] ; then
		echo "## DO NOT EDIT OR REMOVE THIS FILE!" > "$NAMEFILE"
		echo "## If you remove this file, the iSCSI daemon will not start." >> "$NAMEFILE"
		echo "## If you change the InitiatorName, existing access control lists" >> "$NAMEFILE"
		echo "## may reject this initiator. The InitiatorName must be unique">> "$NAMEFILE"
		echo "## for each iSCSI initiator. Do NOT duplicate iSCSI InitiatorNames." >> "$NAMEFILE"
		# BUGFIX: pass the generated name as a printf argument, not
		# inside the format string (a '%' in $INAME would otherwise
		# be misinterpreted as a conversion specifier).
		printf 'InitiatorName=%s\n' "$INAME" >> "$NAMEFILE"
		chmod 600 "$NAMEFILE"
	else
		echo "Error: failed to generate an iSCSI InitiatorName, driver cannot start." >&2
		echo >&2
		exit 1
	fi
fi

# make sure there is a valid InitiatorName for the driver
if ! grep -q "^InitiatorName=[^ \t\n]" "$NAMEFILE" ; then
	echo >&2
	echo "Error: $NAMEFILE does not contain a valid InitiatorName." >&2
	echo "The iSCSI driver has not been correctly installed and cannot start." >&2
	echo >&2
	exit 1
fi
|
||||
673
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/umountiscsi.sh
Executable file
673
overlays/rpi4-armbian-longhorn-root/usr/lib/open-iscsi/umountiscsi.sh
Executable file
@ -0,0 +1,673 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# This script umounts mounted iSCSI devices on shutdown, if possible.
|
||||
# It is supposed to catch most use cases but is not designed to work
|
||||
# for every corner-case. It handles LVM and multipath, but only if
|
||||
# one of the following stackings is used:
|
||||
# LVM -> multipath -> iSCSI
|
||||
# multipath -> iSCSI
|
||||
# LVM -> iSCSI
|
||||
# LVM -> LUKS -> multipath -> iSCSI
|
||||
# LVM -> LUKS -> iSCSI
|
||||
# LUKS -> LVM -> multipath -> iSCSI
|
||||
# LUKS -> multipath -> iSCSI
|
||||
# LUKS -> LVM -> iSCSI
|
||||
# LUKS -> iSCSI
|
||||
# It does not try to umount anything belonging to any device that is
|
||||
# also used as a backing store for the root filesystem. Any iSCSI
|
||||
# device part of the backing store of the root filesystem will be noted
|
||||
# in /run/open-iscsi/shutdown-keep-sessions, so that the session will not be
|
||||
# closed on shutdown.
|
||||
#
|
||||
# KNOWN ISSUES:
|
||||
# - It doesn't handle submounts properly in all corner cases.
|
||||
# Specifically, it doesn't handle a non-iSCSI mount below an
|
||||
# iSCSI mount if it isn't also marked _netdev in /etc/fstab.
|
||||
# - It does not handle other things device mapper can do, such as
|
||||
# RAID, crypto, manual mappings of parts of disks, etc.
|
||||
# - It doesn't try to kill programs still accessing those mounts,
|
||||
# umount will just fail then.
|
||||
# - It doesn't handle more complicated stackings such as overlayfs,
|
||||
# FUSE filesystems, loop devices, etc.
|
||||
# - It doesn't handle swap.
|
||||
#
|
||||
# LONG TERM GOAL:
|
||||
# - In the long term, there should be a solution where for each part
|
||||
# of the stacking (device mapper, LVM, overlayfs, etc.) explicit
|
||||
# dependencies are declared with the init system such that it can
|
||||
# be automatically dismantled. That would make this script
|
||||
# superfluous and also not be a layering violation, as it
|
||||
# currently is.
|
||||
#
|
||||
# Author: Christian Seiler <christian@iwakd.de>
|
||||
#
|
||||
|
||||
PATH=/usr/sbin:/sbin:/usr/bin:/bin
|
||||
|
||||
EXCLUDE_MOUNTS_AT_SHUTDOWN=""
|
||||
if [ -f /etc/default/open-iscsi ]; then
|
||||
. /etc/default/open-iscsi
|
||||
fi
|
||||
|
||||
MULTIPATH=/sbin/multipath
|
||||
PVS=/sbin/pvs
|
||||
LVS=/sbin/lvs
|
||||
VGS=/sbin/vgs
|
||||
VGCHANGE=/sbin/vgchange
|
||||
CRYPTSETUP=/sbin/cryptsetup
|
||||
DMSETUP=/sbin/dmsetup
|
||||
|
||||
if [ -x $PVS ] && [ -x $LVS ] && [ -x $VGCHANGE ] ; then
|
||||
HAVE_LVM=1
|
||||
else
|
||||
HAVE_LVM=0
|
||||
fi
|
||||
if [ -x $CRYPTSETUP ] && [ -x $DMSETUP ] ; then
|
||||
HAVE_LUKS=1
|
||||
else
|
||||
HAVE_LUKS=0
|
||||
fi
|
||||
|
||||
DRY_RUN=0
|
||||
|
||||
# We need to make sure that we don't try to umount the root device
|
||||
# and for systemd systems, also /usr (which is pre-mounted in initrd
|
||||
# there).
|
||||
EXCLUDE_MOUNTS="/"
|
||||
if [ -d /run/systemd/system ] ; then
|
||||
EXCLUDE_MOUNTS="$EXCLUDE_MOUNTS /usr"
|
||||
fi
|
||||
EXCLUDE_MOUNTS="${EXCLUDE_MOUNTS}${EXCLUDE_MOUNTS_AT_SHUTDOWN+ $EXCLUDE_MOUNTS_AT_SHUTDOWN}"
|
||||
unset _EXCLUDE_MOUNTS
|
||||
|
||||
error_usage() {
|
||||
echo "Usage: $0 [--dry-run | --timeout secs]" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
timeout=0
|
||||
|
||||
if [ $# -gt 2 ] ; then
|
||||
error_usage
|
||||
fi
|
||||
|
||||
if [ $# -eq 2 ] ; then
|
||||
if [ x"$1"x != x"--timeout"x ] ; then
|
||||
error_usage
|
||||
fi
|
||||
case "$2" in
|
||||
(-1) timeout="$2" ;;
|
||||
(*[!0-9]*|"") error_usage ;;
|
||||
(*) timeout="$2" ;;
|
||||
esac
|
||||
elif [ $# -eq 1 ] ; then
|
||||
if [ x"$1"x != x"--dry-run"x ] ; then
|
||||
error_usage
|
||||
fi
|
||||
DRY_RUN=1
|
||||
fi
|
||||
|
||||
# poor man's hash implementation using shell variables
|
||||
# hash_keys HASH
# Print (one per line) every key currently stored in HASH.  Keys were
# sanitized by hash_set, so each entry lives in a shell variable named
# "<hash>_<sanitized-key>"; we scan `set` output for that prefix.
hash_keys() {
	_hash_keys_hash_key_prefix="${1}_"
	# Subshell confines the IFS change (and the pipeline) to this call.
	(
	IFS='='
	# `set` prints var=value lines; splitting on '=' leaves the
	# variable name in $var.  The strip-and-compare below is a
	# portable "does $var start with the prefix?" test.
	set | while read var value ; do
		if [ x"${var#$_hash_keys_hash_key_prefix}"x != x"${var}"x ] ; then
			printf '%s\n' "${var#$_hash_keys_hash_key_prefix}"
		fi
	done
	)
}
|
||||
|
||||
|
||||
hash_clear() {
|
||||
for k in $(hash_keys "$1") ; do
|
||||
unset "${1}_${k}"
|
||||
done
|
||||
}
|
||||
|
||||
# hash_get RESULTVAR HASH KEY
# Fetch the value stored under KEY in HASH into the variable named
# RESULTVAR; an unset key yields the empty string.
hash_get() {
	# sanitize the key exactly like hash_set does, so lookups match
	_hash_get_var="$2_$(printf '%s' "$3" | sed 's%[^A-Za-z0-9_]%_%g')"
	eval _hash_get_value="\${${_hash_get_var}:-}"
	eval "$1=\${_hash_get_value}"
}
|
||||
|
||||
# hash_set HASH KEY VALUE
# Store VALUE under KEY in HASH.  The key is mapped onto identifier
# characters ([A-Za-z0-9_]) so the pair can live in a plain shell
# variable named "HASH_KEY".
hash_set() {
	_hash_set_var="$1_$(printf '%s' "$2" | sed 's%[^A-Za-z0-9_]%_%g')"
	eval ${_hash_set_var}=\${3}
}
|
||||
|
||||
hash_unset() {
|
||||
_hash_set_var="$1_$(printf '%s' "$2" | sed 's%[^A-Za-z0-9_]%_%g')"
|
||||
unset ${_hash_set_var}
|
||||
}
|
||||
|
||||
# in_set SETVAR VALUE
# Return 0 when VALUE is a member of the space-separated set stored in
# the variable named SETVAR, 1 otherwise.
in_set() {
	eval _set=\$$1
	for _in_set_member in ${_set} ; do
		if [ "${_in_set_member}" = "$2" ] ; then
			return 0
		fi
	done
	return 1
}
|
||||
|
||||
# _add_to_set SETVAR VALUE
# Append VALUE to the space-separated set in the variable named SETVAR,
# unless it is already a member (sets never hold duplicates).
_add_to_set() {
	eval _set=\$$1
	case "${_set}" in
		("")
			# empty set: the value becomes the only member
			_set="$2"
			;;
		("$2"|*" $2"|"$2 "*|*" $2 "*)
			# value already present: leave the set untouched
			;;
		(*)
			_set="${_set} $2"
			;;
	esac
	eval $1=\${_set}
}
|
||||
|
||||
# add_to_set SETVAR VALUE...
# Add every VALUE argument to the set stored in SETVAR, skipping any
# that are already members.
add_to_set() {
	_add_to_set_set="$1"
	shift
	while [ $# -gt 0 ] ; do
		_add_to_set "${_add_to_set_set}" "$1"
		shift
	done
}
|
||||
|
||||
hash_add_to_set() {
|
||||
_hash_add_to_set_var="$1_$(printf '%s' "$2" | sed 's%[^A-Za-z0-9_]%_%g')"
|
||||
shift
|
||||
shift
|
||||
add_to_set "${_hash_add_to_set_var}" "$@"
|
||||
}
|
||||
|
||||
# device_majmin RESULTVAR DEVNAME
# Resolve /dev/DEVNAME to its "major:minor" device-number string and
# store it in the variable named RESULTVAR ("" when it cannot be
# determined, e.g. DEVNAME is not a device node).
# NOTE(review): this parses `ls -lnd` output — for a device node the
# column where a file size would be holds "major," followed by the
# minor — presumably chosen over stat(1) for portability; confirm
# before changing.
device_majmin() {
	eval $1=\"\"
	_majmin_dec=$(LC_ALL=C ls -lnd /dev/"$2" | while read _perms _links _uid _gid _majcomma _min _rest ; do
		# only device nodes carry the trailing comma after the major
		if [ x"${_majcomma%,}"x != x"${_majcomma}"x ] ; then
			printf '%s' ${_majcomma%,}:${_min}
		fi
		break
	done)
	[ -n "${_majmin_dec}" ] || return
	eval $1=\${_majmin_dec}
}
|
||||
|
||||
get_lvm_vgs() {
|
||||
# handle the case where we didn't get passed any PVs
|
||||
# at all
|
||||
[ $# -gt 0 ] || return 0
|
||||
# subshell for pwd change
|
||||
(
|
||||
cd /dev
|
||||
$PVS --noheadings -o vg_name "$@" 2>/dev/null
|
||||
)
|
||||
}
|
||||
|
||||
enumerate_luks() {
|
||||
hash_clear LUKS_DEVICES_REVERSE_MAP
|
||||
|
||||
_all_crypt_devices=$($DMSETUP info --noheadings -o name -c -S subsystem=CRYPT 2>/dev/null || :)
|
||||
for _crypt_device in ${_all_crypt_devices} ; do
|
||||
[ -b "/dev/mapper/${_crypt_device}" ] || continue
|
||||
_crypt_device="$(readlink -fe "/dev/mapper/${_crypt_device}" 2>/dev/null || :)"
|
||||
_crypt_device="${_crypt_device#/dev/}"
|
||||
[ -b "/dev/${_crypt_device}" ] || continue
|
||||
# dmsetup deps is weird, it outputs the following:
|
||||
# 1 dependencies : (XYZ)
|
||||
_dep=$($DMSETUP deps -o blkdevname "/dev/${_crypt_device}" | sed -n '1s%.*: (\(.*\)).*%\1%p')
|
||||
if [ -n "$_dep" ] && [ -b "/dev/${_dep}" ] ; then
|
||||
_dep="$(readlink -fe "/dev/$_dep" 2>/dev/null || :)"
|
||||
_dep="${_dep#/dev/}"
|
||||
fi
|
||||
if [ -n "$_dep" ] && [ -b "/dev/${_dep}" ] ; then
|
||||
hash_set LUKS_DEVICES_REVERSE_MAP "${_dep}" "${_crypt_device}"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
enumerate_iscsi_devices() {
|
||||
# Empty arrays
|
||||
iscsi_disks=""
|
||||
iscsi_partitions=""
|
||||
iscsi_multipath_disks=""
|
||||
iscsi_multipath_disk_aliases=""
|
||||
iscsi_multipath_partitions=""
|
||||
iscsi_lvm_vgs=""
|
||||
iscsi_lvm_lvs=""
|
||||
iscsi_potential_mount_sources=""
|
||||
iscsi_luks_pass1=""
|
||||
iscsi_luks_pass2=""
|
||||
|
||||
hash_clear ISCSI_DEVICE_SESSIONS
|
||||
hash_clear ISCSI_MPALIAS_SESSIONS
|
||||
hash_clear ISCSI_LVMVG_SESSIONS
|
||||
hash_clear ISCSI_NUMDEVICE_SESSIONS
|
||||
ISCSI_EXCLUDED_SESSIONS=""
|
||||
|
||||
# We first need to generate a global reverse mapping of all
|
||||
# cryptsetup (e.g. LUKS) devices, because there's no easy way
|
||||
# to query "is this the encrypted backing of an active crypto
|
||||
# mapping?
|
||||
enumerate_luks
|
||||
|
||||
# Look for all iscsi disks
|
||||
for _host_dir in /sys/devices/platform/host* /sys/devices/pci*/*/*/host* ; do
|
||||
if ! [ -d "$_host_dir"/iscsi_host* ] || ! [ -d "$_host_dir"/iscsi_host/host* ] ; then
|
||||
continue
|
||||
fi
|
||||
for _session_dir in "$_host_dir"/session* ; do
|
||||
[ -d "$_session_dir"/target* ] || continue
|
||||
for _block_dev_dir in "$_session_dir"/target*/*\:*/block/* ; do
|
||||
_block_dev=${_block_dev_dir##*/}
|
||||
[ x"${_block_dev}"x != x"*"x ] || continue
|
||||
add_to_set iscsi_disks "${_block_dev}"
|
||||
hash_add_to_set ISCSI_DEVICE_SESSIONS "${_block_dev}" ${_session_dir}
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
# Look for all partitions on those disks
|
||||
for _disk in $iscsi_disks ; do
|
||||
hash_get _disk_sessions ISCSI_DEVICE_SESSIONS "${_disk}"
|
||||
for _part_dir in /sys/class/block/"${_disk}"/"${_disk}"?* ; do
|
||||
_part="${_part_dir##*/}"
|
||||
[ x"${_part}"x != x"${_disk}?*"x ] || continue
|
||||
add_to_set iscsi_partitions "${_part}"
|
||||
hash_set ISCSI_DEVICE_SESSIONS "${_part}" "${_disk_sessions}"
|
||||
done
|
||||
done
|
||||
|
||||
if [ -x $MULTIPATH ] ; then
|
||||
# Look for all multipath disks
|
||||
for _disk in $iscsi_disks ; do
|
||||
hash_get _disk_sessions ISCSI_DEVICE_SESSIONS "${_disk}"
|
||||
for _alias in $($MULTIPATH -v1 -l /dev/"$_disk") ; do
|
||||
_mp_dev="$(readlink -fe "/dev/mapper/${_alias}" || :)"
|
||||
[ -n "${_mp_dev}" ] || continue
|
||||
add_to_set iscsi_multipath_disks "${_mp_dev#/dev/}"
|
||||
add_to_set iscsi_multipath_disk_aliases "${_alias}"
|
||||
hash_add_to_set ISCSI_DEVICE_SESSIONS "${_mp_dev#/dev/}" ${_disk_sessions}
|
||||
hash_add_to_set ISCSI_MPALIAS_SESSIONS "${_alias}" ${_disk_sessions}
|
||||
done
|
||||
done
|
||||
|
||||
# Look for partitions on these multipath disks
|
||||
for _alias in $iscsi_multipath_disk_aliases ; do
|
||||
hash_get _mp_sessions ISCSI_MPALIAS_SESSIONS "${_alias}"
|
||||
for _part_name in /dev/mapper/"${_alias}"-part* ; do
|
||||
_part="$(readlink -fe "$_part_name" 2>/dev/null || :)"
|
||||
[ -n "${_part}" ] || continue
|
||||
add_to_set iscsi_multipath_partitions "${_part#/dev/}"
|
||||
hash_set ISCSI_DEVICE_SESSIONS "${_part#/dev/}" "${_mp_sessions}"
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
if [ $HAVE_LUKS -eq 1 ] ; then
|
||||
# Look for all LUKS devices.
|
||||
for _dev in $iscsi_disks $iscsi_partitions $iscsi_multipath_disks $iscsi_multipath_partitions ; do
|
||||
hash_get _luksDev LUKS_DEVICES_REVERSE_MAP "${_dev}"
|
||||
[ -n "${_luksDev}" ] || continue
|
||||
add_to_set iscsi_luks_pass1 "${_luksDev}"
|
||||
hash_get _currentSession ISCSI_DEVICE_SESSIONS "${_dev}"
|
||||
if [ -n "${_currentSession}" ] ; then
|
||||
hash_set ISCSI_DEVICE_SESSIONS "${_luksDev}" "${_currentSession}"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
if [ $HAVE_LVM -eq 1 ] ; then
|
||||
# Look for all LVM volume groups that have a backing store
|
||||
# on any iSCSI device we found. Also, add $LVMGROUPS set in
|
||||
# /etc/default/open-iscsi (for more complicated stacking
|
||||
# configurations we don't automatically detect).
|
||||
for _vg in $(get_lvm_vgs $iscsi_disks $iscsi_partitions $iscsi_multipath_disks $iscsi_multipath_partitions $iscsi_luks_pass1) $LVMGROUPS ; do
|
||||
add_to_set iscsi_lvm_vgs "$_vg"
|
||||
done
|
||||
|
||||
# $iscsi_lvm_vgs is now unique list
|
||||
for _vg in $iscsi_lvm_vgs ; do
|
||||
# get PVs to track iSCSI sessions
|
||||
for _pv in $($VGS --noheadings -o pv_name "$_vg" 2>/dev/null) ; do
|
||||
_pv_dev="$(readlink -fe "$_pv" 2>/dev/null || :)"
|
||||
[ -n "${_pv_dev}" ] || continue
|
||||
hash_get _pv_sessions ISCSI_DEVICE_SESSIONS "${_pv_dev#/dev/}"
|
||||
hash_add_to_set ISCSI_LVMVG_SESSIONS "${_vg}" ${_pv_sessions}
|
||||
done
|
||||
|
||||
# now we collected all sessions belonging to this VG
|
||||
hash_get _vg_sessions ISCSI_LVMVG_SESSIONS "${_vg}"
|
||||
|
||||
# find all LVs
|
||||
for _lv in $($VGS --noheadings -o lv_name "$_vg" 2>/dev/null) ; do
|
||||
_dev="$(readlink -fe "/dev/${_vg}/${_lv}" 2>/dev/null || :)"
|
||||
[ -n "${_dev}" ] || continue
|
||||
iscsi_lvm_lvs="$iscsi_lvm_lvs ${_dev#/dev/}"
|
||||
hash_set ISCSI_DEVICE_SESSIONS "${_dev#/dev/}" "${_vg_sessions}"
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
if [ $HAVE_LUKS -eq 1 ] ; then
|
||||
# Look for all LUKS devices.
|
||||
for _dev in $iscsi_lvm_lvs ; do
|
||||
hash_get _luksDev LUKS_DEVICES_REVERSE_MAP "${_dev}"
|
||||
[ -n "${_luksDev}" ] || continue
|
||||
add_to_set iscsi_luks_pass2 "${_luksDev}"
|
||||
hash_get _currentSession ISCSI_DEVICE_SESSIONS "${_dev}"
|
||||
if [ -n "${_currentSession}" ] ; then
|
||||
hash_set ISCSI_DEVICE_SESSIONS "${_luksDev}" "${_currentSession}"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# Gather together all mount sources
|
||||
iscsi_potential_mount_sources="$iscsi_potential_mount_sources $iscsi_disks $iscsi_partitions"
|
||||
iscsi_potential_mount_sources="$iscsi_potential_mount_sources $iscsi_multipath_disks $iscsi_multipath_partitions"
|
||||
iscsi_potential_mount_sources="$iscsi_potential_mount_sources $iscsi_lvm_lvs"
|
||||
iscsi_potential_mount_sources="$iscsi_potential_mount_sources $iscsi_luks_pass1 $iscsi_luks_pass2"
|
||||
|
||||
# Convert them to numerical representation
|
||||
iscsi_potential_mount_sources_majmin=""
|
||||
for _src in $iscsi_potential_mount_sources ; do
|
||||
device_majmin _src_majmin "$_src"
|
||||
[ -n "$_src_majmin" ] || continue
|
||||
iscsi_potential_mount_sources_majmin="${iscsi_potential_mount_sources_majmin} ${_src_majmin}"
|
||||
hash_get _dev_sessions ISCSI_DEVICE_SESSIONS "${_src}"
|
||||
hash_set ISCSI_NUMDEVICE_SESSIONS "${_src_majmin}" "${_dev_sessions}"
|
||||
done
|
||||
|
||||
# Enumerate mount points
|
||||
iscsi_mount_points=""
|
||||
iscsi_mount_point_ids=""
|
||||
while read _mpid _mppid _mpdev _mpdevpath _mppath _mpopts _other ; do
|
||||
if in_set iscsi_potential_mount_sources_majmin "$_mpdev" ; then
|
||||
if in_set EXCLUDE_MOUNTS "${_mppath}" ; then
|
||||
hash_get _dev_sessions ISCSI_NUMDEVICE_SESSIONS "${_mpdev}"
|
||||
add_to_set ISCSI_EXCLUDED_SESSIONS $_dev_sessions
|
||||
continue
|
||||
fi
|
||||
# list mountpoints in reverse order (in case
|
||||
# some are stacked) mount --move may cause the
|
||||
# order of /proc/self/mountinfo to not always
|
||||
# reflect the stacking order, so this is not
|
||||
# fool-proof, but it's better than nothing
|
||||
iscsi_mount_points="$_mppath $iscsi_mount_points"
|
||||
iscsi_mount_point_ids="$_mpid $iscsi_mount_points"
|
||||
fi
|
||||
done < /proc/self/mountinfo
|
||||
}
|
||||
|
||||
try_umount() {
|
||||
# in order to handle stacking try twice; together with the fact
|
||||
# that the list of mount points is in reverse order of the
|
||||
# contents /proc/self/mountinfo this should catch most cases
|
||||
for retry in 1 2 ; do
|
||||
for path in $iscsi_mount_points ; do
|
||||
# first try to see if it really is a mountpoint
|
||||
# still (might be the second round this is done
|
||||
# and the mount is already gone, or something
|
||||
# else umounted it first)
|
||||
if ! fstab-decode mountpoint -q "$path" ; then
|
||||
continue
|
||||
fi
|
||||
|
||||
# try to umount it
|
||||
if ! fstab-decode umount "$path" ; then
|
||||
# unfortunately, umount's exit code
|
||||
# may be a false negative, i.e. it
|
||||
# might give a failure exit code, even
|
||||
# though it succeeded, so check again
|
||||
if fstab-decode mountpoint -q "$path" ; then
|
||||
echo "Could not unmount $path" >&2
|
||||
any_umount_failed=1
|
||||
fi
|
||||
fi
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
try_deactivate_lvm() {
|
||||
[ $HAVE_LVM -eq 1 ] || return
|
||||
|
||||
for vg in $iscsi_lvm_vgs ; do
|
||||
vg_excluded=0
|
||||
hash_get vg_sessions ISCSI_LVMVG_SESSIONS "$vg"
|
||||
for vg_session in $vg_sessions ; do
|
||||
if in_set ISCSI_EXCLUDED_SESSIONS "$vg_session" ; then
|
||||
vg_excluded=1
|
||||
fi
|
||||
done
|
||||
if [ $vg_excluded -eq 1 ] ; then
|
||||
# volume group on same iSCSI session as excluded
|
||||
# mount, don't disable it
|
||||
# (FIXME: we should only exclude VGs that contain
|
||||
# those mounts, not also those that happen to be
|
||||
# in the same iSCSI session)
|
||||
continue
|
||||
fi
|
||||
if ! $VGCHANGE --available=n $vg ; then
|
||||
# Make sure the volume group (still) exists. If
|
||||
# it doesn't we count that as deactivated, so
|
||||
# don't fail then.
|
||||
_vg_test=$(vgs -o vg_name --noheadings $vg 2>/dev/null || :)
|
||||
if [ -n "${_vg_test}" ] ; then
|
||||
echo "Cannot deactivate Volume Group $vg" >&2
|
||||
any_umount_failed=1
|
||||
fi
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
try_dismantle_multipath() {
|
||||
[ -x $MULTIPATH ] || return
|
||||
|
||||
for mpalias in $iscsi_multipath_disk_aliases ; do
|
||||
mp_excluded=0
|
||||
hash_get mp_sessions ISCSI_MPALIAS_SESSIONS "$mpalias"
|
||||
for mp_session in $mp_sessions ; do
|
||||
if in_set ISCSI_EXCLUDED_SESSIONS "$mp_session" ; then
|
||||
mp_excluded=1
|
||||
fi
|
||||
done
|
||||
if [ $mp_excluded -eq 1 ] ; then
|
||||
# multipath device on same iSCSI session as
|
||||
# excluded mount, don't disable it
|
||||
# (FIXME: we should only exclude multipath mounts
|
||||
# that contain those mounts, not also those that
|
||||
# happen to be in the same iSCSI session)
|
||||
continue
|
||||
fi
|
||||
if ! $MULTIPATH -f $mpalias ; then
|
||||
echo "Cannot dismantle Multipath Device $mpalias" >&2
|
||||
any_umount_failed=1
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
try_dismantle_luks() {
|
||||
[ $HAVE_LUKS -eq 1 ] || return
|
||||
case "$1" in
|
||||
1) iscsi_luks_current_pass="$iscsi_luks_pass1" ;;
|
||||
2|*) iscsi_luks_current_pass="$iscsi_luks_pass2" ;;
|
||||
esac
|
||||
|
||||
for luksDev in $iscsi_luks_current_pass ; do
|
||||
luks_excluded=0
|
||||
hash_get device_sessions ISCSI_DEVICE_SESSIONS "$luksDev"
|
||||
for device_session in $device_sessions ; do
|
||||
if in_set ISCSI_EXCLUDED_SESSIONS "$device_session" ; then
|
||||
luks_excluded=1
|
||||
fi
|
||||
done
|
||||
if [ $luks_excluded -eq 1 ] ; then
|
||||
continue
|
||||
fi
|
||||
_luksName="$($DMSETUP info -c --noheadings -o name /dev/"$luksDev" 2>/dev/null || :)"
|
||||
[ -n "${_luksName}" ] || continue
|
||||
if ! $CRYPTSETUP close "${_luksName}" ; then
|
||||
echo "Cannot dismantle cryptsetup device ${_luksName}" >&2
|
||||
any_umount_failed=1
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# Don't do this if we are using systemd as init system, since systemd
# takes care of network filesystems (including those marked _netdev) by
# itself.
# BUGFIX: default HANDLE_NETDEV to 0 in the expansion — it is only set
# if /etc/default/open-iscsi exists and defines it; an unset, unquoted
# $HANDLE_NETDEV would make this test a shell error ("[ -eq 1 ]").
if ! [ -d /run/systemd/system ] && [ "${HANDLE_NETDEV:-0}" -eq 1 ] && [ $DRY_RUN -eq 0 ]; then
	echo "Unmounting all devices marked _netdev";
	umount -a -O _netdev >/dev/null 2>&1
fi
|
||||
|
||||
enumerate_iscsi_devices
|
||||
|
||||
# Dry run? Just print what we want to do (useful for administrator to check).
|
||||
if [ $DRY_RUN -eq 1 ] ; then
|
||||
echo "$0: would umount the following mount points:"
|
||||
had_mount=0
|
||||
if [ -n "$iscsi_mount_points" ] ; then
|
||||
for v in $iscsi_mount_points ; do
|
||||
echo " $v"
|
||||
had_mount=1
|
||||
done
|
||||
fi
|
||||
[ $had_mount -eq 1 ] || echo " (none)"
|
||||
|
||||
echo "$0: would disable the following LUKS devices (second pass):"
|
||||
had_luks=0
|
||||
if [ -n "$iscsi_luks_pass2" ] ; then
|
||||
for v in ${iscsi_luks_pass2} ; do
|
||||
luks_excluded=0
|
||||
hash_get device_sessions ISCSI_DEVICE_SESSIONS "$v"
|
||||
for device_session in $device_sessions ; do
|
||||
if in_set ISCSI_EXCLUDED_SESSIONS "$device_session" ; then
|
||||
luks_excluded=1
|
||||
fi
|
||||
done
|
||||
if [ $luks_excluded -eq 1 ] ; then
|
||||
continue
|
||||
fi
|
||||
_luksName="$($DMSETUP info -c --noheadings -o name /dev/"$v" 2>/dev/null || :)"
|
||||
[ -n "${_luksName}" ] || continue
|
||||
echo " ${_luksName}"
|
||||
had_luks=1
|
||||
done
|
||||
fi
|
||||
[ $had_luks -eq 1 ] || echo " (none)"
|
||||
|
||||
echo "$0: would deactivate the following LVM Volume Groups:"
|
||||
had_vg=0
|
||||
if [ -n "$iscsi_lvm_vgs" ] ; then
|
||||
for v in $iscsi_lvm_vgs ; do
|
||||
# sync this exclusion logic with try_deactivate_lvm
|
||||
vg_excluded=0
|
||||
hash_get vg_sessions ISCSI_LVMVG_SESSIONS "$v"
|
||||
for vg_session in $vg_sessions ; do
|
||||
if in_set ISCSI_EXCLUDED_SESSIONS "$vg_session" ; then
|
||||
vg_excluded=1
|
||||
fi
|
||||
done
|
||||
if [ $vg_excluded -eq 1 ] ; then
|
||||
continue
|
||||
fi
|
||||
echo " $v"
|
||||
had_vg=1
|
||||
done
|
||||
fi
|
||||
[ $had_vg -eq 1 ] || echo " (none)"
|
||||
|
||||
echo "$0: would disable the following LUKS devices (first pass):"
|
||||
had_luks=0
|
||||
if [ -n "$iscsi_luks_pass1" ] ; then
|
||||
for v in ${iscsi_luks_pass1} ; do
|
||||
luks_excluded=0
|
||||
hash_get device_sessions ISCSI_DEVICE_SESSIONS "$v"
|
||||
for device_session in $device_sessions ; do
|
||||
if in_set ISCSI_EXCLUDED_SESSIONS "$device_session" ; then
|
||||
luks_excluded=1
|
||||
fi
|
||||
done
|
||||
if [ $luks_excluded -eq 1 ] ; then
|
||||
continue
|
||||
fi
|
||||
_luksName="$($DMSETUP info -c --noheadings -o name /dev/"$v" 2>/dev/null || :)"
|
||||
[ -n "${_luksName}" ] || continue
|
||||
echo " ${_luksName}"
|
||||
had_luks=1
|
||||
done
|
||||
fi
|
||||
[ $had_luks -eq 1 ] || echo " (none)"
|
||||
|
||||
echo "$0: would deactivate the following multipath volumes:"
|
||||
had_mp=0
|
||||
if [ -n "$iscsi_multipath_disk_aliases" ] ; then
|
||||
for v in $iscsi_multipath_disk_aliases ; do
|
||||
# sync this exclusion logic with try_dismantle_multipath
|
||||
mp_excluded=0
|
||||
hash_get mp_sessions ISCSI_MPALIAS_SESSIONS "$v"
|
||||
for mp_session in $mp_sessions ; do
|
||||
if in_set ISCSI_EXCLUDED_SESSIONS "$mp_session" ; then
|
||||
mp_excluded=1
|
||||
fi
|
||||
done
|
||||
if [ $mp_excluded -eq 1 ] ; then
|
||||
continue
|
||||
fi
|
||||
echo " $v"
|
||||
had_mp=1
|
||||
done
|
||||
fi
|
||||
[ $had_mp -eq 1 ] || echo " (none)"
|
||||
|
||||
if [ -n "$ISCSI_EXCLUDED_SESSIONS" ] ; then
|
||||
echo "$0: the following sessions are excluded from disconnection (because / or another excluded mount is on them):"
|
||||
for v in $ISCSI_EXCLUDED_SESSIONS ; do
|
||||
echo " $v"
|
||||
done
|
||||
fi
|
||||
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# after our first enumeration, write out a list of sessions that
|
||||
# shouldn't be terminated because excluded mounts are on those
|
||||
# sessions
|
||||
if [ -n "$ISCSI_EXCLUDED_SESSIONS" ] ; then
|
||||
mkdir -p -m 0700 /run/open-iscsi
|
||||
for session in $ISCSI_EXCLUDED_SESSIONS ; do
|
||||
printf '%s\n' $session
|
||||
done > /run/open-iscsi/shutdown-keep-sessions
|
||||
else
|
||||
# make sure there's no leftover from a previous call
|
||||
rm -f /run/open-iscsi/shutdown-keep-sessions
|
||||
fi
|
||||
|
||||
any_umount_failed=0
|
||||
try_umount
|
||||
try_dismantle_luks 2
|
||||
try_deactivate_lvm
|
||||
try_dismantle_luks 1
|
||||
try_dismantle_multipath
|
||||
|
||||
while [ $any_umount_failed -ne 0 ] && ( [ $timeout -gt 0 ] || [ $timeout -eq -1 ] ) ; do
|
||||
# wait a bit, perhaps there was still a program that
|
||||
# was terminating
|
||||
sleep 1
|
||||
|
||||
# try again and decrease timeout
|
||||
enumerate_iscsi_devices
|
||||
any_umount_failed=0
|
||||
try_umount
|
||||
try_dismantle_luks 2
|
||||
try_deactivate_lvm
|
||||
try_dismantle_luks 1
|
||||
try_dismantle_multipath
|
||||
if [ $timeout -gt 0 ] ; then
|
||||
timeout=$((timeout - 1))
|
||||
fi
|
||||
done
|
||||
|
||||
# Create signaling file (might be useful)
|
||||
if [ $any_umount_failed -eq 1 ] ; then
|
||||
touch /run/open-iscsi/some_umount_failed
|
||||
else
|
||||
rm -f /run/open-iscsi/some_umount_failed
|
||||
fi
|
||||
exit $any_umount_failed
|
||||
@ -0,0 +1,19 @@
|
||||
[Unit]
|
||||
Description=iSCSI initiator daemon (iscsid)
|
||||
Documentation=man:iscsid(8)
|
||||
Wants=network-online.target remote-fs-pre.target
|
||||
Before=remote-fs-pre.target
|
||||
After=network.target network-online.target
|
||||
DefaultDependencies=no
|
||||
Conflicts=shutdown.target
|
||||
Before=shutdown.target
|
||||
ConditionVirtualization=!private-users
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
PIDFile=/run/iscsid.pid
|
||||
ExecStartPre=/usr/lib/open-iscsi/startup-checks.sh
|
||||
ExecStart=/usr/sbin/iscsid
|
||||
|
||||
[Install]
|
||||
WantedBy=sysinit.target
|
||||
@ -0,0 +1,9 @@
|
||||
[Unit]
|
||||
Description=Open-iSCSI iscsid Socket
|
||||
Documentation=man:iscsid(8) man:iscsiadm(8)
|
||||
|
||||
[Socket]
|
||||
ListenStream=@ISCSIADM_ABSTRACT_NAMESPACE
|
||||
|
||||
[Install]
|
||||
WantedBy=sockets.target
|
||||
@ -0,0 +1,31 @@
|
||||
[Unit]
|
||||
Description=Login to default iSCSI targets
|
||||
Documentation=man:iscsiadm(8) man:iscsid(8)
|
||||
Wants=network-online.target remote-fs-pre.target
|
||||
After=network-online.target iscsid.service
|
||||
Before=remote-fs-pre.target
|
||||
DefaultDependencies=no
|
||||
Conflicts=shutdown.target
|
||||
Before=shutdown.target
|
||||
# Must have some pre-defined targets to login to
|
||||
ConditionDirectoryNotEmpty=|/etc/iscsi/nodes
|
||||
# or have a session to use via iscsid
|
||||
ConditionDirectoryNotEmpty=|/sys/class/iscsi_session
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
RemainAfterExit=true
|
||||
# iscsiadm --login will return 21 if no nodes are configured,
|
||||
# and 15 if a session is already logged in (which we do not
|
||||
# consider an error)
|
||||
SuccessExitStatus=15 21
|
||||
# Note: iscsid will be socket activated by iscsiadm
|
||||
ExecStart=/usr/sbin/iscsiadm -m node --loginall=automatic
|
||||
ExecStart=/usr/lib/open-iscsi/activate-storage.sh
|
||||
ExecStop=/usr/lib/open-iscsi/umountiscsi.sh
|
||||
ExecStop=/bin/sync
|
||||
ExecStop=/usr/lib/open-iscsi/logout-all.sh
|
||||
|
||||
[Install]
|
||||
WantedBy=sysinit.target
|
||||
Alias=iscsi.service
|
||||
@ -0,0 +1,3 @@
|
||||
# run before 80-networking.rules to run before ifupdown
|
||||
SUBSYSTEM=="net", ACTION=="add", RUN+="/usr/lib/open-iscsi/net-interface-handler start"
|
||||
SUBSYSTEM=="net", ACTION=="remove", RUN+="/usr/lib/open-iscsi/net-interface-handler stop"
|
||||
@ -0,0 +1,3 @@
|
||||
# When iscsi disks are present, iscsid.service should be running. LP: #1802354
|
||||
# ID_PATH looks like ip-<ipv4-dotted-quad>:<port>-iscsi-<target>-lun-<lun>
|
||||
SUBSYSTEM=="block", ACTION=="add", ENV{ID_PATH}=="*-iscsi-*", ENV{SYSTEMD_WANTS}+="iscsid.service"
|
||||
BIN
overlays/rpi4-armbian-longhorn-root/usr/local/bin/k3s
Executable file
BIN
overlays/rpi4-armbian-longhorn-root/usr/local/bin/k3s
Executable file
Binary file not shown.
77
overlays/rpi4-armbian-longhorn-root/usr/local/bin/k3s-agent-uninstall.sh
Executable file
77
overlays/rpi4-armbian-longhorn-root/usr/local/bin/k3s-agent-uninstall.sh
Executable file
@ -0,0 +1,77 @@
|
||||
#!/bin/sh
# k3s agent uninstaller (vendored upstream k3s script).
# Stops the agent, removes its service definitions, binaries and state,
# then removes itself. Destructive: deletes k3s data directories.
set -x
# Re-exec as root, preserving K3S_DATA_DIR so the right tree is removed.
[ $(id -u) -eq 0 ] || exec sudo --preserve-env=K3S_DATA_DIR $0 $@

K3S_DATA_DIR=${K3S_DATA_DIR:-/var/lib/rancher/k3s}

# Kill all k3s processes and unmount pod mounts first so the files
# removed below are not busy.
/usr/local/bin/k3s-killall.sh

if command -v systemctl; then
    systemctl disable k3s-agent
    systemctl reset-failed k3s-agent
    systemctl daemon-reload
fi
if command -v rc-update; then
    rc-update delete k3s-agent default
fi

rm -f /etc/systemd/system/k3s-agent.service
rm -f /etc/systemd/system/k3s-agent.service.env

# Remove this uninstall script itself on exit, whatever the outcome.
remove_uninstall() {
    rm -f /usr/local/bin/k3s-agent-uninstall.sh
}
trap remove_uninstall EXIT

# If other k3s services remain installed (e.g. a server), keep the
# shared binary and data and only remove the agent bits done above.
if (ls /etc/systemd/system/k3s*.service || ls /etc/init.d/k3s*) >/dev/null 2>&1; then
    set +x; echo 'Additional k3s services installed, skipping uninstall of k3s'; set -x
    exit
fi

# Drop CLI symlinks that point at the k3s multi-call binary.
for cmd in kubectl crictl ctr; do
    if [ -L /usr/local/bin/$cmd ]; then
        rm -f /usr/local/bin/$cmd
    fi
done

# Recursively delete a directory tree while leaving paths that are still
# mount points in place (descends into them instead of rm -rf'ing them).
# NOTE(review): the `grep " $1"` against /proc/mounts is a substring
# match, so "/a" also matches "/a-b" entries — confirm acceptable.
clean_mounted_directory() {
    if ! grep -q " $1" /proc/mounts; then
        rm -rf "$1"
        return 0
    fi

    for path in "$1"/*; do
        if [ -d "$path" ]; then
            if grep -q " $path" /proc/mounts; then
                clean_mounted_directory "$path"
            else
                rm -rf "$path"
            fi
        else
            rm "$path"
        fi
    done
}

rm -rf /etc/rancher/k3s
rm -rf /run/k3s
rm -rf /run/flannel
clean_mounted_directory ${K3S_DATA_DIR}
rm -rf /var/lib/kubelet
rm -f /usr/local/bin/k3s
rm -f /usr/local/bin/k3s-killall.sh

# Remove the k3s-selinux policy package on RPM-based distributions.
if type yum >/dev/null 2>&1; then
    yum remove -y k3s-selinux
    rm -f /etc/yum.repos.d/rancher-k3s-common*.repo
elif type rpm-ostree >/dev/null 2>&1; then
    rpm-ostree uninstall k3s-selinux
    rm -f /etc/yum.repos.d/rancher-k3s-common*.repo
elif type zypper >/dev/null 2>&1; then
    uninstall_cmd="zypper remove -y k3s-selinux"
    # On transactional (read-only root) systems, run through
    # transactional-update instead of calling zypper directly.
    if [ "${TRANSACTIONAL_UPDATE=false}" != "true" ] && [ -x /usr/sbin/transactional-update ]; then
        uninstall_cmd="transactional-update --no-selfupdate -d run $uninstall_cmd"
    fi
    sudo $uninstall_cmd
    rm -f /etc/zypp/repos.d/rancher-k3s-common*.repo
fi
|
||||
91
overlays/rpi4-armbian-longhorn-root/usr/local/bin/k3s-killall.sh
Executable file
91
overlays/rpi4-armbian-longhorn-root/usr/local/bin/k3s-killall.sh
Executable file
@ -0,0 +1,91 @@
|
||||
#!/bin/sh
# k3s-killall (vendored upstream): stop every k3s service, kill all
# containerd-shim process trees, unmount pod/CNI mounts and tear down
# CNI network state. Leaves k3s installed; it only stops workloads.
[ $(id -u) -eq 0 ] || exec sudo --preserve-env=K3S_DATA_DIR $0 $@

K3S_DATA_DIR=${K3S_DATA_DIR:-/var/lib/rancher/k3s}

# Put the bundled k3s tool directories on PATH (crictl/ctr and helpers).
for bin in ${K3S_DATA_DIR}/data/**/bin/; do
    [ -d $bin ] && export PATH=$PATH:$bin:$bin/aux
done

set -x

for service in /etc/systemd/system/k3s*.service; do
    [ -s $service ] && systemctl stop $(basename $service)
done

for service in /etc/init.d/k3s*; do
    [ -x $service ] && $service stop
done

# Print the direct children of PID $1, one per line.
pschildren() {
    ps -e -o ppid= -o pid= | \
    sed -e 's/^\s*//g; s/\s\s*/\t/g;' | \
    grep -w "^$1" | \
    cut -f2
}

# Print the given PIDs and, recursively, all of their descendants.
pstree() {
    for pid in $@; do
        echo $pid
        for child in $(pschildren $pid); do
            pstree $child
        done
    done
}

# SIGKILL every process in the trees rooted at the given PIDs.
# The set +x/-x dance keeps xtrace noise out of the $() substitution.
killtree() {
    kill -9 $(
        { set +x; } 2>/dev/null;
        pstree $@;
        set -x;
    ) 2>/dev/null
}

# Delete pod/CNI-related network interfaces and reset tailscale
# advertised routes.
remove_interfaces() {
    # Delete network interface(s) that match 'master cni0'
    ip link show 2>/dev/null | grep 'master cni0' | while read ignore iface ignore; do
        iface=${iface%%@*}
        [ -z "$iface" ] || ip link delete $iface
    done

    # Delete cni related interfaces
    ip link delete cni0
    ip link delete flannel.1
    ip link delete flannel-v6.1
    ip link delete kube-ipvs0
    ip link delete flannel-wg
    ip link delete flannel-wg-v6

    # Restart tailscale
    if [ -n "$(command -v tailscale)" ]; then
        tailscale set --advertise-routes=
    fi
}

# PIDs of all containerd-shim processes launched from K3S_DATA_DIR.
getshims() {
    ps -e -o pid= -o args= | sed -e 's/^ *//; s/\s\s*/\t/;' | grep -w "${K3S_DATA_DIR}"'/data/[^/]*/bin/containerd-shim' | cut -f1
}

killtree $({ set +x; } 2>/dev/null; getshims; set -x)

# Unmount (deepest first, via sort -r) and remove every mount whose
# path starts with the given prefix.
do_unmount_and_remove() {
    set +x
    while read -r _ path _; do
        case "$path" in $1*) echo "$path" ;; esac
    done < /proc/self/mounts | sort -r | xargs -r -t -n 1 sh -c 'umount -f "$0" && rm -rf "$0"'
    set -x
}

do_unmount_and_remove '/run/k3s'
do_unmount_and_remove '/var/lib/kubelet/pods'
do_unmount_and_remove '/var/lib/kubelet/plugins'
do_unmount_and_remove '/run/netns/cni-'

# Remove CNI namespaces
ip netns show 2>/dev/null | grep cni- | xargs -r -t -n 1 ip netns delete

remove_interfaces

rm -rf /var/lib/cni/
# Strip KUBE-/CNI-/flannel chains while keeping all other firewall rules.
iptables-save | grep -v KUBE- | grep -v CNI- | grep -iv flannel | iptables-restore
ip6tables-save | grep -v KUBE- | grep -v CNI- | grep -iv flannel | ip6tables-restore
|
||||
@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env bash
# First-boot provisioning for rpi4 Armbian Longhorn-root nodes:
# configures hostname/network, merges fstab entries, installs SSH keys
# and an open-iscsi + k3s agent stack — exactly once (marker-guarded).
set -euo pipefail

marker="/var/lib/metis/rpi4-longhorn-firstboot.done"
env_file="/etc/metis/firstboot.env"
key_file="/etc/metis/authorized_keys"
fstab_append="/etc/metis/fstab.append"
# Supplementary groups granted to the provisioned SSH user (only those
# that actually exist on the image are applied).
default_groups=(tty disk dialout sudo audio video plugdev games users systemd-journal input render netdev)

# Mirror all stdout/stderr to a persistent log for post-mortem debugging.
exec > >(tee -a /var/log/metis-rpi4-longhorn-firstboot.log) 2>&1
|
||||
retry_cmd() {
    # Run a command until it succeeds, with a bounded retry budget.
    # $1 = maximum attempts; remaining args = command and its arguments.
    # Sleeps 5 seconds between attempts; returns 1 when the budget is
    # exhausted, 0 as soon as the command succeeds.
    local max_attempts="$1"
    shift
    local attempt=1
    while ! "$@"; do
        [ "${attempt}" -lt "${max_attempts}" ] || return 1
        attempt=$((attempt + 1))
        sleep 5
    done
}
|
||||
|
||||
# Block (up to ~1 minute: 12 tries x 5s via retry_cmd) until the kernel
# can route to 1.1.1.1; called before any apt/curl network operation.
ensure_network_access() {
    retry_cmd 12 sh -c 'ip route get 1.1.1.1 >/dev/null 2>&1'
}
|
||||
|
||||
# Idempotence guard: the marker is created at the very end on success.
if [ -f "${marker}" ]; then
    exit 0
fi

mkdir -p /var/lib/metis /mnt/astreae /mnt/asteria

# Optional METIS_* overrides (hostname, SSH user, k3s version).
if [ -f "${env_file}" ]; then
    # shellcheck disable=SC1090
    . "${env_file}"
fi

metis_hostname="${METIS_HOSTNAME:-}"
metis_ssh_user="${METIS_SSH_USER:-atlas}"
metis_k3s_version="${METIS_K3S_VERSION:-}"

if [ -n "${metis_hostname}" ]; then
    hostnamectl set-hostname "${metis_hostname}" || true
fi

# Network: prefer NetworkManager when present — disable autoconnect on
# any other ethernet profile bound to end0, then bring up end0-static.
# Otherwise fall back to systemd-networkd if a static profile exists.
if command -v nmcli >/dev/null 2>&1; then
    retry_cmd 10 sh -c 'nmcli general status >/dev/null 2>&1'
    nmcli connection reload || true
    while IFS=: read -r name type device; do
        [ "${device}" = "end0" ] || continue
        [ "${name}" = "end0-static" ] && continue
        case "${type}" in
            ethernet|802-3-ethernet)
                nmcli connection modify "${name}" connection.autoconnect no || true
                ;;
        esac
    done < <(nmcli -t -f NAME,TYPE,DEVICE connection show 2>/dev/null || true)
    nmcli connection up end0-static || true
elif [ -f /etc/systemd/network/10-end0-static.network ]; then
    systemctl enable systemd-networkd.service || true
    systemctl restart systemd-networkd.service || true
    systemctl restart systemd-networkd-wait-online.service || true
fi

# Merge extra fstab lines, skipping blanks and exact duplicates.
if [ -f "${fstab_append}" ]; then
    while IFS= read -r line; do
        [ -z "${line}" ] && continue
        grep -Fqx "${line}" /etc/fstab || printf '%s\n' "${line}" >> /etc/fstab
    done < "${fstab_append}"
fi

mount -a || true

# Install only the packages whose binaries are missing from the image.
packages=()
if ! command -v sshd >/dev/null 2>&1; then
    packages+=("openssh-server")
fi
if ! command -v mount.nfs >/dev/null 2>&1; then
    packages+=("nfs-common")
fi
if ! command -v iscsiadm >/dev/null 2>&1; then
    packages+=("open-iscsi")
fi
if [ "${#packages[@]}" -gt 0 ]; then
    export DEBIAN_FRONTEND=noninteractive
    ensure_network_access
    retry_cmd 5 apt-get update
    retry_cmd 5 apt-get install -y --no-install-recommends "${packages[@]}"
fi

# SSH: try socket activation first, fall back to the plain service.
systemctl daemon-reload
systemctl enable ssh.socket || systemctl enable ssh.service || true
systemctl restart ssh.socket || systemctl restart ssh.service || systemctl start ssh.socket || systemctl start ssh.service || true
# iSCSI initiator setup (required for Longhorn volumes).
mkdir -p /etc/iscsi /etc/iscsi/nodes /etc/iscsi/send_targets
if [ ! -s /etc/iscsi/initiatorname.iscsi ] && command -v iscsi-iname >/dev/null 2>&1; then
    printf 'InitiatorName=%s\n' "$(iscsi-iname)" > /etc/iscsi/initiatorname.iscsi
fi
systemctl enable --now iscsid.socket || true
systemctl enable --now open-iscsi.service || true

# Authorized keys: always installed for root; additionally create/update
# the configured SSH user with whichever default groups exist.
if [ -s "${key_file}" ]; then
    install -d -m 700 /root/.ssh
    install -m 600 "${key_file}" /root/.ssh/authorized_keys

    if [ -n "${metis_ssh_user}" ]; then
        group_list=()
        for group_name in "${default_groups[@]}"; do
            if getent group "${group_name}" >/dev/null 2>&1; then
                group_list+=("${group_name}")
            fi
        done
        if [ "${#group_list[@]}" -gt 0 ]; then
            # Join the group list with commas for useradd/usermod -G.
            group_csv="$(IFS=,; printf '%s' "${group_list[*]}")"
        else
            group_csv=""
        fi

        if ! id "${metis_ssh_user}" >/dev/null 2>&1; then
            if [ -n "${group_csv}" ]; then
                useradd -m -s /bin/bash -G "${group_csv}" "${metis_ssh_user}"
            else
                useradd -m -s /bin/bash "${metis_ssh_user}"
            fi
        elif [ -n "${group_csv}" ]; then
            usermod -a -G "${group_csv}" "${metis_ssh_user}" || true
        fi

        install -d -m 700 -o "${metis_ssh_user}" -g "${metis_ssh_user}" "/home/${metis_ssh_user}/.ssh"
        install -m 600 -o "${metis_ssh_user}" -g "${metis_ssh_user}" "${key_file}" "/home/${metis_ssh_user}/.ssh/authorized_keys"
    fi
fi

# Clear Armbian's first-login wizard flag.
rm -f /root/.not_logged_in_yet

# Install k3s in agent mode via the upstream installer if missing,
# optionally pinning the version from METIS_K3S_VERSION.
if ! command -v k3s >/dev/null 2>&1; then
    installer_env=("INSTALL_K3S_EXEC=agent")
    if [ -n "${metis_k3s_version}" ]; then
        installer_env+=("INSTALL_K3S_VERSION=${metis_k3s_version}")
    fi
    ensure_network_access
    retry_cmd 5 env "${installer_env[@]}" sh -c 'curl -sfL https://get.k3s.io | sh -'
fi

systemctl enable k3s-agent
systemctl restart k3s-agent || systemctl start k3s-agent

# Success: record completion so subsequent boots exit immediately.
touch "${marker}"
|
||||
BIN
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsi-iname
Executable file
BIN
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsi-iname
Executable file
Binary file not shown.
195
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsi_discovery
Executable file
195
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsi_discovery
Executable file
@ -0,0 +1,195 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (C) Voltaire Ltd. 2006. ALL RIGHTS RESERVED.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
#
|
||||
# Author: Dan Bar Dov <danb@voltaire.com>
|
||||
|
||||
# iscsi_discovery:
|
||||
# * does a send-targets discovery to the given IP
|
||||
# * set the transport type to the preferred transport (or tcp if the -t flag is not used)
|
||||
# * tries to login
|
||||
# * if succeeds,
|
||||
# o logout,
|
||||
# o mark record automatic (unless -m flag is used)
|
||||
# * else
|
||||
# o reset transport type to TCP
|
||||
# o try to login
|
||||
# o if succeeded
|
||||
# + logout
|
||||
# + mark record automatic (unless -m flag is used)
|
||||
#
|
||||
|
||||
# Print CLI usage and option summary to stdout.
usage()
{
	echo "Usage: $0 <IP> [-p <port>] [-d] [-t <tcp|iser> [-f]] [-m] [-l]"
	echo "Options:"
	echo "-p set the port number (default is 3260)."
	echo "-d print debugging information"
	echo "-t set transport (default is tcp)."
	echo "-f force specific transport -disable the fallback to tcp (default is fallback enabled)."
	echo "   force the transport specified by the argument of the -t flag."
	echo "-m manual startup - will set manual startup (default is automatic startup)."
	echo "-l login to the new discovered nodes (default is false)."
}
|
||||
|
||||
# Echo the arguments only when -d enabled the global $debug flag.
# (Returns non-zero when debug is disabled; callers ignore the status.)
dbg()
{
	$debug && echo $@
}
|
||||
|
||||
initialize()
{
	# Install a SIGINT trap and seed the global option defaults that
	# parse_cmdline may later override.
	trap "exit" 2
	transport=tcp      # default transport
	port=3260          # default iSCSI portal port
	debug=false        # -d enables dbg() output
	force="0"          # -f disables the fallback to tcp
	log_out="1"        # cleared by -l to stay logged in after discovery
	startup_manual="0" # -m leaves records at manual startup
}
|
||||
|
||||
# Parse "<IP> [options]" into the globals ip, port, transport, debug,
# force, startup_manual and log_out. Exits 1 on missing/invalid IP or
# unknown options (after printing usage).
parse_cmdline()
{
	if [ $# -lt 1 ]; then
		usage
		exit 1
	fi

	# check if the IP address is valid (four non-empty dotted octets,
	# each numerically <= 255)
	ip=`echo $1 | awk -F'.' '$1 != "" && $1 <=255 && $2 != "" && $2 <= 255 && $3 != "" && $3 <= 255 && $4 != "" && $4 <= 255 {print $0}'`
	if [ -z "$ip" ]; then
		# fixed typo: "vaild" -> "valid" in the user-facing message
		echo "$1 is not a valid IP address!"
		exit 1
	fi
	shift
	while getopts "dfmlt:p:" options; do
		case $options in
			d ) debug=true;;
			f ) force="1";;
			t ) transport=$OPTARG;;
			p ) port=$OPTARG;;
			m ) startup_manual="1";;
			l ) log_out=0;;
			\? ) usage
				exit 1;;
			* ) usage
				exit 1;;
		esac
	done
}
|
||||
|
||||
# Run a sendtargets discovery against ${ip}:${port} and attempt a login
# (via select_transport) for each portal/target pair returned.
discover()
{
	# If open-iscsi is already logged in to the portal, exit
	if [ $(iscsiadm -m session | grep -c ${ip}:${port}) -ne 0 ]; then
		echo "Please logout from all targets on ${ip}:${port} before trying to run discovery on that portal"
		exit 2
	fi

	connected=0
	discovered=0

	dbg "starting discovery to $ip"
	disc="$(iscsiadm -m discovery --type sendtargets --portal ${ip}:${port})"
	# NOTE(review): this pipeline runs the while loop in a subshell, so
	# variables set inside it (e.g. connected in try_login) do not
	# propagate back to this shell — confirm intended.
	echo "${disc}" | while read portal target
	do
		# strip the ",<tpgt>" suffix from "ip:port,tpgt"
		portal=${portal%,*}
		select_transport
	done

	# NOTE(review): `wc -l` on an empty ${disc} still prints 1, so the
	# "failed to discover" branch below looks unreachable — verify.
	discovered=$(echo "${disc}" | wc -l)
	if [ ${discovered} = 0 ]; then
		echo "failed to discover targets at ${ip}"
		exit 2
	else
		echo "discovered ${discovered} targets at ${ip}"
	fi
}
|
||||
|
||||
# Attempt a login to ${target} through ${portal}. On success: report it,
# bump the connected counter, and log back out unless -l requested to
# stay logged in. On failure: report and revert the record to manual
# startup. Unless -m was given, the record is first marked automatic.
# Returns iscsiadm's login exit status.
try_login()
{
	if [ "$startup_manual" != "1" ]; then
		iscsiadm -m node --targetname ${target} --portal ${portal} --op update -n node.conn[0].startup -v automatic
	fi
	iscsiadm -m node --targetname ${target} --portal ${portal} --login >/dev/null 2>&1
	ret=$?
	if [ ${ret} = 0 ]; then
		echo "Set target ${target} to automatic login over ${transport} to portal ${portal}"
		# bash arithmetic; counter is read nowhere else in this script
		((connected++))
		if [ "$log_out" = "1" ]; then
			iscsiadm -m node --targetname ${target} --portal ${portal} --logout
		fi
	else
		echo "Cannot login over ${transport} to portal ${portal}"
		iscsiadm -m node --targetname ${target} --portal ${portal} --op update -n node.conn[0].startup -v manual
	fi
	return ${ret}
}
|
||||
|
||||
# Switch the node record for ${target}/${portal} to transport $1,
# applying transport-specific tuning first (iser: disable digests;
# cxgb3i: cap the receive data segment length).
set_transport()
{
	transport=$1
	case "$transport" in
	iser)
		# iSER does not use digest
		iscsiadm -m node --targetname ${target} --portal ${portal} \
			--op update -n node.conn[0].iscsi.HeaderDigest -v None
		iscsiadm -m node --targetname ${target} --portal ${portal} \
			--op update -n node.conn[0].iscsi.DataDigest -v None
		;;
	cxgb3i)
		# cxgb3i supports <= 16K packet (BHS + AHS + pdu payload + digests)
		iscsiadm -m node --targetname ${target} --portal ${portal} \
			--op update -n node.conn[0].iscsi.MaxRecvDataSegmentLength \
			-v 8192
		;;
	esac
	# Read the record's exact transport_name setting key back from
	# iscsiadm, then update that key to the requested transport.
	transport_name=`iscsiadm -m node -p ${portal} -T ${target} |awk '/transport_name/ {print $1}'`
	iscsiadm -m node --targetname ${target} --portal ${portal} \
		--op update -n ${transport_name} -v ${transport}
}
|
||||
|
||||
# Try to login using the preferred ${transport}; unless -f (force) was
# given, fall back to plain tcp when that first login fails.
select_transport()
{
	set_transport $transport
	dbg "Testing $transport-login to target ${target} portal ${portal}"
	try_login;
	if [ $? != 0 -a "$force" = "0" ]; then
		set_transport tcp
		dbg "starting to test tcp-login to target ${target} portal ${portal}"
		try_login;
	fi
}
|
||||
|
||||
# Exit with status 1 unless the iscsid daemon is running; all discovery
# and login operations below require it.
check_iscsid()
{
	#check if iscsid is running
	pidof iscsid &>/dev/null
	ret=$?
	if [ $ret -ne 0 ]; then
		echo "iscsid is not running"
		echo "Exiting..."
		exit 1
	fi
}
|
||||
|
||||
# Entry point: verify the daemon, set defaults, parse CLI, run discovery.
check_iscsid
initialize
parse_cmdline "$@"
discover
|
||||
BIN
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsiadm
Executable file
BIN
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsiadm
Executable file
Binary file not shown.
BIN
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsid
Executable file
BIN
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsid
Executable file
Binary file not shown.
BIN
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsistart
Executable file
BIN
overlays/rpi4-armbian-longhorn-root/usr/sbin/iscsistart
Executable file
Binary file not shown.
@ -1,89 +1,96 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"fmt"
|
||||
|
||||
"metis/pkg/inventory"
|
||||
"metis/pkg/inventory"
|
||||
)
|
||||
|
||||
// NodeConfig represents boot-time configuration to inject.
|
||||
type NodeConfig struct {
|
||||
Hostname string `json:"hostname"`
|
||||
IP string `json:"ip"`
|
||||
K3s K3sConfig `json:"k3s"`
|
||||
SSHUser string `json:"ssh_user,omitempty"`
|
||||
SSHKeys []string `json:"ssh_keys,omitempty"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Taints []string `json:"taints,omitempty"`
|
||||
Fstab []FstabEntry `json:"fstab,omitempty"`
|
||||
Hostname string `json:"hostname"`
|
||||
IP string `json:"ip"`
|
||||
K3s K3sConfig `json:"k3s"`
|
||||
SSHUser string `json:"ssh_user,omitempty"`
|
||||
SSHKeys []string `json:"ssh_keys,omitempty"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Taints []string `json:"taints,omitempty"`
|
||||
Fstab []FstabEntry `json:"fstab,omitempty"`
|
||||
Secrets map[string]string `json:"secrets,omitempty"` // optional key/values for local agent use
|
||||
}
|
||||
|
||||
// K3sConfig includes role and token/url.
|
||||
type K3sConfig struct {
|
||||
Role string `json:"role"`
|
||||
URL string `json:"url,omitempty"`
|
||||
Token string `json:"token,omitempty"`
|
||||
Args []string `json:"args,omitempty"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Taints []string `json:"taints,omitempty"`
|
||||
Role string `json:"role"`
|
||||
Version string `json:"version,omitempty"`
|
||||
URL string `json:"url,omitempty"`
|
||||
Token string `json:"token,omitempty"`
|
||||
Args []string `json:"args,omitempty"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Taints []string `json:"taints,omitempty"`
|
||||
}
|
||||
|
||||
// FstabEntry for Longhorn or other mounts.
|
||||
type FstabEntry struct {
|
||||
UUID string `json:"uuid"`
|
||||
Mountpoint string `json:"mountpoint"`
|
||||
FS string `json:"fs"`
|
||||
Options string `json:"options"`
|
||||
UUID string `json:"uuid"`
|
||||
Mountpoint string `json:"mountpoint"`
|
||||
FS string `json:"fs"`
|
||||
Options string `json:"options"`
|
||||
}
|
||||
|
||||
// Build creates a NodeConfig from inventory.
|
||||
func Build(inv *inventory.Inventory, nodeName string) (*NodeConfig, error) {
|
||||
n, cls, err := inv.FindNode(nodeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
labels := map[string]string{}
|
||||
for k, v := range cls.DefaultLabels {
|
||||
labels[k] = v
|
||||
}
|
||||
for k, v := range n.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
taints := append([]string{}, cls.DefaultTaints...)
|
||||
taints = append(taints, n.Taints...)
|
||||
n, cls, err := inv.FindNode(nodeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
labels := map[string]string{}
|
||||
for k, v := range cls.DefaultLabels {
|
||||
labels[k] = v
|
||||
}
|
||||
for k, v := range n.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
taints := append([]string{}, cls.DefaultTaints...)
|
||||
taints = append(taints, n.Taints...)
|
||||
k3sVersion := cls.K3sVersion
|
||||
if n.K3sVersion != "" {
|
||||
k3sVersion = n.K3sVersion
|
||||
}
|
||||
|
||||
fstab := []FstabEntry{}
|
||||
for _, d := range n.LonghornDisks {
|
||||
fs := d.FS
|
||||
if fs == "" {
|
||||
fs = "ext4"
|
||||
}
|
||||
fstab = append(fstab, FstabEntry{
|
||||
UUID: d.UUID,
|
||||
Mountpoint: d.Mountpoint,
|
||||
FS: fs,
|
||||
Options: "defaults,nofail",
|
||||
})
|
||||
}
|
||||
fstab := []FstabEntry{}
|
||||
for _, d := range n.LonghornDisks {
|
||||
fs := d.FS
|
||||
if fs == "" {
|
||||
fs = "ext4"
|
||||
}
|
||||
fstab = append(fstab, FstabEntry{
|
||||
UUID: d.UUID,
|
||||
Mountpoint: d.Mountpoint,
|
||||
FS: fs,
|
||||
Options: "defaults,nofail",
|
||||
})
|
||||
}
|
||||
|
||||
cfg := &NodeConfig{
|
||||
Hostname: n.Hostname,
|
||||
IP: n.IP,
|
||||
SSHUser: n.SSHUser,
|
||||
SSHKeys: n.SSHAuthorized,
|
||||
Labels: labels,
|
||||
Taints: taints,
|
||||
Fstab: fstab,
|
||||
K3s: K3sConfig{
|
||||
Role: n.K3sRole,
|
||||
URL: n.K3sURL,
|
||||
Token: n.K3sToken,
|
||||
Labels: labels,
|
||||
Taints: taints,
|
||||
},
|
||||
}
|
||||
if cfg.Hostname == "" || cfg.IP == "" {
|
||||
return nil, fmt.Errorf("hostname/ip required for node %s", nodeName)
|
||||
}
|
||||
return cfg, nil
|
||||
cfg := &NodeConfig{
|
||||
Hostname: n.Hostname,
|
||||
IP: n.IP,
|
||||
SSHUser: n.SSHUser,
|
||||
SSHKeys: n.SSHAuthorized,
|
||||
Labels: labels,
|
||||
Taints: taints,
|
||||
Fstab: fstab,
|
||||
K3s: K3sConfig{
|
||||
Role: n.K3sRole,
|
||||
Version: k3sVersion,
|
||||
URL: n.K3sURL,
|
||||
Token: n.K3sToken,
|
||||
Labels: labels,
|
||||
Taints: taints,
|
||||
},
|
||||
}
|
||||
if cfg.Hostname == "" || cfg.IP == "" {
|
||||
return nil, fmt.Errorf("hostname/ip required for node %s", nodeName)
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
63
pkg/facts/aggregate.go
Normal file
63
pkg/facts/aggregate.go
Normal file
@ -0,0 +1,63 @@
|
||||
package facts
|
||||
|
||||
import (
|
||||
"metis/pkg/inventory"
|
||||
)
|
||||
|
||||
// ClassSummary captures aggregated sentinel facts per class.
// Each tally map counts how many snapshots in the class reported a
// given value (e.g. Kernels["6.6.1"] == 3 means three snapshots ran it),
// which lets callers spot version drift inside a class.
type ClassSummary struct {
	Class string   `json:"class"`
	Nodes []string `json:"nodes"` // one entry per snapshot; hostnames may repeat

	Kernels      map[string]int            `json:"kernels,omitempty"`
	OSImages     map[string]int            `json:"os_images,omitempty"`
	Containerd   map[string]int            `json:"containerd,omitempty"`
	K3sVersions  map[string]int            `json:"k3s_versions,omitempty"`
	PackageStats map[string]map[string]int `json:"package_stats,omitempty"` // pkg -> version -> count
}
|
||||
|
||||
// Aggregate groups snapshots by inventory class and tallies version drift.
//
// Each snapshot is resolved to a class by looking up its Hostname via
// inv.FindNode; snapshots that cannot be resolved — or when inv is nil —
// land in the synthetic "unknown" class. Empty fact strings mean "not
// collected" and are not tallied. A node appearing in several snapshots
// is counted once per snapshot (including repeats in Nodes).
func Aggregate(inv *inventory.Inventory, snaps []Snapshot) map[string]*ClassSummary {
	result := map[string]*ClassSummary{}
	for _, s := range snaps {
		// Default bucket for unresolvable hostnames.
		class := "unknown"
		if inv != nil {
			if node, cls, err := inv.FindNode(s.Hostname); err == nil && cls != nil && node != nil {
				class = cls.Name
			}
		}
		sum, ok := result[class]
		if !ok {
			// Lazily create the per-class summary with empty tally maps.
			sum = &ClassSummary{
				Class:        class,
				Kernels:      map[string]int{},
				OSImages:     map[string]int{},
				Containerd:   map[string]int{},
				K3sVersions:  map[string]int{},
				PackageStats: map[string]map[string]int{},
			}
			result[class] = sum
		}
		sum.Nodes = append(sum.Nodes, s.Hostname)
		if s.Kernel != "" {
			sum.Kernels[s.Kernel]++
		}
		if s.OSImage != "" {
			sum.OSImages[s.OSImage]++
		}
		if s.Containerd != "" {
			sum.Containerd[s.Containerd]++
		}
		if s.K3sVersion != "" {
			sum.K3sVersions[s.K3sVersion]++
		}
		for pkg, ver := range s.PackageSample {
			if sum.PackageStats[pkg] == nil {
				sum.PackageStats[pkg] = map[string]int{}
			}
			if ver != "" {
				sum.PackageStats[pkg][ver]++
			}
		}
	}
	return result
}
|
||||
33
pkg/facts/aggregate_test.go
Normal file
33
pkg/facts/aggregate_test.go
Normal file
@ -0,0 +1,33 @@
|
||||
package facts
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"metis/pkg/inventory"
|
||||
)
|
||||
|
||||
// TestAggregateGroupsByClass verifies that snapshots are bucketed by
// their node's inventory class and that kernel/package versions are
// tallied per snapshot (a node seen twice counts twice).
func TestAggregateGroupsByClass(t *testing.T) {
	inv := &inventory.Inventory{
		Classes: []inventory.NodeClass{{Name: "c1"}, {Name: "c2"}},
		Nodes: []inventory.NodeSpec{
			{Name: "n1", Class: "c1"},
			{Name: "n2", Class: "c2"},
		},
	}
	snaps := []Snapshot{
		{Hostname: "n1", Kernel: "k1", PackageSample: map[string]string{"containerd": "2.0"}},
		{Hostname: "n2", Kernel: "k2", PackageSample: map[string]string{"containerd": "1.7"}},
		// n1 reported twice: its kernel tally must reach 2 below.
		{Hostname: "n1", Kernel: "k1"},
	}
	sum := Aggregate(inv, snaps)
	if len(sum) != 2 {
		t.Fatalf("expected 2 classes, got %d", len(sum))
	}
	c1 := sum["c1"]
	if c1 == nil || c1.Kernels["k1"] != 2 {
		t.Fatalf("expected k1 count 2, got %#v", c1)
	}
	if c1.PackageStats["containerd"]["2.0"] != 1 {
		t.Fatalf("package stats not tallied: %#v", c1.PackageStats)
	}
}
|
||||
43
pkg/facts/load.go
Normal file
43
pkg/facts/load.go
Normal file
@ -0,0 +1,43 @@
|
||||
package facts
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// Snapshot mirrors sentinel output; kept minimal to avoid tight coupling.
|
||||
type Snapshot struct {
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
Kernel string `json:"kernel,omitempty"`
|
||||
OSImage string `json:"os_image,omitempty"`
|
||||
K3sVersion string `json:"k3s_version,omitempty"`
|
||||
Containerd string `json:"containerd,omitempty"`
|
||||
PackageSample map[string]string `json:"package_sample,omitempty"`
|
||||
DropInsSample map[string]string `json:"dropins_sample,omitempty"`
|
||||
}
|
||||
|
||||
// LoadDir reads all *.json under a directory and returns snapshots.
|
||||
func LoadDir(dir string) ([]Snapshot, error) {
|
||||
var snaps []Snapshot
|
||||
err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if d.IsDir() || filepath.Ext(path) != ".json" {
|
||||
return nil
|
||||
}
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var s Snapshot
|
||||
if err := json.Unmarshal(b, &s); err != nil {
|
||||
return err
|
||||
}
|
||||
snaps = append(snaps, s)
|
||||
return nil
|
||||
})
|
||||
return snaps, err
|
||||
}
|
||||
22
pkg/facts/load_test.go
Normal file
22
pkg/facts/load_test.go
Normal file
@ -0,0 +1,22 @@
|
||||
package facts
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestLoadDirReadsSnapshots checks that a *.json file written into a
// temp directory is decoded into a Snapshot with its nested map intact.
func TestLoadDirReadsSnapshots(t *testing.T) {
	dir := t.TempDir()
	snap := `{"hostname":"n1","kernel":"k","containerd":"c","package_sample":{"a":"1"}}`
	if err := os.WriteFile(filepath.Join(dir, "snap.json"), []byte(snap), 0o644); err != nil {
		t.Fatal(err)
	}
	got, err := LoadDir(dir)
	if err != nil {
		t.Fatalf("LoadDir: %v", err)
	}
	if len(got) != 1 || got[0].Hostname != "n1" || got[0].PackageSample["a"] != "1" {
		t.Fatalf("unexpected snapshot: %+v", got)
	}
}
|
||||
13
pkg/facts/recommend.go
Normal file
13
pkg/facts/recommend.go
Normal file
@ -0,0 +1,13 @@
|
||||
package facts
|
||||
|
||||
import "metis/pkg/inventory"
|
||||
|
||||
// RecommendTargets builds per-class targets from snapshots.
|
||||
func RecommendTargets(inv *inventory.Inventory, snaps []Snapshot) map[string]Targets {
|
||||
sum := Aggregate(inv, snaps)
|
||||
out := map[string]Targets{}
|
||||
for cls, s := range sum {
|
||||
out[cls] = ChooseTargets(s)
|
||||
}
|
||||
return out
|
||||
}
|
||||
37
pkg/facts/recommend_test.go
Normal file
37
pkg/facts/recommend_test.go
Normal file
@ -0,0 +1,37 @@
|
||||
package facts
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"metis/pkg/inventory"
|
||||
)
|
||||
|
||||
// TestRecommendTargetsPerClass checks that each class gets its own
// majority-vote targets derived from that class's snapshots only.
func TestRecommendTargetsPerClass(t *testing.T) {
	inv := &inventory.Inventory{
		Classes: []inventory.NodeClass{{Name: "c1"}, {Name: "c2"}},
		Nodes: []inventory.NodeSpec{
			{Name: "n1", Class: "c1"},
			{Name: "n2", Class: "c2"},
		},
	}
	snaps := []Snapshot{
		{Hostname: "n1", Kernel: "k1", Containerd: "2.0", PackageSample: map[string]string{"containerd": "2.0"}},
		{Hostname: "n2", Kernel: "k2", Containerd: "1.7", PackageSample: map[string]string{"containerd": "1.7"}},
	}
	targets := RecommendTargets(inv, snaps)
	if targets["c1"].Kernel != "k1" || targets["c1"].Containerd != "2.0" {
		t.Fatalf("unexpected targets for c1: %+v", targets["c1"])
	}
	if targets["c2"].Kernel != "k2" || targets["c2"].Packages["containerd"] != "1.7" {
		t.Fatalf("unexpected targets for c2: %+v", targets["c2"])
	}
}
|
||||
|
||||
// TestRecommendHandlesUnknownClass checks that snapshots whose hostname
// is not in the inventory fall into the synthetic "unknown" class.
func TestRecommendHandlesUnknownClass(t *testing.T) {
	inv := &inventory.Inventory{}
	snaps := []Snapshot{{Hostname: "ghost", Kernel: "k"}}
	targets := RecommendTargets(inv, snaps)
	if _, ok := targets["unknown"]; !ok {
		t.Fatalf("expected unknown class entry")
	}
}
|
||||
43
pkg/facts/targets.go
Normal file
43
pkg/facts/targets.go
Normal file
@ -0,0 +1,43 @@
|
||||
package facts
|
||||
|
||||
// Targets proposes normalized targets from a ClassSummary by picking the most common version.
// A field is left empty (and a package omitted from Packages) when the
// vote for it is tied — see topKey.
type Targets struct {
	Kernel     string
	OSImage    string
	Containerd string
	K3sVersion string
	Packages   map[string]string // package name -> majority version
}
|
||||
|
||||
// ChooseTargets picks the highest-count entry for each field. Ties are left empty.
|
||||
func ChooseTargets(sum *ClassSummary) Targets {
|
||||
t := Targets{Packages: map[string]string{}}
|
||||
if sum == nil {
|
||||
return t
|
||||
}
|
||||
t.Kernel = topKey(sum.Kernels)
|
||||
t.OSImage = topKey(sum.OSImages)
|
||||
t.Containerd = topKey(sum.Containerd)
|
||||
t.K3sVersion = topKey(sum.K3sVersions)
|
||||
for pkg, versions := range sum.PackageStats {
|
||||
if v := topKey(versions); v != "" {
|
||||
t.Packages[pkg] = v
|
||||
}
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// topKey returns the key with the strictly highest count in m. When the
// maximum count is shared by two or more keys it returns "", so callers
// never act on an arbitrary map-iteration choice. An empty or nil map
// also yields "".
func topKey(m map[string]int) string {
	var (
		winner string
		max    int
	)
	for key, count := range m {
		switch {
		case count > max:
			winner, max = key, count
		case count == max:
			// tie at the current maximum: refuse to pick
			winner = ""
		}
	}
	return winner
}
|
||||
26
pkg/facts/targets_test.go
Normal file
26
pkg/facts/targets_test.go
Normal file
@ -0,0 +1,26 @@
|
||||
package facts
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestChooseTargetsPicksMostCommon pins the majority-vote selection per
// field, the tie-goes-empty rule, and per-package version selection.
func TestChooseTargetsPicksMostCommon(t *testing.T) {
	sum := &ClassSummary{
		Kernels:     map[string]int{"k1": 2, "k2": 1},
		OSImages:    map[string]int{"os1": 1},
		Containerd:  map[string]int{"c1": 2, "c2": 2}, // tie -> empty
		K3sVersions: map[string]int{"k3s1": 3},
		PackageStats: map[string]map[string]int{
			"containerd": {"1.7": 1, "2.0": 2},
			"k3s":        {"v1": 1},
		},
	}
	tg := ChooseTargets(sum)
	// Clear majorities (or single entries) must be selected.
	if tg.Kernel != "k1" || tg.OSImage != "os1" || tg.K3sVersion != "k3s1" {
		t.Fatalf("unexpected targets: %+v", tg)
	}
	// A tied field must stay empty rather than pick arbitrarily.
	if tg.Containerd != "" {
		t.Fatalf("expected tie -> empty for containerd, got %q", tg.Containerd)
	}
	// Package versions vote independently of the top-level Containerd field.
	if tg.Packages["containerd"] != "2.0" {
		t.Fatalf("package target wrong: %+v", tg.Packages)
	}
}
|
||||
21
pkg/facts/types.go
Normal file
21
pkg/facts/types.go
Normal file
@ -0,0 +1,21 @@
|
||||
package facts
|
||||
|
||||
// ClassFacts captures driftable state collected by metis-sentinel.
// All optional fields use omitempty so untouched facts stay out of the JSON payload.
type ClassFacts struct {
	ClassName    string            `json:"class_name"`
	Kernel       string            `json:"kernel,omitempty"`
	K3sVersion   string            `json:"k3s_version,omitempty"`
	Containerd   string            `json:"containerd,omitempty"`
	Packages     map[string]string `json:"packages,omitempty"`      // name -> version
	DropIns      map[string]string `json:"dropins,omitempty"`       // path -> content
	Sysctl       map[string]string `json:"sysctl,omitempty"`        // key -> value
	CGroupConfig map[string]string `json:"cgroup_config,omitempty"` // key -> value
	Notes        string            `json:"notes,omitempty"` // free-form operator notes
}
|
||||
|
||||
// NodeFacts captures per-node data (e.g., disk UUIDs) to verify drift.
type NodeFacts struct {
	Hostname string            `json:"hostname"`
	Disks    map[string]string `json:"disks,omitempty"` // mount -> UUID
	Notes    string            `json:"notes,omitempty"` // free-form operator notes
}
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
@ -20,6 +21,18 @@ func Download(url, dest string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.HasSuffix(url, ".xz") {
|
||||
tmp := dest + ".download.xz"
|
||||
if err := downloadRaw(url, tmp); err != nil {
|
||||
return err
|
||||
}
|
||||
defer os.Remove(tmp)
|
||||
return decompressXZ(tmp, dest)
|
||||
}
|
||||
return downloadRaw(url, dest)
|
||||
}
|
||||
|
||||
func downloadRaw(url, dest string) error {
|
||||
if strings.HasPrefix(url, "file://") {
|
||||
src := strings.TrimPrefix(url, "file://")
|
||||
in, err := os.Open(src)
|
||||
@ -52,6 +65,22 @@ func Download(url, dest string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// decompressXZ streams src through the external `xz -dc` tool into dest.
// Requires the xz binary on PATH; stderr is captured and folded into the
// returned error for diagnostics.
// NOTE(review): on failure a partially written dest is left behind — callers
// appear to checksum dest afterwards, but confirm before relying on it.
func decompressXZ(src, dest string) error {
	out, err := os.Create(dest)
	if err != nil {
		return err
	}
	defer out.Close()
	// -dc decompresses to stdout so output streams straight into dest.
	cmd := exec.Command("xz", "-dc", src)
	cmd.Stdout = out
	var stderr strings.Builder
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		return fmt.Errorf("xz decompress %s: %w: %s", src, err, stderr.String())
	}
	// Flush to disk so a subsequent checksum pass sees the full content.
	return out.Sync()
}
|
||||
|
||||
// VerifyChecksum checks sha256 in the form "sha256:<hex>".
|
||||
func VerifyChecksum(path, checksum string) error {
|
||||
if checksum == "" {
|
||||
|
||||
34
pkg/image/download_test.go
Normal file
34
pkg/image/download_test.go
Normal file
@ -0,0 +1,34 @@
|
||||
package image
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestDownloadDecompressesXZFileURLs checks that Download transparently
// decompresses ".xz" sources fetched via file:// URLs and that the result
// passes checksum verification. Skipped when the xz tool is unavailable.
func TestDownloadDecompressesXZFileURLs(t *testing.T) {
	if _, err := exec.LookPath("xz"); err != nil {
		t.Skip("xz not available")
	}
	dir := t.TempDir()
	raw := filepath.Join(dir, "base.img")
	if err := os.WriteFile(raw, []byte("metis-xz-test"), 0o644); err != nil {
		t.Fatal(err)
	}
	// Compress the fixture with the real xz binary (-k keeps the original).
	compressed := raw + ".xz"
	cmd := exec.Command("xz", "-zk", raw)
	if out, err := cmd.CombinedOutput(); err != nil {
		t.Fatalf("xz: %v: %s", err, string(out))
	}
	dest := filepath.Join(dir, "copy.img")
	if err := Download("file://"+compressed, dest); err != nil {
		t.Fatalf("Download: %v", err)
	}
	// The decompressed copy must match the original payload byte-for-byte.
	sum := sha256.Sum256([]byte("metis-xz-test"))
	if err := VerifyChecksum(dest, "sha256:"+hex.EncodeToString(sum[:])); err != nil {
		t.Fatalf("VerifyChecksum: %v", err)
	}
}
|
||||
257
pkg/image/rootfs.go
Normal file
257
pkg/image/rootfs.go
Normal file
@ -0,0 +1,257 @@
|
||||
package image
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"metis/pkg/inject"
|
||||
)
|
||||
|
||||
// partitionTable mirrors the top-level JSON object emitted by `sfdisk -J`.
type partitionTable struct {
	PartitionTable partitionTableData `json:"partitiontable"`
}

// partitionTableData carries the fields we consume from the table:
// the sector size and the ordered partition list.
type partitionTableData struct {
	SectorSize uint64               `json:"sectorsize"`
	Partitions []partitionTablePart `json:"partitions"`
}

// partitionTablePart describes one partition; Start and Size are in sectors.
type partitionTablePart struct {
	Start uint64 `json:"start"`
	Size  uint64 `json:"size"`
	Type  string `json:"type"` // MBR hex code or GPT type GUID
}
|
||||
|
||||
// InjectRootFS rewrites the Linux root partition inside a raw image file without
// requiring block-device mounts. Only rootfs-targeted files are written.
//
// Flow: locate the root partition via sfdisk, copy its bytes out to a temp
// ext4 file, write files into it with debugfs, then splice it back into the
// image. The original image is only modified in the final step.
func InjectRootFS(imagePath string, files []inject.FileSpec) error {
	// Keep only files destined for the root filesystem.
	rootFiles := make([]inject.FileSpec, 0, len(files))
	for _, f := range files {
		if f.RootFS {
			rootFiles = append(rootFiles, f)
		}
	}
	if len(rootFiles) == 0 {
		// Nothing to inject; avoid touching the image at all.
		return nil
	}

	part, sectorSize, err := findLinuxPartition(imagePath)
	if err != nil {
		return err
	}

	workDir, err := os.MkdirTemp("", "metis-rootfs-")
	if err != nil {
		return err
	}
	defer os.RemoveAll(workDir)

	rootImage := filepath.Join(workDir, "root.ext4")
	if err := extractPartition(imagePath, rootImage, part, sectorSize); err != nil {
		return err
	}
	if err := writeExt4Files(rootImage, rootFiles); err != nil {
		return err
	}
	return replacePartition(imagePath, rootImage, part, sectorSize)
}
|
||||
|
||||
// findLinuxPartition parses `sfdisk -J` output for the image and returns the
// LAST partition whose type marks a Linux filesystem, plus the sector size.
// Scanning from the end matches the p1=boot / p2=root layout of Pi images.
func findLinuxPartition(imagePath string) (partitionTablePart, uint64, error) {
	out, err := exec.Command("sfdisk", "-J", imagePath).Output()
	if err != nil {
		return partitionTablePart{}, 0, fmt.Errorf("sfdisk -J %s: %w", imagePath, err)
	}
	var table partitionTable
	if err := json.Unmarshal(out, &table); err != nil {
		return partitionTablePart{}, 0, fmt.Errorf("decode partition table: %w", err)
	}
	sectorSize := table.PartitionTable.SectorSize
	if sectorSize == 0 {
		// sfdisk can omit the field; 512-byte sectors are the safe default.
		sectorSize = 512
	}
	for i := len(table.PartitionTable.Partitions) - 1; i >= 0; i-- {
		part := table.PartitionTable.Partitions[i]
		if isLinuxPartitionType(part.Type) {
			return part, sectorSize, nil
		}
	}
	return partitionTablePart{}, 0, fmt.Errorf("no Linux root partition found in %s", imagePath)
}
|
||||
|
||||
// isLinuxPartitionType reports whether an sfdisk partition-type string
// denotes a Linux filesystem: MBR code 83, its 8300 variant, or the GPT
// "Linux filesystem data" type GUID. Comparison is case- and
// whitespace-insensitive.
func isLinuxPartitionType(partType string) bool {
	t := strings.ToLower(strings.TrimSpace(partType))
	return t == "83" ||
		t == "8300" ||
		t == "0fc63daf-8483-4772-8e79-3d69d8477de4"
}
|
||||
|
||||
// extractPartition copies one partition's raw bytes out of the disk image
// into outPath. part.Start and part.Size are sector counts from sfdisk and
// are converted to byte offsets here.
func extractPartition(imagePath, outPath string, part partitionTablePart, sectorSize uint64) error {
	sizeBytes := int64(part.Size * sectorSize)
	offsetBytes := int64(part.Start * sectorSize)

	src, err := os.Open(imagePath)
	if err != nil {
		return err
	}
	defer src.Close()
	if _, err := src.Seek(offsetBytes, io.SeekStart); err != nil {
		return err
	}

	out, err := os.Create(outPath)
	if err != nil {
		return err
	}
	defer out.Close()
	// CopyN fails if the image is shorter than the partition table claims,
	// which catches truncated downloads.
	if _, err := io.CopyN(out, src, sizeBytes); err != nil {
		return fmt.Errorf("extract root partition: %w", err)
	}
	return out.Sync()
}
|
||||
|
||||
// replacePartition writes the modified root filesystem image back over its
// original byte range inside the disk image. The filesystem file must be
// exactly the partition's size — a mismatch would corrupt neighboring
// partitions, so it is rejected up front.
func replacePartition(imagePath, rootImage string, part partitionTablePart, sectorSize uint64) error {
	expectedSize := int64(part.Size * sectorSize)
	info, err := os.Stat(rootImage)
	if err != nil {
		return err
	}
	if info.Size() != expectedSize {
		return fmt.Errorf("root partition size mismatch: expected %d got %d", expectedSize, info.Size())
	}

	in, err := os.Open(rootImage)
	if err != nil {
		return err
	}
	defer in.Close()

	// O_WRONLY without truncation: only the partition's byte range is
	// overwritten; the rest of the image is untouched.
	out, err := os.OpenFile(imagePath, os.O_WRONLY, 0)
	if err != nil {
		return err
	}
	defer out.Close()
	if _, err := out.Seek(int64(part.Start*sectorSize), io.SeekStart); err != nil {
		return err
	}
	if _, err := io.Copy(out, in); err != nil {
		return fmt.Errorf("write root partition: %w", err)
	}
	return out.Sync()
}
|
||||
|
||||
// writeExt4Files writes files into an ext4 filesystem image via debugfs,
// avoiding kernel mounts entirely. It stages the content on the host,
// generates one debugfs command script (mkdir / rm / write / sif), executes
// it in a single debugfs run, then reads every file back to verify content
// and mode — debugfs can fail silently, so verification is mandatory.
func writeExt4Files(fsPath string, files []inject.FileSpec) error {
	workDir, err := os.MkdirTemp("", "metis-ext4-")
	if err != nil {
		return err
	}
	defer os.RemoveAll(workDir)

	stageDir := filepath.Join(workDir, "stage")
	commandFile := filepath.Join(workDir, "commands.txt")

	dirs := map[string]struct{}{}
	commands := make([]string, 0, len(files)*4)

	// Stage every file on the host and collect the set of ancestor
	// directories that must exist inside the filesystem image.
	for _, f := range files {
		localPath := filepath.Join(stageDir, filepath.FromSlash(f.Path))
		if err := os.MkdirAll(filepath.Dir(localPath), 0o755); err != nil {
			return err
		}
		if err := os.WriteFile(localPath, f.Content, 0o644); err != nil {
			return err
		}
		for _, dir := range parentDirs(f.Path) {
			dirs[dir] = struct{}{}
		}
	}

	// Sort directories shallowest-first (then lexically) so each debugfs
	// mkdir has an existing parent. debugfs mkdir is not recursive.
	dirList := make([]string, 0, len(dirs))
	for dir := range dirs {
		dirList = append(dirList, dir)
	}
	sort.Slice(dirList, func(i, j int) bool {
		leftDepth := strings.Count(dirList[i], "/")
		rightDepth := strings.Count(dirList[j], "/")
		if leftDepth != rightDepth {
			return leftDepth < rightDepth
		}
		return dirList[i] < dirList[j]
	})
	for _, dir := range dirList {
		commands = append(commands, fmt.Sprintf("mkdir %s", dir))
	}

	// For each file: remove any existing entry, write the staged copy, then
	// set the raw inode mode. 0100000 is S_IFREG, OR'd with the permission
	// bits, because sif replaces the whole mode word.
	for _, f := range files {
		destPath := "/" + strings.TrimPrefix(filepath.ToSlash(f.Path), "/")
		localPath := filepath.Join(stageDir, filepath.FromSlash(f.Path))
		commands = append(commands, fmt.Sprintf("rm %s", destPath))
		commands = append(commands, fmt.Sprintf("write %s %s", localPath, destPath))
		commands = append(commands, fmt.Sprintf("sif %s mode 0%o", destPath, uint32(0o100000|f.Mode.Perm())))
	}
	if err := os.WriteFile(commandFile, []byte(strings.Join(commands, "\n")+"\n"), 0o644); err != nil {
		return err
	}

	// -w opens read-write; -f runs the whole command script non-interactively.
	cmd := exec.Command("debugfs", "-w", "-f", commandFile, fsPath)
	var combined bytes.Buffer
	cmd.Stdout = &combined
	cmd.Stderr = &combined
	if err := cmd.Run(); err != nil {
		return fmt.Errorf("debugfs write failed: %w: %s", err, combined.String())
	}

	// Read-back verification of every written file (content and mode).
	for _, f := range files {
		if err := verifyExt4File(fsPath, f, workDir); err != nil {
			return err
		}
	}
	return nil
}
|
||||
|
||||
// verifyExt4File reads one file back out of the ext4 image with debugfs and
// confirms both its permission bits and its exact content, guarding against
// debugfs runs that report success but wrote nothing.
func verifyExt4File(fsPath string, file inject.FileSpec, workDir string) error {
	destPath := "/" + strings.TrimPrefix(filepath.ToSlash(file.Path), "/")
	// `stat` output contains a "Mode: NNNN" line with the permission bits.
	statOut, err := exec.Command("debugfs", "-R", "stat "+destPath, fsPath).CombinedOutput()
	if err != nil {
		return fmt.Errorf("verify %s: %w: %s", destPath, err, string(statOut))
	}
	expectedMode := fmt.Sprintf("Mode: %04o", file.Mode.Perm())
	if !strings.Contains(string(statOut), expectedMode) {
		return fmt.Errorf("verify %s mode: expected %s in %s", destPath, expectedMode, string(statOut))
	}

	// Dump the file to a scratch path and compare bytes with the intended content.
	readback := filepath.Join(workDir, strings.TrimPrefix(filepath.FromSlash(file.Path), string(filepath.Separator))+".readback")
	if err := os.MkdirAll(filepath.Dir(readback), 0o755); err != nil {
		return err
	}
	dumpOut, err := exec.Command("debugfs", "-R", fmt.Sprintf("dump %s %s", destPath, readback), fsPath).CombinedOutput()
	if err != nil {
		return fmt.Errorf("dump %s: %w: %s", destPath, err, string(dumpOut))
	}
	got, err := os.ReadFile(readback)
	if err != nil {
		return err
	}
	if !bytes.Equal(got, file.Content) {
		return fmt.Errorf("verify %s content mismatch", destPath)
	}
	return nil
}
|
||||
|
||||
// parentDirs lists every ancestor directory of path (excluding the root "/"
// and the final path element itself), shallowest first, each with a leading
// slash. A top-level path yields nil.
func parentDirs(path string) []string {
	normalized := "/" + strings.TrimPrefix(filepath.ToSlash(path), "/")
	segments := strings.Split(normalized, "/")
	// segments[0] is the empty string before the leading slash; ancestors
	// therefore start at prefix length 2 and stop before the final element.
	var dirs []string
	for end := 2; end < len(segments); end++ {
		dirs = append(dirs, strings.Join(segments[:end], "/"))
	}
	return dirs
}
|
||||
68
pkg/image/rootfs_test.go
Normal file
68
pkg/image/rootfs_test.go
Normal file
@ -0,0 +1,68 @@
|
||||
package image
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"metis/pkg/inject"
|
||||
)
|
||||
|
||||
// TestWriteExt4Files exercises the debugfs-based injection path against a
// real 32 MiB ext4 image. Skipped when mkfs.ext4 or debugfs is unavailable.
func TestWriteExt4Files(t *testing.T) {
	if _, err := exec.LookPath("mkfs.ext4"); err != nil {
		t.Skip("mkfs.ext4 not available")
	}
	if _, err := exec.LookPath("debugfs"); err != nil {
		t.Skip("debugfs not available")
	}

	// Create a sparse 32 MiB file and format it as ext4 (-F: not a device).
	workDir := t.TempDir()
	fsPath := filepath.Join(workDir, "root.ext4")
	f, err := os.Create(fsPath)
	if err != nil {
		t.Fatal(err)
	}
	if err := f.Truncate(32 * 1024 * 1024); err != nil {
		t.Fatal(err)
	}
	if err := f.Close(); err != nil {
		t.Fatal(err)
	}

	cmd := exec.Command("mkfs.ext4", "-F", fsPath)
	if out, err := cmd.CombinedOutput(); err != nil {
		t.Fatalf("mkfs.ext4: %v: %s", err, string(out))
	}

	// One restrictive config file and one executable script cover both mode
	// paths; writeExt4Files performs its own content/mode verification.
	files := []inject.FileSpec{
		{
			Path:    "etc/metis/firstboot.env",
			Content: []byte("METIS_HOSTNAME='titan-13'\n"),
			Mode:    0o600,
			RootFS:  true,
		},
		{
			Path:    "usr/local/sbin/test.sh",
			Content: []byte("#!/usr/bin/env bash\nexit 0\n"),
			Mode:    0o755,
			RootFS:  true,
		},
	}
	if err := writeExt4Files(fsPath, files); err != nil {
		t.Fatalf("writeExt4Files: %v", err)
	}
}
|
||||
|
||||
// TestParentDirs pins the ancestor-directory expansion used to emit debugfs
// mkdir commands: shallowest first, leading slash, file itself excluded.
func TestParentDirs(t *testing.T) {
	got := parentDirs("etc/metis/firstboot.env")
	want := []string{"/etc", "/etc/metis"}
	if len(got) != len(want) {
		t.Fatalf("parentDirs length mismatch: got %v want %v", got, want)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("parentDirs[%d] = %q want %q", i, got[i], want[i])
		}
	}
}
|
||||
@ -20,10 +20,13 @@ type NodeClass struct {
|
||||
OS string `yaml:"os"`
|
||||
Image string `yaml:"image"`
|
||||
Checksum string `yaml:"checksum,omitempty"`
|
||||
K3sVersion string `yaml:"k3s_version,omitempty"`
|
||||
BootloaderNote string `yaml:"bootloader_note,omitempty"`
|
||||
DefaultLabels map[string]string `yaml:"default_labels,omitempty"`
|
||||
DefaultTaints []string `yaml:"default_taints,omitempty"`
|
||||
CloudInit string `yaml:"cloud_init,omitempty"`
|
||||
BootOverlay string `yaml:"boot_overlay,omitempty"` // path to overlay files for boot partition
|
||||
RootOverlay string `yaml:"root_overlay,omitempty"` // path to overlay files for rootfs
|
||||
}
|
||||
|
||||
// NodeSpec captures per-node overrides and identity.
|
||||
@ -34,6 +37,7 @@ type NodeSpec struct {
|
||||
IP string `yaml:"ip"`
|
||||
MAC string `yaml:"mac,omitempty"`
|
||||
K3sRole string `yaml:"k3s_role"`
|
||||
K3sVersion string `yaml:"k3s_version,omitempty"`
|
||||
K3sToken string `yaml:"k3s_token,omitempty"`
|
||||
K3sURL string `yaml:"k3s_url,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
@ -61,9 +65,58 @@ func Load(path string) (*Inventory, error) {
|
||||
if err := yaml.Unmarshal(data, &inv); err != nil {
|
||||
return nil, fmt.Errorf("parse inventory: %w", err)
|
||||
}
|
||||
expandInventory(&inv)
|
||||
return &inv, nil
|
||||
}
|
||||
|
||||
// expandInventory applies os.ExpandEnv to every string field of the loaded
// inventory, letting YAML files reference ${VAR}-style environment variables
// (image paths, k3s tokens, etc.). Maps and slices are expanded in place;
// unset variables expand to the empty string per os.ExpandEnv semantics.
func expandInventory(inv *Inventory) {
	for idx := range inv.Classes {
		inv.Classes[idx].Name = os.ExpandEnv(inv.Classes[idx].Name)
		inv.Classes[idx].Arch = os.ExpandEnv(inv.Classes[idx].Arch)
		inv.Classes[idx].OS = os.ExpandEnv(inv.Classes[idx].OS)
		inv.Classes[idx].Image = os.ExpandEnv(inv.Classes[idx].Image)
		inv.Classes[idx].Checksum = os.ExpandEnv(inv.Classes[idx].Checksum)
		inv.Classes[idx].K3sVersion = os.ExpandEnv(inv.Classes[idx].K3sVersion)
		inv.Classes[idx].BootloaderNote = os.ExpandEnv(inv.Classes[idx].BootloaderNote)
		inv.Classes[idx].CloudInit = os.ExpandEnv(inv.Classes[idx].CloudInit)
		inv.Classes[idx].BootOverlay = os.ExpandEnv(inv.Classes[idx].BootOverlay)
		inv.Classes[idx].RootOverlay = os.ExpandEnv(inv.Classes[idx].RootOverlay)
		// Map values only; label/taint keys are taken literally.
		for key, value := range inv.Classes[idx].DefaultLabels {
			inv.Classes[idx].DefaultLabels[key] = os.ExpandEnv(value)
		}
		for taintIdx, value := range inv.Classes[idx].DefaultTaints {
			inv.Classes[idx].DefaultTaints[taintIdx] = os.ExpandEnv(value)
		}
	}
	for idx := range inv.Nodes {
		inv.Nodes[idx].Name = os.ExpandEnv(inv.Nodes[idx].Name)
		inv.Nodes[idx].Class = os.ExpandEnv(inv.Nodes[idx].Class)
		inv.Nodes[idx].Hostname = os.ExpandEnv(inv.Nodes[idx].Hostname)
		inv.Nodes[idx].IP = os.ExpandEnv(inv.Nodes[idx].IP)
		inv.Nodes[idx].MAC = os.ExpandEnv(inv.Nodes[idx].MAC)
		inv.Nodes[idx].K3sRole = os.ExpandEnv(inv.Nodes[idx].K3sRole)
		inv.Nodes[idx].K3sVersion = os.ExpandEnv(inv.Nodes[idx].K3sVersion)
		inv.Nodes[idx].K3sToken = os.ExpandEnv(inv.Nodes[idx].K3sToken)
		inv.Nodes[idx].K3sURL = os.ExpandEnv(inv.Nodes[idx].K3sURL)
		inv.Nodes[idx].SSHUser = os.ExpandEnv(inv.Nodes[idx].SSHUser)
		inv.Nodes[idx].Notes = os.ExpandEnv(inv.Nodes[idx].Notes)
		for key, value := range inv.Nodes[idx].Labels {
			inv.Nodes[idx].Labels[key] = os.ExpandEnv(value)
		}
		for taintIdx, value := range inv.Nodes[idx].Taints {
			inv.Nodes[idx].Taints[taintIdx] = os.ExpandEnv(value)
		}
		for keyIdx, value := range inv.Nodes[idx].SSHAuthorized {
			inv.Nodes[idx].SSHAuthorized[keyIdx] = os.ExpandEnv(value)
		}
		for diskIdx := range inv.Nodes[idx].LonghornDisks {
			inv.Nodes[idx].LonghornDisks[diskIdx].Mountpoint = os.ExpandEnv(inv.Nodes[idx].LonghornDisks[diskIdx].Mountpoint)
			inv.Nodes[idx].LonghornDisks[diskIdx].UUID = os.ExpandEnv(inv.Nodes[idx].LonghornDisks[diskIdx].UUID)
			inv.Nodes[idx].LonghornDisks[diskIdx].FS = os.ExpandEnv(inv.Nodes[idx].LonghornDisks[diskIdx].FS)
		}
	}
}
|
||||
|
||||
// FindNode returns the node spec and class.
|
||||
func (i *Inventory) FindNode(name string) (*NodeSpec, *NodeClass, error) {
|
||||
var node *NodeSpec
|
||||
|
||||
43
pkg/inventory/types_test.go
Normal file
43
pkg/inventory/types_test.go
Normal file
@ -0,0 +1,43 @@
|
||||
package inventory
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestLoadExpandsEnvironmentVariables verifies that Load substitutes ${VAR}
// references in both class fields (image) and node fields (k3s_token).
func TestLoadExpandsEnvironmentVariables(t *testing.T) {
	t.Setenv("METIS_IMAGE_PATH", "file:///tmp/rpi4.img")
	t.Setenv("METIS_K3S_TOKEN", "secret-token")
	invPath := filepath.Join(t.TempDir(), "inventory.yaml")
	if err := os.WriteFile(invPath, []byte(`
classes:
  - name: rpi4
    image: ${METIS_IMAGE_PATH}
    k3s_version: v1.31.5+k3s1
nodes:
  - name: titan-13
    class: rpi4
    hostname: titan-13
    ip: 192.168.22.41
    k3s_role: agent
    k3s_token: ${METIS_K3S_TOKEN}
`), 0o644); err != nil {
		t.Fatal(err)
	}

	inv, err := Load(invPath)
	if err != nil {
		t.Fatalf("Load: %v", err)
	}
	node, class, err := inv.FindNode("titan-13")
	if err != nil {
		t.Fatalf("FindNode: %v", err)
	}
	// Class-level expansion.
	if class.Image != "file:///tmp/rpi4.img" {
		t.Fatalf("image not expanded: %q", class.Image)
	}
	// Node-level expansion.
	if node.K3sToken != "secret-token" {
		t.Fatalf("token not expanded: %q", node.K3sToken)
	}
}
|
||||
87
pkg/mount/mount.go
Normal file
87
pkg/mount/mount.go
Normal file
@ -0,0 +1,87 @@
|
||||
package mount
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"metis/pkg/util"
|
||||
)
|
||||
|
||||
// LoopMount describes a mounted image with boot/root paths.
type LoopMount struct {
	LoopDevice string // only set when losetup created it
	BootPath   string // temp mountpoint of partition 1 (boot)
	RootPath   string // temp mountpoint of partition 2 (root)
}
|
||||
|
||||
// Setup attaches an image as a loop device with partitions (-P) OR mounts an existing /dev path
|
||||
// by assuming p1=boot, p2=root. Intended for Linux hosts only.
|
||||
func Setup(path string) (*LoopMount, error) {
|
||||
device := path
|
||||
loopDevice := ""
|
||||
if !strings.HasPrefix(path, "/dev/") {
|
||||
var err error
|
||||
device, err = createLoop(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
loopDevice = device
|
||||
}
|
||||
bootDir, err := os.MkdirTemp("", "metis-boot-")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rootDir, err := os.MkdirTemp("", "metis-root-")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Assume p1=boot, p2=root (Raspberry Pi style images)
|
||||
if err := util.Run("mount", partitionPath(device, 1), bootDir); err != nil {
|
||||
_ = Teardown(&LoopMount{LoopDevice: loopDevice, BootPath: bootDir, RootPath: rootDir})
|
||||
return nil, fmt.Errorf("mount boot: %w", err)
|
||||
}
|
||||
if err := util.Run("mount", partitionPath(device, 2), rootDir); err != nil {
|
||||
_ = util.Run("umount", bootDir)
|
||||
_ = Teardown(&LoopMount{LoopDevice: loopDevice, BootPath: bootDir, RootPath: rootDir})
|
||||
return nil, fmt.Errorf("mount root: %w", err)
|
||||
}
|
||||
return &LoopMount{LoopDevice: loopDevice, BootPath: bootDir, RootPath: rootDir}, nil
|
||||
}
|
||||
|
||||
// Teardown unmounts and detaches the loop device.
// Every step is best-effort: unmount/removal/detach failures are ignored so
// cleanup proceeds as far as possible. Always returns nil; nil input is a no-op.
func Teardown(m *LoopMount) error {
	if m == nil {
		return nil
	}
	if m.BootPath != "" {
		_ = util.Run("umount", m.BootPath)
		_ = os.RemoveAll(m.BootPath)
	}
	if m.RootPath != "" {
		_ = util.Run("umount", m.RootPath)
		_ = os.RemoveAll(m.RootPath)
	}
	// Detach the loop device only when Setup created it.
	if m.LoopDevice != "" {
		_ = util.Run("losetup", "-d", m.LoopDevice)
	}
	return nil
}
|
||||
|
||||
// partitionPath resolves the device node for partition idx of base. It
// prefers the "pN" suffix used by loop/nvme devices ("/dev/loop0p1") and
// falls back to the bare numeric suffix used by sd-style devices ("/dev/sda1")
// when the "pN" node does not exist.
func partitionPath(base string, idx int) string {
	if withP := fmt.Sprintf("%sp%d", base, idx); fileExists(withP) {
		return withP
	}
	return fmt.Sprintf("%s%d", base, idx)
}

// fileExists reports whether a stat of path succeeds.
func fileExists(path string) bool {
	_, err := os.Stat(path)
	return err == nil
}
|
||||
|
||||
// createLoop attaches imagePath to a free loop device with partition
// scanning enabled (-P) and returns the device path that losetup prints.
func createLoop(imagePath string) (string, error) {
	// losetup -Pf --show <image>
	out, err := util.RunLogged("losetup", "-Pf", "--show", filepath.Clean(imagePath))
	if err != nil {
		return "", err
	}
	// losetup prints the device path followed by a newline.
	return strings.TrimSpace(out), nil
}
|
||||
@ -1,12 +1,15 @@
|
||||
package plan
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"metis/pkg/image"
|
||||
"metis/pkg/inventory"
|
||||
"metis/pkg/mount"
|
||||
"metis/pkg/writer"
|
||||
)
|
||||
|
||||
// Execute performs a burn if confirm is true. With confirm=false, it only downloads/verifies and returns the plan.
|
||||
@ -28,12 +31,18 @@ func Execute(inv *inventory.Inventory, nodeName, device, cacheDir string, confir
|
||||
if device == "" || device == "/dev/sdX" {
|
||||
return p, fmt.Errorf("refusing to write to placeholder device")
|
||||
}
|
||||
ddCmd := []string{"dd", fmt.Sprintf("if=%s", cacheImage), fmt.Sprintf("of=%s", device), "bs=4M", "status=progress", "conv=fsync"}
|
||||
cmd := exec.Command(ddCmd[0], ddCmd[1:]...)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := cmd.Run(); err != nil {
|
||||
return p, fmt.Errorf("dd failed: %w", err)
|
||||
ctx := context.Background()
|
||||
if err := writer.WriteImage(ctx, cacheImage, device); err != nil {
|
||||
return p, fmt.Errorf("write image: %w", err)
|
||||
}
|
||||
if err := maybeInject(inv, nodeName); err != nil {
|
||||
return p, fmt.Errorf("inject config: %w", err)
|
||||
}
|
||||
if auto := maybeAutoMount(device); auto != nil {
|
||||
defer mount.Teardown(auto)
|
||||
if err := maybeInject(inv, nodeName); err != nil {
|
||||
return p, fmt.Errorf("inject (auto-mount): %w", err)
|
||||
}
|
||||
}
|
||||
return p, nil
|
||||
}
|
||||
@ -45,3 +54,22 @@ func checksumFromInventory(inv *inventory.Inventory, node string) string {
|
||||
}
|
||||
return cls.Checksum
|
||||
}
|
||||
|
||||
// maybeAutoMount mounts the freshly written device's partitions when the
// METIS_AUTO_MOUNT env var is set, exporting the mountpoints via
// METIS_BOOT_PATH/METIS_ROOT_PATH for the subsequent injection step.
// Returns nil (no mount) when auto-mount is disabled or mounting fails.
// NOTE(review): a mount.Setup failure is swallowed here, so injection is
// silently skipped — confirm this best-effort behavior is intended.
func maybeAutoMount(device string) *mount.LoopMount {
	if os.Getenv("METIS_AUTO_MOUNT") == "" {
		return nil
	}
	// Use mount helper against the written device partitions.
	m, err := mount.Setup(device)
	if err != nil {
		return nil
	}
	// Propagate mount paths for injection.
	if m.BootPath != "" {
		_ = os.Setenv("METIS_BOOT_PATH", m.BootPath)
	}
	if m.RootPath != "" {
		_ = os.Setenv("METIS_ROOT_PATH", m.RootPath)
	}
	return m
}
|
||||
|
||||
43
pkg/plan/image_build.go
Normal file
43
pkg/plan/image_build.go
Normal file
@ -0,0 +1,43 @@
|
||||
package plan
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
|
||||
"metis/pkg/image"
|
||||
"metis/pkg/inventory"
|
||||
"metis/pkg/writer"
|
||||
)
|
||||
|
||||
// BuildImageFile materializes a fully injected raw image for a node.
// It builds the burn plan, downloads and checksum-verifies the class base
// image into cacheDir, copies it to output, then injects the node's rootfs
// files directly into the copy. ctx bounds the image copy step.
func BuildImageFile(ctx context.Context, inv *inventory.Inventory, nodeName, cacheDir, output string) error {
	p, err := Build(inv, nodeName, output, cacheDir)
	if err != nil {
		return fmt.Errorf("build plan: %w", err)
	}
	_, class, err := inv.FindNode(nodeName)
	if err != nil {
		return fmt.Errorf("load node class: %w", err)
	}

	// Download into the cache (Download also decompresses .xz sources) and
	// verify against the class checksum before using the image.
	cacheImage := filepath.Join(cacheDir, filepath.Base(p.Image))
	if err := image.Download(p.Image, cacheImage); err != nil {
		return fmt.Errorf("download image: %w", err)
	}
	if err := image.VerifyChecksum(cacheImage, class.Checksum); err != nil {
		return fmt.Errorf("verify checksum: %w", err)
	}
	if err := writer.WriteImage(ctx, cacheImage, output); err != nil {
		return fmt.Errorf("copy base image: %w", err)
	}

	// Inject node-specific rootfs files into the copied image in place.
	files, err := Files(inv, nodeName)
	if err != nil {
		return fmt.Errorf("resolve files: %w", err)
	}
	if err := image.InjectRootFS(output, files); err != nil {
		return fmt.Errorf("inject rootfs: %w", err)
	}
	return nil
}
|
||||
393
pkg/plan/inject.go
Normal file
393
pkg/plan/inject.go
Normal file
@ -0,0 +1,393 @@
|
||||
package plan
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"metis/pkg/config"
|
||||
"metis/pkg/inject"
|
||||
"metis/pkg/inventory"
|
||||
"metis/pkg/secrets"
|
||||
)
|
||||
|
||||
// maybeInject writes node-specific config into mounted boot/root paths if the env
// vars METIS_BOOT_PATH or METIS_ROOT_PATH are set. When unset, injection is skipped.
// Files targeting a partition whose mountpoint is not provided are filtered out.
func maybeInject(inv *inventory.Inventory, nodeName string) error {
	boot := os.Getenv("METIS_BOOT_PATH")
	root := os.Getenv("METIS_ROOT_PATH")
	if boot == "" && root == "" {
		// No mountpoints supplied: nothing to do.
		return nil
	}
	files, err := Files(inv, nodeName)
	if err != nil {
		return err
	}
	// Drop files whose destination partition is not mounted.
	filtered := make([]inject.FileSpec, 0, len(files))
	for _, f := range files {
		if f.RootFS && root == "" {
			continue
		}
		if !f.RootFS && boot == "" {
			continue
		}
		filtered = append(filtered, f)
	}
	if len(filtered) == 0 {
		return nil
	}
	inj := inject.Injector{BootPath: boot, RootPath: root}
	return inj.Write(filtered)
}
|
||||
|
||||
// Files resolves the full set of node-specific files, including overlays.
// Secrets fetched for the node's hostname (when available) override the k3s
// token and may add extra secret material before the files are rendered.
func Files(inv *inventory.Inventory, nodeName string) ([]inject.FileSpec, error) {
	node, class, err := inv.FindNode(nodeName)
	if err != nil {
		return nil, err
	}
	cfg, err := config.Build(inv, nodeName)
	if err != nil {
		return nil, err
	}
	sec := fetchSecrets(node.Hostname)
	if sec != nil {
		if sec.K3sToken != "" {
			// Secret store wins over the inventory-supplied token.
			cfg.K3s.Token = sec.K3sToken
		}
		if len(sec.Extra) > 0 {
			cfg.Secrets = sec.Extra
		}
	}
	files, err := buildFiles(cfg, sec)
	if err != nil {
		return nil, err
	}
	// Class-level overlay files are appended after the generated ones.
	overlayFiles, err := collectOverlays(class)
	if err != nil {
		return nil, err
	}
	files = append(files, overlayFiles...)
	_ = node // reserved for future per-node overlays
	return files, nil
}
|
||||
|
||||
// Inject writes node-specific config into caller-supplied boot/root mountpoints.
// It temporarily overrides METIS_BOOT_PATH/METIS_ROOT_PATH (the channel that
// maybeInject reads) and restores the previous values on return.
// NOTE(review): mutating process-wide env makes this unsafe for concurrent
// callers — confirm the callers are single-threaded.
func Inject(inv *inventory.Inventory, nodeName, boot, root string) error {
	oldBoot := os.Getenv("METIS_BOOT_PATH")
	oldRoot := os.Getenv("METIS_ROOT_PATH")
	// Restore the prior environment no matter how maybeInject exits.
	defer func() {
		if oldBoot == "" {
			_ = os.Unsetenv("METIS_BOOT_PATH")
		} else {
			_ = os.Setenv("METIS_BOOT_PATH", oldBoot)
		}
		if oldRoot == "" {
			_ = os.Unsetenv("METIS_ROOT_PATH")
		} else {
			_ = os.Setenv("METIS_ROOT_PATH", oldRoot)
		}
	}()
	// Empty arguments must unset (not set-to-empty) so maybeInject's
	// ""-means-absent checks behave correctly.
	if boot == "" {
		_ = os.Unsetenv("METIS_BOOT_PATH")
	} else {
		_ = os.Setenv("METIS_BOOT_PATH", boot)
	}
	if root == "" {
		_ = os.Unsetenv("METIS_ROOT_PATH")
	} else {
		_ = os.Setenv("METIS_ROOT_PATH", root)
	}
	return maybeInject(inv, nodeName)
}
|
||||
|
||||
// buildFiles renders the generated per-node files: hostname/hosts, k3s
// config, firstboot env, optional static-network configs, SSH keys, fstab
// fragment, debug dumps of the resolved config/secrets, and (when cloud-init
// user data is produced) NoCloud seed files on the boot partition.
// Secret-bearing files are written 0600; sec may be nil.
func buildFiles(cfg *config.NodeConfig, sec *secrets.NodeSecrets) ([]inject.FileSpec, error) {
	files := []inject.FileSpec{
		{Path: "etc/hostname", Content: []byte(cfg.Hostname + "\n"), Mode: 0o644, RootFS: true},
		{Path: "etc/hosts", Content: []byte(hostsContent(cfg.Hostname)), Mode: 0o644, RootFS: true},
		{Path: "etc/rancher/k3s/config.yaml", Content: []byte(k3sConfigContent(cfg)), Mode: 0o644, RootFS: true},
		{Path: "etc/metis/firstboot.env", Content: []byte(firstbootEnvContent(cfg)), Mode: 0o600, RootFS: true},
	}
	if cfg.IP != "" {
		// Emit both NetworkManager and systemd-networkd configs so either
		// network stack picks up the static address.
		files = append(files, inject.FileSpec{
			Path:    "etc/NetworkManager/system-connections/end0-static.nmconnection",
			Content: []byte(networkManagerConnectionContent(cfg.IP)),
			Mode:    0o600,
			RootFS:  true,
		})
		files = append(files, inject.FileSpec{
			Path:    "etc/systemd/network/10-end0-static.network",
			Content: []byte(systemdNetworkContent(cfg.IP)),
			Mode:    0o644,
			RootFS:  true,
		})
	}
	if len(cfg.SSHKeys) > 0 && cfg.SSHUser != "" {
		// Written both to the user's authorized_keys and to a metis-owned
		// copy under /etc/metis.
		auth := strings.Join(cfg.SSHKeys, "\n") + "\n"
		files = append(files, inject.FileSpec{
			Path:    fmt.Sprintf("home/%s/.ssh/authorized_keys", cfg.SSHUser),
			Content: []byte(auth),
			Mode:    0o600,
			RootFS:  true,
		})
		files = append(files, inject.FileSpec{
			Path:    "etc/metis/authorized_keys",
			Content: []byte(auth),
			Mode:    0o600,
			RootFS:  true,
		})
	}
	if len(cfg.Fstab) > 0 {
		files = append(files, inject.FileSpec{
			Path:    "etc/metis/fstab.append",
			Content: []byte(fstabAppendContent(cfg)),
			Mode:    0o644,
			RootFS:  true,
		})
	}

	// Store the raw config for debugging/ops.
	raw, err := json.MarshalIndent(cfg, "", " ")
	if err != nil {
		return nil, err
	}
	files = append(files, inject.FileSpec{
		Path:    "etc/metis/node.json",
		Content: raw,
		Mode:    0o644,
		RootFS:  true,
	})
	if sec != nil {
		secRaw, err := json.MarshalIndent(sec, "", " ")
		if err != nil {
			return nil, err
		}
		files = append(files, inject.FileSpec{
			Path:    "etc/metis/secrets.json",
			Content: secRaw,
			Mode:    0o600,
			RootFS:  true,
		})
	}

	// Optional cloud-init for images that honor NoCloud.
	userData := cloudInitUserData(cfg, sec)
	if userData != "" {
		// RootFS:false routes these to the boot partition, where NoCloud
		// expects user-data/meta-data.
		files = append(files, inject.FileSpec{
			Path:    "user-data",
			Content: []byte(userData),
			Mode:    0o644,
			RootFS:  false,
		})
		files = append(files, inject.FileSpec{
			Path:    "meta-data",
			Content: []byte(fmt.Sprintf("instance-id: %s\nlocal-hostname: %s\n", cfg.Hostname, cfg.Hostname)),
			Mode:    0o644,
			RootFS:  false,
		})
	}
	return files, nil
}
|
||||
|
||||
// hostsContent renders a minimal /etc/hosts for the node: the loopback entry
// plus the conventional 127.0.1.1 line mapping the node's own hostname.
func hostsContent(hostname string) string {
	const template = "127.0.0.1\tlocalhost\n127.0.1.1\t%s\n\n# Injected by metis\n"
	return fmt.Sprintf(template, hostname)
}
|
||||
|
||||
func k3sConfigContent(cfg *config.NodeConfig) string {
|
||||
var labelList []string
|
||||
for k, v := range cfg.Labels {
|
||||
labelList = append(labelList, fmt.Sprintf("%s=%s", k, v))
|
||||
}
|
||||
sort.Strings(labelList)
|
||||
taints := append([]string{}, cfg.Taints...)
|
||||
sort.Strings(taints)
|
||||
|
||||
var b bytes.Buffer
|
||||
b.WriteString("write-kubeconfig-mode: \"0644\"\n")
|
||||
if cfg.K3s.URL != "" {
|
||||
b.WriteString(fmt.Sprintf("server: %s\n", cfg.K3s.URL))
|
||||
}
|
||||
if cfg.K3s.Token != "" {
|
||||
b.WriteString(fmt.Sprintf("token: %s\n", cfg.K3s.Token))
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("node-name: %s\n", cfg.Hostname))
|
||||
if cfg.IP != "" {
|
||||
b.WriteString(fmt.Sprintf("node-ip: %s\n", cfg.IP))
|
||||
}
|
||||
if len(labelList) > 0 {
|
||||
b.WriteString("node-label:\n")
|
||||
for _, l := range labelList {
|
||||
b.WriteString(fmt.Sprintf(" - %s\n", l))
|
||||
}
|
||||
}
|
||||
if len(taints) > 0 {
|
||||
b.WriteString("node-taint:\n")
|
||||
for _, t := range taints {
|
||||
b.WriteString(fmt.Sprintf(" - %s\n", t))
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func cloudInitUserData(cfg *config.NodeConfig, sec *secrets.NodeSecrets) string {
|
||||
if cfg == nil {
|
||||
return ""
|
||||
}
|
||||
if sec != nil && sec.CloudInit != "" {
|
||||
return sec.CloudInit
|
||||
}
|
||||
var b bytes.Buffer
|
||||
b.WriteString("#cloud-config\n")
|
||||
b.WriteString(fmt.Sprintf("hostname: %s\n", cfg.Hostname))
|
||||
if len(cfg.SSHKeys) > 0 {
|
||||
b.WriteString("ssh_authorized_keys:\n")
|
||||
for _, k := range cfg.SSHKeys {
|
||||
b.WriteString(fmt.Sprintf(" - %s\n", k))
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func firstbootEnvContent(cfg *config.NodeConfig) string {
|
||||
var b bytes.Buffer
|
||||
b.WriteString(fmt.Sprintf("METIS_HOSTNAME=%s\n", shellQuote(cfg.Hostname)))
|
||||
b.WriteString(fmt.Sprintf("METIS_SSH_USER=%s\n", shellQuote(cfg.SSHUser)))
|
||||
b.WriteString(fmt.Sprintf("METIS_K3S_VERSION=%s\n", shellQuote(cfg.K3s.Version)))
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func networkManagerConnectionContent(ip string) string {
|
||||
gateway := ip
|
||||
if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 {
|
||||
gateway = gateway[:lastDot+1] + "1"
|
||||
}
|
||||
return fmt.Sprintf(`[connection]
|
||||
id=end0-static
|
||||
type=ethernet
|
||||
interface-name=end0
|
||||
autoconnect=true
|
||||
autoconnect-priority=100
|
||||
|
||||
[ethernet]
|
||||
|
||||
[ipv4]
|
||||
method=manual
|
||||
address1=%s/24,%s
|
||||
dns=%s;
|
||||
dns-search=titan;
|
||||
may-fail=false
|
||||
|
||||
[ipv6]
|
||||
method=ignore
|
||||
|
||||
[proxy]
|
||||
`, ip, gateway, gateway)
|
||||
}
|
||||
|
||||
func systemdNetworkContent(ip string) string {
|
||||
gateway := ip
|
||||
if lastDot := strings.LastIndex(gateway, "."); lastDot >= 0 {
|
||||
gateway = gateway[:lastDot+1] + "1"
|
||||
}
|
||||
return fmt.Sprintf(`[Match]
|
||||
Name=end0
|
||||
|
||||
[Network]
|
||||
Address=%s/24
|
||||
Gateway=%s
|
||||
DNS=%s
|
||||
Domains=titan
|
||||
DHCP=no
|
||||
IPv6AcceptRA=no
|
||||
LinkLocalAddressing=no
|
||||
`, ip, gateway, gateway)
|
||||
}
|
||||
|
||||
func fstabAppendContent(cfg *config.NodeConfig) string {
|
||||
var lines []string
|
||||
for _, entry := range cfg.Fstab {
|
||||
lines = append(lines, fmt.Sprintf(
|
||||
"UUID=%s %s %s %s 0 0",
|
||||
entry.UUID,
|
||||
entry.Mountpoint,
|
||||
entry.FS,
|
||||
entry.Options,
|
||||
))
|
||||
}
|
||||
sort.Strings(lines)
|
||||
return strings.Join(lines, "\n") + "\n"
|
||||
}
|
||||
|
||||
// shellQuote wraps value in single quotes for safe inclusion in a
// shell-sourced file; embedded single quotes use the standard '"'"' escape.
// The empty string renders as '' so the variable is still defined.
func shellQuote(value string) string {
	if len(value) == 0 {
		return "''"
	}
	var quoted strings.Builder
	quoted.WriteByte('\'')
	quoted.WriteString(strings.ReplaceAll(value, "'", `'"'"'`))
	quoted.WriteByte('\'')
	return quoted.String()
}
|
||||
|
||||
// fetchSecrets best-effort loads per-node secrets from Vault. It returns nil
// (meaning "no secrets") when Vault is not configured or the fetch fails, so
// image builds proceed without secret material rather than aborting.
// NOTE(review): the fetch error is swallowed; consider logging it so a silent
// Vault misconfiguration remains visible.
func fetchSecrets(hostname string) *secrets.NodeSecrets {
	// VAULT_ADDR unset => Vault integration disabled.
	if os.Getenv("VAULT_ADDR") == "" {
		return nil
	}
	cli := secrets.NewFromEnv()
	sec, err := cli.FetchNode(context.Background(), hostname)
	if err != nil {
		// Best-effort: treat an unreachable/unauthorized Vault as "no secrets".
		return nil
	}
	return sec
}
|
||||
|
||||
func collectOverlays(class *inventory.NodeClass) ([]inject.FileSpec, error) {
|
||||
var files []inject.FileSpec
|
||||
if class == nil {
|
||||
return files, nil
|
||||
}
|
||||
if class.BootOverlay != "" {
|
||||
more, err := overlayFiles(class.BootOverlay, false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
files = append(files, more...)
|
||||
}
|
||||
if class.RootOverlay != "" {
|
||||
more, err := overlayFiles(class.RootOverlay, true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
files = append(files, more...)
|
||||
}
|
||||
return files, nil
|
||||
}
|
||||
|
||||
func overlayFiles(dir string, rootfs bool) ([]inject.FileSpec, error) {
|
||||
var specs []inject.FileSpec
|
||||
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
rel, err := filepath.Rel(dir, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
content, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
specs = append(specs, inject.FileSpec{
|
||||
Path: rel,
|
||||
Content: content,
|
||||
Mode: info.Mode(),
|
||||
RootFS: rootfs,
|
||||
})
|
||||
return nil
|
||||
})
|
||||
return specs, err
|
||||
}
|
||||
125
pkg/plan/inject_test.go
Normal file
125
pkg/plan/inject_test.go
Normal file
@ -0,0 +1,125 @@
|
||||
package plan
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"metis/pkg/config"
|
||||
"metis/pkg/inventory"
|
||||
"metis/pkg/secrets"
|
||||
)
|
||||
|
||||
// TestBuildFilesProducesK3sConfig exercises buildFiles end-to-end for a fully
// populated node config (no secrets) and checks the key generated files:
// k3s config, hostname, authorized_keys, firstboot env, both network configs,
// and the fstab append fragment.
func TestBuildFilesProducesK3sConfig(t *testing.T) {
	cfg := &config.NodeConfig{
		Hostname: "n1",
		IP:       "10.0.0.10",
		SSHUser:  "pi",
		SSHKeys:  []string{"ssh-rsa AAA"},
		Fstab: []config.FstabEntry{
			{
				UUID:       "disk-uuid",
				Mountpoint: "/mnt/astreae",
				FS:         "ext4",
				Options:    "defaults,nofail",
			},
		},
		Labels: map[string]string{"role": "worker", "zone": "a"},
		Taints: []string{"gpu=true:NoSchedule"},
		K3s: config.K3sConfig{
			URL:     "https://server:6443",
			Token:   "secret",
			Version: "v1.31.5+k3s1",
		},
	}
	files, err := buildFiles(cfg, nil)
	if err != nil {
		t.Fatalf("buildFiles: %v", err)
	}
	// Index the generated specs by path for easy lookup.
	pathMap := map[string]string{}
	for _, f := range files {
		pathMap[f.Path] = string(f.Content)
	}
	k3s, ok := pathMap["etc/rancher/k3s/config.yaml"]
	if !ok {
		t.Fatalf("missing k3s config")
	}
	if !strings.Contains(k3s, "server: https://server:6443") || !strings.Contains(k3s, "node-name: n1") {
		t.Fatalf("unexpected k3s config: %s", k3s)
	}
	hostFile, ok := pathMap["etc/hostname"]
	if !ok || strings.TrimSpace(hostFile) != "n1" {
		t.Fatalf("hostname file missing/incorrect: %q", hostFile)
	}
	auth, ok := pathMap["home/pi/.ssh/authorized_keys"]
	if !ok || !strings.Contains(auth, "ssh-rsa AAA") {
		t.Fatalf("authorized_keys missing/incorrect: %s", auth)
	}
	firstboot, ok := pathMap["etc/metis/firstboot.env"]
	if !ok || !strings.Contains(firstboot, "METIS_K3S_VERSION='v1.31.5+k3s1'") {
		t.Fatalf("firstboot env missing/incorrect: %s", firstboot)
	}
	network, ok := pathMap["etc/NetworkManager/system-connections/end0-static.nmconnection"]
	if !ok || !strings.Contains(network, "address1=10.0.0.10/24,10.0.0.1") {
		t.Fatalf("networkmanager config missing/incorrect: %s", network)
	}
	networkd, ok := pathMap["etc/systemd/network/10-end0-static.network"]
	if !ok || !strings.Contains(networkd, "Address=10.0.0.10/24") || !strings.Contains(networkd, "Gateway=10.0.0.1") {
		t.Fatalf("systemd-networkd config missing/incorrect: %s", networkd)
	}
	fstab, ok := pathMap["etc/metis/fstab.append"]
	if !ok || !strings.Contains(fstab, "UUID=disk-uuid /mnt/astreae ext4 defaults,nofail 0 0") {
		t.Fatalf("fstab append missing/incorrect: %s", fstab)
	}
}
|
||||
|
||||
// TestOverlayFiles builds one boot and one root overlay directory with a
// single file each and verifies collectOverlays yields exactly two specs.
func TestOverlayFiles(t *testing.T) {
	dir := t.TempDir()
	bootDir := filepath.Join(dir, "boot")
	rootDir := filepath.Join(dir, "root")
	if err := os.MkdirAll(filepath.Join(bootDir, "over"), 0o755); err != nil {
		t.Fatal(err)
	}
	if err := os.MkdirAll(filepath.Join(rootDir, "etc"), 0o755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(bootDir, "over", "cmdline.txt"), []byte("console=tty1"), 0o644); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(rootDir, "etc", "issue"), []byte("hello"), 0o644); err != nil {
		t.Fatal(err)
	}
	class := &inventory.NodeClass{
		BootOverlay: bootDir,
		RootOverlay: rootDir,
	}
	files, err := collectOverlays(class)
	if err != nil {
		t.Fatalf("collectOverlays: %v", err)
	}
	// Directories are skipped during the walk, so only the two files count.
	if len(files) != 2 {
		t.Fatalf("expected 2 files, got %d", len(files))
	}
}
|
||||
|
||||
// TestSecretsWrite verifies that buildFiles emits etc/metis/secrets.json on
// the root filesystem when node secrets are supplied.
func TestSecretsWrite(t *testing.T) {
	cfg := &config.NodeConfig{
		Hostname: "n1",
		IP:       "10.0.0.1",
	}
	sec := &secrets.NodeSecrets{K3sToken: "tok", SSHPassword: "pw", Extra: map[string]string{"foo": "bar"}}
	files, err := buildFiles(cfg, sec)
	if err != nil {
		t.Fatalf("buildFiles: %v", err)
	}
	found := false
	for _, f := range files {
		if f.Path == "etc/metis/secrets.json" && f.RootFS {
			found = true
		}
	}
	if !found {
		t.Fatalf("secrets file not written")
	}
}
|
||||
@ -2,6 +2,7 @@ package plan
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
@ -41,7 +42,15 @@ func Build(inv *inventory.Inventory, nodeName, device, cacheDir string) (*Plan,
|
||||
actions = append(actions, Action{Type: "verify", Detail: fmt.Sprintf("Verify checksum %s", class.Checksum)})
|
||||
}
|
||||
actions = append(actions, Action{Type: "write", Detail: fmt.Sprintf("Write image to %s", device), Command: fmt.Sprintf("dd if=%s of=%s bs=4M status=progress conv=fsync", cacheImage, device)})
|
||||
actions = append(actions, Action{Type: "inject", Detail: "Inject hostname/network/k3s config into boot or rootfs"})
|
||||
if boot := os.Getenv("METIS_BOOT_PATH"); boot != "" {
|
||||
actions = append(actions, Action{Type: "inject", Detail: fmt.Sprintf("Inject config into boot mount %s", boot)})
|
||||
}
|
||||
if root := os.Getenv("METIS_ROOT_PATH"); root != "" {
|
||||
actions = append(actions, Action{Type: "inject", Detail: fmt.Sprintf("Inject config into root mount %s", root)})
|
||||
}
|
||||
if os.Getenv("METIS_BOOT_PATH") == "" && os.Getenv("METIS_ROOT_PATH") == "" {
|
||||
actions = append(actions, Action{Type: "inject", Detail: "Inject hostname/network/k3s config (requires mounted boot/root; skipped if unset)"})
|
||||
}
|
||||
actions = append(actions, Action{Type: "finalize", Detail: fmt.Sprintf("Ready to insert SD for %s", node.Hostname)})
|
||||
|
||||
return &Plan{
|
||||
|
||||
39
pkg/plan/plan_env_test.go
Normal file
39
pkg/plan/plan_env_test.go
Normal file
@ -0,0 +1,39 @@
|
||||
package plan
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"metis/pkg/inventory"
|
||||
)
|
||||
|
||||
func TestBuildIncludesInjectWhenEnvSet(t *testing.T) {
|
||||
defer os.Unsetenv("METIS_BOOT_PATH")
|
||||
os.Setenv("METIS_BOOT_PATH", "/mnt/boot")
|
||||
inv := &inventory.Inventory{
|
||||
Classes: []inventory.NodeClass{{
|
||||
Name: "c1",
|
||||
Image: "file:///tmp/dummy",
|
||||
}},
|
||||
Nodes: []inventory.NodeSpec{{
|
||||
Name: "n1",
|
||||
Class: "c1",
|
||||
Hostname: "n1",
|
||||
IP: "10.0.0.1",
|
||||
K3sRole: "agent",
|
||||
}},
|
||||
}
|
||||
p, err := Build(inv, "n1", "/dev/sdz", "/tmp/cache")
|
||||
if err != nil {
|
||||
t.Fatalf("build: %v", err)
|
||||
}
|
||||
found := false
|
||||
for _, a := range p.Actions {
|
||||
if a.Type == "inject" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("expected inject action when METIS_BOOT_PATH set")
|
||||
}
|
||||
}
|
||||
125
pkg/secrets/vault.go
Normal file
125
pkg/secrets/vault.go
Normal file
@ -0,0 +1,125 @@
|
||||
package secrets
|
||||
|
||||
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"
)
|
||||
|
||||
// NodeSecrets holds per-node secret material to inject at burn time.
// These should live in Vault at secret/data/nodes/<hostname>.
type NodeSecrets struct {
	SSHPassword string            `json:"ssh_password,omitempty"` // initial login password
	K3sToken    string            `json:"k3s_token,omitempty"`    // cluster join token
	CloudInit   string            `json:"cloud_init,omitempty"`   // pre-rendered user-data; overrides the generated #cloud-config
	Extra       map[string]string `json:"extra,omitempty"`        // free-form additional values
}
|
||||
|
||||
// Client fetches node secrets from Vault using either a token or AppRole.
type Client struct {
	Addr     string // Vault base URL (VAULT_ADDR)
	Token    string // static token; populated by AppRole login when empty
	RoleID   string // AppRole credentials, used only when Token is empty
	SecretID string
	Client   *http.Client // optional; http.DefaultClient when nil
}
|
||||
|
||||
// NewFromEnv builds a client from VAULT_ADDR, VAULT_TOKEN, VAULT_ROLE_ID, VAULT_SECRET_ID.
|
||||
func NewFromEnv() *Client {
|
||||
return &Client{
|
||||
Addr: os.Getenv("VAULT_ADDR"),
|
||||
Token: os.Getenv("VAULT_TOKEN"),
|
||||
RoleID: os.Getenv("VAULT_ROLE_ID"),
|
||||
SecretID: os.Getenv("VAULT_SECRET_ID"),
|
||||
Client: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// LoginIfNeeded performs AppRole login if no token is present.
// It is a no-op when a token already exists or the AppRole credentials are
// incomplete; on success the issued client token is stored on c.Token.
func (c *Client) LoginIfNeeded(ctx context.Context) error {
	if c.Token != "" || c.RoleID == "" || c.SecretID == "" {
		return nil
	}
	body := map[string]string{"role_id": c.RoleID, "secret_id": c.SecretID}
	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(body); err != nil {
		return err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/v1/auth/approle/login", strings.TrimSuffix(c.Addr, "/")), &buf)
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	resp, err := c.httpClient().Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("approle login failed: %s", resp.Status)
	}
	// Only the client token is needed from the login response envelope.
	var r struct {
		Auth struct {
			ClientToken string `json:"client_token"`
		} `json:"auth"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
		return err
	}
	if r.Auth.ClientToken == "" {
		return fmt.Errorf("approle login returned empty token")
	}
	c.Token = r.Auth.ClientToken
	return nil
}
|
||||
|
||||
// FetchNode pulls secret/data/nodes/<hostname>.
|
||||
func (c *Client) FetchNode(ctx context.Context, hostname string) (*NodeSecrets, error) {
|
||||
if err := c.LoginIfNeeded(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
url := fmt.Sprintf("%s/v1/secret/data/nodes/%s", strings.TrimSuffix(c.Addr, "/"), hostname)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if c.Token != "" {
|
||||
req.Header.Set("X-Vault-Token", c.Token)
|
||||
}
|
||||
resp, err := c.httpClient().Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return &NodeSecrets{}, nil
|
||||
}
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("vault fetch %s: %s: %s", hostname, resp.Status, string(b))
|
||||
}
|
||||
var r struct {
|
||||
Data struct {
|
||||
Data NodeSecrets `json:"data"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &r.Data.Data, nil
|
||||
}
|
||||
|
||||
func (c *Client) httpClient() *http.Client {
|
||||
if c.Client != nil {
|
||||
return c.Client
|
||||
}
|
||||
return http.DefaultClient
|
||||
}
|
||||
76
pkg/secrets/vault_test.go
Normal file
76
pkg/secrets/vault_test.go
Normal file
@ -0,0 +1,76 @@
|
||||
package secrets
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestFetchNodeReturnsData verifies a KV v2 read is unwrapped from the nested
// data.data envelope into the NodeSecrets fields.
func TestFetchNodeReturnsData(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/v1/secret/data/nodes/n1":
			w.Header().Set("Content-Type", "application/json")
			_ = json.NewEncoder(w).Encode(map[string]any{
				"data": map[string]any{
					"data": map[string]any{
						"ssh_password": "p1",
						"k3s_token":    "t1",
						"cloud_init":   "ci",
					},
				},
			})
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()

	// Static token path: no AppRole login should be required.
	c := &Client{Addr: srv.URL, Token: "tok"}
	sec, err := c.FetchNode(context.Background(), "n1")
	if err != nil {
		t.Fatalf("fetch: %v", err)
	}
	if sec.SSHPassword != "p1" || sec.K3sToken != "t1" || sec.CloudInit != "ci" {
		t.Fatalf("unexpected secrets: %+v", sec)
	}
}
|
||||
|
||||
func TestApproRoleLogin(t *testing.T) {
|
||||
loginCalled := false
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch r.URL.Path {
|
||||
case "/v1/auth/approle/login":
|
||||
loginCalled = true
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"auth": map[string]any{
|
||||
"client_token": "newtoken",
|
||||
},
|
||||
})
|
||||
case "/v1/secret/data/nodes/n1":
|
||||
if r.Header.Get("X-Vault-Token") != "newtoken" {
|
||||
t.Fatalf("missing token after approle login")
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"data": map[string]any{
|
||||
"data": map[string]any{},
|
||||
},
|
||||
})
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
c := &Client{Addr: srv.URL, RoleID: "r", SecretID: "s", Client: srv.Client()}
|
||||
if _, err := c.FetchNode(context.Background(), "n1"); err != nil {
|
||||
t.Fatalf("fetch with approle: %v", err)
|
||||
}
|
||||
if !loginCalled {
|
||||
t.Fatalf("approle login not called")
|
||||
}
|
||||
}
|
||||
88
pkg/sentinel/collector.go
Normal file
88
pkg/sentinel/collector.go
Normal file
@ -0,0 +1,88 @@
|
||||
package sentinel
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Snapshot captures host-level facts.
// All fields are optional; collectors leave a field empty when the
// corresponding probe fails (see Collect).
type Snapshot struct {
	Hostname      string            `json:"hostname,omitempty"`
	Kernel        string            `json:"kernel,omitempty"`     // uname -r
	OSImage       string            `json:"os_image,omitempty"`   // PRETTY_NAME from /etc/os-release
	K3sVersion    string            `json:"k3s_version,omitempty"`
	Containerd    string            `json:"containerd,omitempty"`
	PackageSample map[string]string `json:"package_sample,omitempty"` // small subset to detect drift
	DropInsSample map[string]string `json:"dropins_sample,omitempty"` // path->content hash/sample
	Notes         string            `json:"notes,omitempty"`
}
|
||||
|
||||
// Collect gathers a minimal set of facts; intended to run inside a DaemonSet pod with host mounts.
// Every probe is best-effort: a missing binary or failed command leaves the
// corresponding field empty. DropInsSample and Notes are not populated here.
func Collect() *Snapshot {
	return &Snapshot{
		Hostname:      runAndTrim("hostname"),
		Kernel:        runAndTrim("uname", "-r"),
		OSImage:       osRelease(),
		K3sVersion:    runAndTrim("k3s", "version"),
		Containerd:    runAndTrim("containerd", "--version"),
		PackageSample: pkgSample(),
	}
}
|
||||
|
||||
func runAndTrim(cmd string, args ...string) string {
|
||||
out, err := commandOutput(cmd, args...)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func osRelease() string {
|
||||
out, err := commandOutput("cat", "/etc/os-release")
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.HasPrefix(line, "PRETTY_NAME=") {
|
||||
return strings.Trim(line[len("PRETTY_NAME="):], "\"")
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// pkgSample grabs a tiny subset of package versions to detect drift without collecting everything.
|
||||
func pkgSample() map[string]string {
|
||||
names := []string{"containerd", "k3s", "nvidia-container-toolkit", "linux-image-raspi"}
|
||||
result := map[string]string{}
|
||||
for _, n := range names {
|
||||
v := pkgVersion(n)
|
||||
if v != "" {
|
||||
result[n] = v
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func pkgVersion(name string) string {
|
||||
// Try dpkg-query first.
|
||||
out, err := commandOutput("dpkg-query", "-W", "-f", "${Version}", name)
|
||||
if err == nil && len(out) > 0 {
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
// Fallback rpm.
|
||||
out, err = commandOutput("rpm", "-q", "--qf", "%{VERSION}-%{RELEASE}", name)
|
||||
if err == nil && len(out) > 0 {
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// commandOutput runs cmd with args and returns its stdout. When
// METIS_SENTINEL_NSENTER=1, the command is executed inside PID 1's
// mount/uts/net/ipc/pid namespaces via nsenter so a containerized sentinel
// observes the host rather than its own pod filesystem.
// NOTE(review): nsenter -t 1 presumably requires hostPID and a privileged
// security context on the DaemonSet — confirm the manifest grants these.
func commandOutput(cmd string, args ...string) ([]byte, error) {
	if os.Getenv("METIS_SENTINEL_NSENTER") == "1" {
		nsenterArgs := []string{"-t", "1", "-m", "-u", "-n", "-i", "-p", "--", cmd}
		nsenterArgs = append(nsenterArgs, args...)
		return exec.Command("nsenter", nsenterArgs...).Output()
	}
	return exec.Command(cmd, args...).Output()
}
|
||||
795
pkg/service/app.go
Normal file
795
pkg/service/app.go
Normal file
@ -0,0 +1,795 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"metis/pkg/facts"
|
||||
"metis/pkg/image"
|
||||
"metis/pkg/inventory"
|
||||
"metis/pkg/plan"
|
||||
"metis/pkg/sentinel"
|
||||
"metis/pkg/writer"
|
||||
)
|
||||
|
||||
// JobStatus is the lifecycle state of a background job.
type JobStatus string

const (
	JobQueued  JobStatus = "queued"  // accepted, worker not yet started
	JobRunning JobStatus = "running" // worker goroutine in progress
	JobDone    JobStatus = "done"    // finished successfully
	JobError   JobStatus = "error"   // finished with failure
)
|
||||
|
||||
// Device describes a flashable block device.
type Device struct {
	Name      string `json:"name"`                // short device name
	Path      string `json:"path"`                // full device path, e.g. /dev/...
	Model     string `json:"model,omitempty"`     // hardware model string, if reported
	Transport string `json:"transport,omitempty"` // bus type, if reported
	Type      string `json:"type,omitempty"`      // block device type, if reported
	Removable bool   `json:"removable"`
	Hotplug   bool   `json:"hotplug"`
	SizeBytes int64  `json:"size_bytes"`
}
|
||||
|
||||
// Job is a long-running Metis action visible in the UI.
type Job struct {
	ID          string    `json:"id"`
	Kind        string    `json:"kind"` // e.g. "build" or "replace"
	Node        string    `json:"node,omitempty"`
	Host        string    `json:"host,omitempty"`   // flash host, for replace jobs
	Device      string    `json:"device,omitempty"` // target block device path
	Status      JobStatus `json:"status"`
	Stage       string    `json:"stage,omitempty"`
	Message     string    `json:"message,omitempty"`
	Artifact    string    `json:"artifact,omitempty"`
	ProgressPct float64   `json:"progress_pct"`
	Written     int64     `json:"written_bytes,omitempty"`
	Total       int64     `json:"total_bytes,omitempty"`
	Error       string    `json:"error,omitempty"` // set on failure
	StartedAt   time.Time `json:"started_at"`
	FinishedAt  time.Time `json:"finished_at,omitempty"`
}
|
||||
|
||||
// Event is a user-facing activity item for recent changes and runs.
type Event struct {
	Time    time.Time      `json:"time"`
	Kind    string         `json:"kind"`    // dotted category, e.g. "sentinel.snapshot"
	Summary string         `json:"summary"` // one-line human-readable description
	Details map[string]any `json:"details,omitempty"`
}
|
||||
|
||||
// SnapshotRecord stores the last fact snapshot pushed by a node sentinel.
type SnapshotRecord struct {
	Node        string            `json:"node"`         // defaults to the snapshot's hostname
	CollectedAt time.Time         `json:"collected_at"` // defaults to now (UTC) when zero
	Snapshot    sentinel.Snapshot `json:"snapshot"`
}
|
||||
|
||||
// PageState is the UI/API view model. It is a point-in-time copy: slices and
// maps are detached from App's internal state, so callers need no locking.
type PageState struct {
	LocalHost        string                     `json:"local_host"`
	DefaultFlashHost string                     `json:"default_flash_host"`
	FlashHosts       []string                   `json:"flash_hosts"`
	Nodes            []inventory.NodeSpec       `json:"nodes"`
	Jobs             []*Job                     `json:"jobs"` // newest first
	Devices          []Device                   `json:"devices"`
	Events           []Event                    `json:"events"`
	Snapshots        []SnapshotRecord           `json:"snapshots"` // sorted by node name
	Targets          map[string]facts.Targets   `json:"targets"`
	Artifacts        map[string]ArtifactSummary `json:"artifacts"`
}
|
||||
|
||||
// ArtifactSummary describes the latest built image for a node.
type ArtifactSummary struct {
	Path      string    `json:"path"` // filesystem path of the image
	UpdatedAt time.Time `json:"updated_at"`
	SizeBytes int64     `json:"size_bytes"`
}
|
||||
|
||||
// App coordinates builds, flashes, sentinel snapshots, and the web UI state.
type App struct {
	settings  Settings
	inventory *inventory.Inventory
	metrics   *Metrics

	mu        sync.RWMutex // guards jobs, snapshots, and targets below
	jobs      map[string]*Job
	snapshots map[string]SnapshotRecord // keyed by node name
	targets   map[string]facts.Targets
}
|
||||
|
||||
// NewApp creates a Metis service app instance: it ensures the
// cache/artifact/history directories exist, loads the inventory, and restores
// persisted snapshots and targets (best-effort — restore errors are ignored
// so a corrupt history does not prevent startup).
func NewApp(settings Settings) (*App, error) {
	if err := os.MkdirAll(settings.CacheDir, 0o755); err != nil {
		return nil, err
	}
	if err := os.MkdirAll(settings.ArtifactDir, 0o755); err != nil {
		return nil, err
	}
	if err := os.MkdirAll(filepath.Dir(settings.HistoryPath), 0o755); err != nil {
		return nil, err
	}
	inv, err := inventory.Load(settings.InventoryPath)
	if err != nil {
		return nil, err
	}
	app := &App{
		settings:  settings,
		inventory: inv,
		metrics:   NewMetrics(),
		jobs:      map[string]*Job{},
		snapshots: map[string]SnapshotRecord{},
		targets:   map[string]facts.Targets{},
	}
	// Best-effort restore of persisted state.
	_ = app.loadSnapshots()
	_ = app.loadTargets()
	return app, nil
}
|
||||
|
||||
// State returns the current UI/API snapshot.
|
||||
func (a *App) State(deviceHost string) PageState {
|
||||
a.mu.RLock()
|
||||
jobs := make([]*Job, 0, len(a.jobs))
|
||||
for _, job := range a.jobs {
|
||||
copyJob := *job
|
||||
jobs = append(jobs, ©Job)
|
||||
}
|
||||
sort.Slice(jobs, func(i, j int) bool {
|
||||
return jobs[i].StartedAt.After(jobs[j].StartedAt)
|
||||
})
|
||||
|
||||
snaps := make([]SnapshotRecord, 0, len(a.snapshots))
|
||||
for _, snap := range a.snapshots {
|
||||
snaps = append(snaps, snap)
|
||||
}
|
||||
aTargets := map[string]facts.Targets{}
|
||||
for key, value := range a.targets {
|
||||
aTargets[key] = value
|
||||
}
|
||||
a.mu.RUnlock()
|
||||
|
||||
sort.Slice(snaps, func(i, j int) bool {
|
||||
return snaps[i].Node < snaps[j].Node
|
||||
})
|
||||
|
||||
devices, _ := a.ListDevices(deviceHost)
|
||||
return PageState{
|
||||
LocalHost: a.settings.LocalHost,
|
||||
DefaultFlashHost: a.settings.DefaultFlashHost,
|
||||
FlashHosts: append([]string{}, a.settings.FlashHosts...),
|
||||
Nodes: append([]inventory.NodeSpec{}, a.inventory.Nodes...),
|
||||
Jobs: jobs,
|
||||
Devices: devices,
|
||||
Events: a.recentEvents(40),
|
||||
Snapshots: snaps,
|
||||
Targets: aTargets,
|
||||
Artifacts: a.artifacts(),
|
||||
}
|
||||
}
|
||||
|
||||
// Build starts a background image build for a node. The node must exist in
// the inventory; progress is observable through the returned Job via State.
func (a *App) Build(node string) (*Job, error) {
	if _, _, err := a.inventory.FindNode(node); err != nil {
		return nil, err
	}
	job := a.newJob("build", node, "", "")
	// false: build only, no flash step afterwards.
	go a.runBuild(job, false)
	return job, nil
}
|
||||
|
||||
// Replace starts a background build+flash workflow for a node. host defaults
// to the configured flash host and must be one this instance serves; node and
// device are validated before the job is queued.
func (a *App) Replace(node, host, device string) (*Job, error) {
	if host == "" {
		host = a.settings.DefaultFlashHost
	}
	if host != a.settings.LocalHost && host != a.settings.DefaultFlashHost {
		return nil, fmt.Errorf("flash host %s is not available on this Metis instance", host)
	}
	if _, _, err := a.inventory.FindNode(node); err != nil {
		return nil, err
	}
	// Reject unknown/unsafe devices before any work starts.
	if _, err := a.ensureDevice(device); err != nil {
		return nil, err
	}
	job := a.newJob("replace", node, host, device)
	// true: flash the freshly built image after the build completes.
	go a.runBuild(job, true)
	return job, nil
}
|
||||
|
||||
// StoreSnapshot records a pushed sentinel snapshot: it fills defaults
// (node name from the snapshot hostname, collection time now-UTC), persists
// all snapshots to disk, updates metrics, and appends a UI event.
func (a *App) StoreSnapshot(record SnapshotRecord) error {
	if record.Node == "" {
		record.Node = record.Snapshot.Hostname
	}
	if record.CollectedAt.IsZero() {
		record.CollectedAt = time.Now().UTC()
	}
	// Without a node name the record is unaddressable.
	if strings.TrimSpace(record.Node) == "" {
		return fmt.Errorf("snapshot node required")
	}
	a.mu.Lock()
	a.snapshots[record.Node] = record
	a.mu.Unlock()
	if err := a.persistSnapshots(); err != nil {
		return err
	}
	a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt)
	a.appendEvent(Event{
		Time:    record.CollectedAt,
		Kind:    "sentinel.snapshot",
		Summary: fmt.Sprintf("Captured sentinel snapshot for %s", record.Node),
		Details: map[string]any{
			"node":        record.Node,
			"kernel":      record.Snapshot.Kernel,
			"k3s_version": record.Snapshot.K3sVersion,
		},
	})
	return nil
}
|
||||
|
||||
// WatchSentinel recomputes class targets and logs meaningful drift.
// It converts stored sentinel snapshots into facts, derives recommended
// targets, swaps them in, persists them, and emits an event describing how
// many template changes (if any) were detected.
func (a *App) WatchSentinel() (*Event, error) {
	a.mu.RLock()
	snaps := make([]facts.Snapshot, 0, len(a.snapshots))
	for _, snap := range a.snapshots {
		snaps = append(snaps, facts.Snapshot{
			Hostname: snap.Node,
			Kernel:   snap.Snapshot.Kernel,
			OSImage:  snap.Snapshot.OSImage,
			// Version commands may emit multiple lines; keep only the first.
			K3sVersion:    firstLine(snap.Snapshot.K3sVersion),
			Containerd:    firstLine(snap.Snapshot.Containerd),
			PackageSample: snap.Snapshot.PackageSample,
			DropInsSample: snap.Snapshot.DropInsSample,
		})
	}
	prevTargets := map[string]facts.Targets{}
	for key, value := range a.targets {
		prevTargets[key] = value
	}
	a.mu.RUnlock()

	// Recommend and diff outside the lock; only the swap is locked.
	nextTargets := facts.RecommendTargets(a.inventory, snaps)
	changes := diffTargets(prevTargets, nextTargets)

	a.mu.Lock()
	a.targets = nextTargets
	a.mu.Unlock()
	if err := a.persistTargets(); err != nil {
		return nil, err
	}

	event := &Event{
		Time:    time.Now().UTC(),
		Kind:    "sentinel.watch",
		Summary: "Metis sentinel watch completed with no template changes",
		Details: map[string]any{
			"classes": len(nextTargets),
			"changes": 0,
		},
	}
	if len(changes) > 0 {
		event.Summary = fmt.Sprintf("Metis sentinel watch detected %d template change(s)", len(changes))
		event.Details["changes"] = changes
	}
	a.appendEvent(*event)
	a.metrics.RecordWatch("ok")
	a.metrics.SetDriftTargets(nextTargets, len(changes))
	return event, nil
}
|
||||
|
||||
// ListDevices returns locally attached removable media that are safe candidates for flashing.
//
// host must be this instance's local or default flash host; remote hosts
// are rejected. Devices come from `lsblk -J -b` (JSON output, sizes in
// bytes) and are filtered to whole disks that look removable and are not
// suspiciously large (MaxDeviceBytes guards against flashing a system
// disk).
func (a *App) ListDevices(host string) ([]Device, error) {
	if host == "" {
		host = a.settings.DefaultFlashHost
	}
	if host != a.settings.LocalHost && host != a.settings.DefaultFlashHost {
		return nil, fmt.Errorf("flash host %s is not attached to this Metis instance", host)
	}
	cmd := exec.Command("lsblk", "-J", "-b", "-o", "NAME,PATH,RM,HOTPLUG,SIZE,MODEL,TRAN,TYPE")
	out, err := cmd.Output()
	if err != nil {
		return nil, err
	}
	// Anonymous struct mirrors lsblk's JSON envelope. SIZE is decoded as
	// `any` because lsblk versions differ on emitting it as a number or
	// a string.
	var payload struct {
		Blockdevices []struct {
			Name string `json:"name"`
			Path string `json:"path"`
			RM bool `json:"rm"`
			Hotplug bool `json:"hotplug"`
			Size any `json:"size"`
			Model string `json:"model"`
			Tran string `json:"tran"`
			Type string `json:"type"`
		} `json:"blockdevices"`
	}
	if err := json.Unmarshal(out, &payload); err != nil {
		return nil, err
	}
	devices := make([]Device, 0)
	for _, dev := range payload.Blockdevices {
		// Only whole disks, never partitions/loop devices.
		if dev.Type != "disk" {
			continue
		}
		// Normalize SIZE to int64 regardless of JSON representation.
		size := int64(0)
		switch value := dev.Size.(type) {
		case string:
			size, _ = strconv.ParseInt(value, 10, 64)
		case float64:
			size = int64(value)
		}
		// Reject zero/unknown sizes and anything too big to be a card.
		if size <= 0 || size > a.settings.MaxDeviceBytes {
			continue
		}
		// Keep USB-attached disks, plus anything marked removable or
		// hotplug by the kernel.
		if dev.Tran != "usb" && !dev.RM && !dev.Hotplug {
			continue
		}
		devices = append(devices, Device{
			Name:      dev.Name,
			Path:      dev.Path,
			Model:     strings.TrimSpace(dev.Model),
			Transport: dev.Tran,
			Type:      dev.Type,
			Removable: dev.RM,
			Hotplug:   dev.Hotplug,
			SizeBytes: size,
		})
	}
	// Stable ordering for the UI.
	sort.Slice(devices, func(i, j int) bool { return devices[i].Path < devices[j].Path })
	return devices, nil
}
|
||||
|
||||
// runBuild executes the build (and optionally flash) pipeline for a job
// in a background goroutine. Stages, in order: download, verify, copy,
// inject, then — when flash is true — preflight and flash. Every failure
// finalizes the job via failJob and records an error metric; progress
// percentages are fixed per stage.
func (a *App) runBuild(job *Job, flash bool) {
	a.setJob(job.ID, func(j *Job) {
		j.Status = JobRunning
		j.Stage = "download"
		j.Message = "Fetching base image"
		j.ProgressPct = 5
	})
	output := a.artifactPath(job.Node)
	cacheDir := a.settings.CacheDir

	// Resolve the build plan (base image URL etc.) for this node.
	planData, err := plan.Build(a.inventory, job.Node, output, cacheDir)
	if err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordBuild(job.Node, "error")
		return
	}
	// The node's class carries the expected base-image checksum.
	_, class, err := a.inventory.FindNode(job.Node)
	if err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordBuild(job.Node, "error")
		return
	}
	// Download into the shared cache (keyed by image basename).
	cacheImage := filepath.Join(cacheDir, filepath.Base(planData.Image))
	if err := image.Download(planData.Image, cacheImage); err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordBuild(job.Node, "error")
		return
	}
	a.setJob(job.ID, func(j *Job) {
		j.Stage = "verify"
		j.Message = "Verifying base image checksum"
		j.ProgressPct = 18
	})
	if err := image.VerifyChecksum(cacheImage, class.Checksum); err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordBuild(job.Node, "error")
		return
	}
	a.setJob(job.ID, func(j *Job) {
		j.Stage = "copy"
		j.Message = "Copying base image into artifact"
		j.ProgressPct = 35
	})
	// Copy the (verified) cached base image to the per-node artifact.
	if err := writer.WriteImage(context.Background(), cacheImage, output); err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordBuild(job.Node, "error")
		return
	}
	// Render the node-specific rootfs files to inject.
	files, err := plan.Files(a.inventory, job.Node)
	if err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordBuild(job.Node, "error")
		return
	}
	a.setJob(job.ID, func(j *Job) {
		j.Stage = "inject"
		j.Message = "Injecting node-specific rootfs config"
		j.ProgressPct = 70
	})
	if err := image.InjectRootFS(output, files); err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordBuild(job.Node, "error")
		return
	}
	a.metrics.RecordBuild(job.Node, "ok")
	a.appendEvent(Event{
		Time:    time.Now().UTC(),
		Kind:    "image.build",
		Summary: fmt.Sprintf("Built replacement image for %s", job.Node),
		Details: map[string]any{"node": job.Node, "artifact": output},
	})

	// Build-only jobs stop here.
	if !flash {
		a.completeJob(job.ID, func(j *Job) {
			j.Stage = "complete"
			j.Message = "Image build complete"
			j.ProgressPct = 100
			j.Artifact = output
		})
		return
	}

	a.setJob(job.ID, func(j *Job) {
		j.Stage = "preflight"
		j.Message = "Validating device and deleting stale node object"
		j.ProgressPct = 78
		j.Artifact = output
	})
	// Re-validate the device right before writing: it may have been
	// removed since the job was queued.
	if _, err := a.ensureDevice(job.Device); err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordFlash(job.Node, job.Host, "error")
		return
	}
	// Stale node deletion is best-effort: failure is logged as a warning
	// event but does not abort the flash.
	if err := deleteNodeObject(job.Node); err != nil {
		a.appendEvent(Event{
			Time:    time.Now().UTC(),
			Kind:    "node.delete.warning",
			Summary: fmt.Sprintf("Could not delete stale Kubernetes node object for %s", job.Node),
			Details: map[string]any{"node": job.Node, "error": err.Error()},
		})
	}
	if err := a.flashArtifact(job.ID, output); err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordFlash(job.Node, job.Host, "error")
		return
	}
	a.metrics.RecordFlash(job.Node, job.Host, "ok")
	a.appendEvent(Event{
		Time:    time.Now().UTC(),
		Kind:    "image.flash",
		Summary: fmt.Sprintf("Flashed %s image to %s on %s", job.Node, job.Device, job.Host),
		Details: map[string]any{"node": job.Node, "device": job.Device, "host": job.Host},
	})
	a.completeJob(job.ID, func(j *Job) {
		j.Stage = "complete"
		j.Message = fmt.Sprintf("Flash complete. Move the card into %s and power-cycle it.", j.Node)
		j.ProgressPct = 100
		j.Artifact = output
	})
}
|
||||
|
||||
func (a *App) flashArtifact(jobID, artifact string) error {
|
||||
info, err := os.Stat(artifact)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
a.setJob(jobID, func(j *Job) {
|
||||
j.Stage = "flash"
|
||||
j.Message = "Writing image to removable media"
|
||||
j.ProgressPct = 82
|
||||
j.Total = info.Size()
|
||||
})
|
||||
err = writer.WriteImageWithProgress(context.Background(), artifact, a.job(jobID).Device, func(written, total int64) {
|
||||
pct := 82.0
|
||||
if total > 0 {
|
||||
pct = 82.0 + (float64(written)/float64(total))*17.0
|
||||
}
|
||||
a.setJob(jobID, func(j *Job) {
|
||||
j.Written = written
|
||||
j.Total = total
|
||||
j.ProgressPct = pct
|
||||
j.Message = fmt.Sprintf("Flashing %s of %s", humanBytes(written), humanBytes(total))
|
||||
})
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
func (a *App) ensureDevice(path string) (*Device, error) {
|
||||
devices, err := a.ListDevices(a.settings.DefaultFlashHost)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, device := range devices {
|
||||
if device.Path == path {
|
||||
return &device, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("device %s is not a current removable flash candidate", path)
|
||||
}
|
||||
|
||||
func (a *App) newJob(kind, node, host, device string) *Job {
|
||||
job := &Job{
|
||||
ID: fmt.Sprintf("%d", time.Now().UTC().UnixNano()),
|
||||
Kind: kind,
|
||||
Node: node,
|
||||
Host: host,
|
||||
Device: device,
|
||||
Status: JobQueued,
|
||||
ProgressPct: 0,
|
||||
StartedAt: time.Now().UTC(),
|
||||
}
|
||||
a.mu.Lock()
|
||||
a.jobs[job.ID] = job
|
||||
a.mu.Unlock()
|
||||
return job
|
||||
}
|
||||
|
||||
func (a *App) job(id string) *Job {
|
||||
a.mu.RLock()
|
||||
defer a.mu.RUnlock()
|
||||
return a.jobs[id]
|
||||
}
|
||||
|
||||
func (a *App) setJob(id string, update func(*Job)) {
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
job := a.jobs[id]
|
||||
if job == nil {
|
||||
return
|
||||
}
|
||||
update(job)
|
||||
}
|
||||
|
||||
func (a *App) failJob(id string, err error) {
|
||||
a.completeJob(id, func(j *Job) {
|
||||
j.Status = JobError
|
||||
j.Error = err.Error()
|
||||
j.Message = err.Error()
|
||||
})
|
||||
}
|
||||
|
||||
func (a *App) completeJob(id string, update func(*Job)) {
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
job := a.jobs[id]
|
||||
if job == nil {
|
||||
return
|
||||
}
|
||||
update(job)
|
||||
if job.Status != JobError {
|
||||
job.Status = JobDone
|
||||
}
|
||||
job.FinishedAt = time.Now().UTC()
|
||||
}
|
||||
|
||||
// appendEvent appends event as a single JSON line to the history log.
// All failures are deliberately swallowed: event history is best-effort
// and must never break the main workflow.
func (a *App) appendEvent(event Event) {
	line, err := json.Marshal(event)
	if err != nil {
		return
	}
	f, err := os.OpenFile(a.settings.HistoryPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
	if err != nil {
		return
	}
	defer f.Close()
	// Newline-delimited JSON; write errors are intentionally ignored.
	_, _ = f.Write(append(line, '\n'))
}
|
||||
|
||||
func (a *App) recentEvents(limit int) []Event {
|
||||
f, err := os.Open(a.settings.HistoryPath)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
defer f.Close()
|
||||
events := make([]Event, 0, limit)
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
var event Event
|
||||
if err := json.Unmarshal(scanner.Bytes(), &event); err != nil {
|
||||
continue
|
||||
}
|
||||
events = append(events, event)
|
||||
}
|
||||
if len(events) > limit {
|
||||
events = events[len(events)-limit:]
|
||||
}
|
||||
for i, j := 0, len(events)-1; i < j; i, j = i+1, j-1 {
|
||||
events[i], events[j] = events[j], events[i]
|
||||
}
|
||||
return events
|
||||
}
|
||||
|
||||
func (a *App) artifacts() map[string]ArtifactSummary {
|
||||
result := map[string]ArtifactSummary{}
|
||||
for _, node := range a.inventory.Nodes {
|
||||
path := a.artifactPath(node.Name)
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
result[node.Name] = ArtifactSummary{
|
||||
Path: path,
|
||||
UpdatedAt: info.ModTime().UTC(),
|
||||
SizeBytes: info.Size(),
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func (a *App) artifactPath(node string) string {
|
||||
return filepath.Join(a.settings.ArtifactDir, fmt.Sprintf("%s.img", node))
|
||||
}
|
||||
|
||||
// loadSnapshots restores the persisted snapshot map from disk and
// re-seeds the per-node snapshot metrics so /metrics is accurate after a
// restart. A missing or malformed file is returned as an error.
func (a *App) loadSnapshots() error {
	data, err := os.ReadFile(a.settings.SnapshotsPath)
	if err != nil {
		return err
	}
	var snapshots map[string]SnapshotRecord
	if err := json.Unmarshal(data, &snapshots); err != nil {
		return err
	}
	a.mu.Lock()
	a.snapshots = snapshots
	a.mu.Unlock()
	// Replay each record into the metrics registry.
	for _, snap := range snapshots {
		a.metrics.RecordSnapshot(snap.Node, "ok", snap.CollectedAt)
	}
	return nil
}
|
||||
|
||||
func (a *App) persistSnapshots() error {
|
||||
a.mu.RLock()
|
||||
data, err := json.MarshalIndent(a.snapshots, "", " ")
|
||||
a.mu.RUnlock()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(a.settings.SnapshotsPath), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(a.settings.SnapshotsPath, data, 0o644)
|
||||
}
|
||||
|
||||
// loadTargets restores the persisted class-target map from disk and
// pushes it into metrics (with zero "changed" classes, since this is a
// restore, not a watch). Missing or malformed files are errors.
func (a *App) loadTargets() error {
	data, err := os.ReadFile(a.settings.TargetsPath)
	if err != nil {
		return err
	}
	var targets map[string]facts.Targets
	if err := json.Unmarshal(data, &targets); err != nil {
		return err
	}
	a.mu.Lock()
	a.targets = targets
	a.mu.Unlock()
	a.metrics.SetDriftTargets(targets, 0)
	return nil
}
|
||||
|
||||
func (a *App) persistTargets() error {
|
||||
a.mu.RLock()
|
||||
data, err := json.MarshalIndent(a.targets, "", " ")
|
||||
a.mu.RUnlock()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(a.settings.TargetsPath), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(a.settings.TargetsPath, data, 0o644)
|
||||
}
|
||||
|
||||
func diffTargets(prev, next map[string]facts.Targets) []string {
|
||||
classes := map[string]struct{}{}
|
||||
for class := range prev {
|
||||
classes[class] = struct{}{}
|
||||
}
|
||||
for class := range next {
|
||||
classes[class] = struct{}{}
|
||||
}
|
||||
out := make([]string, 0)
|
||||
for class := range classes {
|
||||
if !targetsEqual(prev[class], next[class]) {
|
||||
out = append(out, class)
|
||||
}
|
||||
}
|
||||
sort.Strings(out)
|
||||
return out
|
||||
}
|
||||
|
||||
func targetsEqual(a, b facts.Targets) bool {
|
||||
if a.Kernel != b.Kernel || a.OSImage != b.OSImage || a.Containerd != b.Containerd || a.K3sVersion != b.K3sVersion {
|
||||
return false
|
||||
}
|
||||
if len(a.Packages) != len(b.Packages) {
|
||||
return false
|
||||
}
|
||||
for key, value := range a.Packages {
|
||||
if b.Packages[key] != value {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// humanBytes renders a byte count as a human-readable IEC string:
// exact bytes below 1024, otherwise one decimal with a KiB..EiB suffix.
func humanBytes(value int64) string {
	const step = 1024
	if value < step {
		return fmt.Sprintf("%d B", value)
	}
	divisor, suffix := int64(step), 0
	for remaining := value / step; remaining >= step; remaining /= step {
		divisor *= step
		suffix++
	}
	return fmt.Sprintf("%.1f %ciB", float64(value)/float64(divisor), "KMGTPE"[suffix])
}
|
||||
|
||||
// firstLine trims value and returns only its first line, itself trimmed.
func firstLine(value string) string {
	trimmed := strings.TrimSpace(value)
	if head, _, found := strings.Cut(trimmed, "\n"); found {
		return strings.TrimSpace(head)
	}
	return trimmed
}
|
||||
|
||||
// deleteNodeObject removes the (stale) Kubernetes node object for node,
// preferring the in-cluster API and falling back to a local kubectl
// invocation when that path is unavailable or fails.
func deleteNodeObject(node string) error {
	// Fast path: direct API call with in-cluster service-account creds.
	if err := deleteNodeObjectInCluster(node); err == nil {
		return nil
	}
	// Fallback: shell out to kubectl; --ignore-not-found makes a missing
	// node a success.
	cmd := exec.Command("kubectl", "delete", "node", node, "--ignore-not-found")
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("delete node: %w: %s", err, strings.TrimSpace(string(out)))
	}
	return nil
}
|
||||
|
||||
// deleteNodeObjectInCluster deletes a node via the Kubernetes API using
// the pod's mounted service-account credentials. It errors out quickly
// when not running inside a cluster so the caller can fall back to
// kubectl. 200/202/404 from the API all count as success (404 means the
// node is already gone).
func deleteNodeObjectInCluster(node string) error {
	host := strings.TrimSpace(os.Getenv("KUBERNETES_SERVICE_HOST"))
	port := strings.TrimSpace(os.Getenv("KUBERNETES_SERVICE_PORT"))
	if host == "" || port == "" {
		return errors.New("not running in cluster")
	}
	// Standard in-cluster service-account mount paths.
	token, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token")
	if err != nil {
		return err
	}
	caPEM, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
	if err != nil {
		return err
	}
	// Trust only the cluster CA, not the system roots.
	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(caPEM) {
		return errors.New("append kubernetes CA")
	}
	client := &http.Client{
		Timeout: 15 * time.Second,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{RootCAs: pool},
		},
	}
	req, err := http.NewRequest(http.MethodDelete, fmt.Sprintf("https://%s:%s/api/v1/nodes/%s", host, port, node), nil)
	if err != nil {
		return err
	}
	req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(string(token)))
	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusAccepted {
		return nil
	}
	// Include a bounded slice of the response body for diagnostics.
	body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
	return fmt.Errorf("delete node %s failed: %s: %s", node, resp.Status, strings.TrimSpace(string(body)))
}
|
||||
188
pkg/service/metrics.go
Normal file
188
pkg/service/metrics.go
Normal file
@ -0,0 +1,188 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"metis/pkg/facts"
|
||||
)
|
||||
|
||||
// Metrics captures the small Prometheus surface exported by Metis.
// Counter maps are keyed by NUL-joined label values (see counterKey).
type Metrics struct {
	mu sync.RWMutex // guards every field below

	builds map[string]int // node+status -> build count
	flashes map[string]int // node+host+status -> flash count
	snapshots map[string]int // node+status -> accepted snapshot count
	lastSnapshotUnix map[string]float64 // node -> last snapshot unix time
	watches map[string]int // status -> sentinel watch runs
	lastWatchSuccess float64 // unix time of last successful watch
	classDriftCounts map[string]int // class -> populated target fields
	lastWatchChangeSize float64 // classes changed by the last watch
}
|
||||
|
||||
// NewMetrics builds a zero-value metrics registry.
// Scalar gauges (lastWatchSuccess, lastWatchChangeSize) start at zero.
func NewMetrics() *Metrics {
	return &Metrics{
		builds: map[string]int{},
		flashes: map[string]int{},
		snapshots: map[string]int{},
		lastSnapshotUnix: map[string]float64{},
		watches: map[string]int{},
		classDriftCounts: map[string]int{},
	}
}
|
||||
|
||||
func (m *Metrics) RecordBuild(node, status string) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.builds[counterKey(node, status)]++
|
||||
}
|
||||
|
||||
func (m *Metrics) RecordFlash(node, host, status string) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.flashes[counterKey(node, host, status)]++
|
||||
}
|
||||
|
||||
// RecordSnapshot counts an accepted sentinel snapshot for node and, when
// ts is non-zero, remembers it as the node's latest snapshot time.
func (m *Metrics) RecordSnapshot(node, status string, ts time.Time) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.snapshots[counterKey(node, status)]++
	if !ts.IsZero() {
		m.lastSnapshotUnix[node] = float64(ts.Unix())
	}
}
|
||||
|
||||
// RecordWatch counts a sentinel watch run by status; successful runs
// also advance the last-success timestamp gauge.
func (m *Metrics) RecordWatch(status string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.watches[counterKey(status)]++
	if status == "ok" {
		m.lastWatchSuccess = float64(time.Now().UTC().Unix())
	}
}
|
||||
|
||||
func (m *Metrics) SetDriftTargets(targets map[string]facts.Targets, changed int) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.classDriftCounts = map[string]int{}
|
||||
for class, target := range targets {
|
||||
count := 0
|
||||
if strings.TrimSpace(target.Kernel) != "" {
|
||||
count++
|
||||
}
|
||||
if strings.TrimSpace(target.OSImage) != "" {
|
||||
count++
|
||||
}
|
||||
if strings.TrimSpace(target.Containerd) != "" {
|
||||
count++
|
||||
}
|
||||
if strings.TrimSpace(target.K3sVersion) != "" {
|
||||
count++
|
||||
}
|
||||
count += len(target.Packages)
|
||||
m.classDriftCounts[class] = count
|
||||
}
|
||||
m.lastWatchChangeSize = float64(changed)
|
||||
}
|
||||
|
||||
// Render writes a Prometheus text exposition response.
//
// Output is deterministic: every label-keyed family is emitted in sorted
// key order so scrapes and diffs are stable. Keys are decoded back into
// label values via splitKey (the inverse of counterKey).
func (m *Metrics) Render(w io.Writer) {
	m.mu.RLock()
	defer m.mu.RUnlock()

	fmt.Fprintln(w, "# HELP metis_builds_total Replacement image builds by node and status")
	fmt.Fprintln(w, "# TYPE metis_builds_total counter")
	for _, key := range sortedKeys(m.builds) {
		parts := splitKey(key, 2)
		node, status := parts[0], parts[1]
		fmt.Fprintf(w, "metis_builds_total{node=%q,status=%q} %d\n", node, status, m.builds[key])
	}

	fmt.Fprintln(w, "# HELP metis_flashes_total Replacement flashes by node, host, and status")
	fmt.Fprintln(w, "# TYPE metis_flashes_total counter")
	for _, key := range sortedKeys(m.flashes) {
		parts := splitKey(key, 3)
		node, host, status := parts[0], parts[1], parts[2]
		fmt.Fprintf(w, "metis_flashes_total{node=%q,host=%q,status=%q} %d\n", node, host, status, m.flashes[key])
	}

	fmt.Fprintln(w, "# HELP metis_sentinel_snapshots_total Sentinel snapshots accepted by node and status")
	fmt.Fprintln(w, "# TYPE metis_sentinel_snapshots_total counter")
	for _, key := range sortedKeys(m.snapshots) {
		parts := splitKey(key, 2)
		node, status := parts[0], parts[1]
		fmt.Fprintf(w, "metis_sentinel_snapshots_total{node=%q,status=%q} %d\n", node, status, m.snapshots[key])
	}

	fmt.Fprintln(w, "# HELP metis_sentinel_snapshot_timestamp_seconds Last accepted sentinel snapshot timestamp by node")
	fmt.Fprintln(w, "# TYPE metis_sentinel_snapshot_timestamp_seconds gauge")
	for _, node := range sortedFloatKeys(m.lastSnapshotUnix) {
		fmt.Fprintf(w, "metis_sentinel_snapshot_timestamp_seconds{node=%q} %.0f\n", node, m.lastSnapshotUnix[node])
	}

	fmt.Fprintln(w, "# HELP metis_sentinel_watch_total Sentinel watch runs by status")
	fmt.Fprintln(w, "# TYPE metis_sentinel_watch_total counter")
	for _, key := range sortedKeys(m.watches) {
		status := splitKey(key, 1)[0]
		fmt.Fprintf(w, "metis_sentinel_watch_total{status=%q} %d\n", status, m.watches[key])
	}

	fmt.Fprintln(w, "# HELP metis_sentinel_watch_last_success_timestamp_seconds Last successful sentinel watch timestamp")
	fmt.Fprintln(w, "# TYPE metis_sentinel_watch_last_success_timestamp_seconds gauge")
	fmt.Fprintf(w, "metis_sentinel_watch_last_success_timestamp_seconds %.0f\n", m.lastWatchSuccess)

	fmt.Fprintln(w, "# HELP metis_sentinel_watch_changed_classes Number of class target sets changed by the last watch")
	fmt.Fprintln(w, "# TYPE metis_sentinel_watch_changed_classes gauge")
	fmt.Fprintf(w, "metis_sentinel_watch_changed_classes %.0f\n", m.lastWatchChangeSize)

	fmt.Fprintln(w, "# HELP metis_class_target_fields Count of populated target fields per class")
	fmt.Fprintln(w, "# TYPE metis_class_target_fields gauge")
	for _, class := range sortedFloatKeysInt(m.classDriftCounts) {
		fmt.Fprintf(w, "metis_class_target_fields{class=%q} %d\n", class, m.classDriftCounts[class])
	}
}
|
||||
|
||||
// counterKey joins label values with NUL so multi-label counters can be
// stored in a flat map and split back apart by splitKey. Assumes label
// values never contain NUL themselves — TODO confirm at call sites.
func counterKey(parts ...string) string {
	return strings.Join(parts, "\x00")
}
|
||||
|
||||
// splitKey decodes a counterKey back into its label values, padding with
// empty strings so the result has at least `want` elements.
func splitKey(key string, want int) []string {
	parts := strings.Split(key, "\x00")
	if len(parts) >= want {
		return parts
	}
	padded := make([]string, want)
	copy(padded, parts)
	return padded
}
|
||||
|
||||
// sortedKeys returns the keys of m in ascending lexicographic order.
func sortedKeys[T any](m map[string]T) []string {
	out := make([]string, 0, len(m))
	for key := range m {
		out = append(out, key)
	}
	sort.Slice(out, func(i, j int) bool { return out[i] < out[j] })
	return out
}
|
||||
|
||||
func sortedFloatKeys(m map[string]float64) []string {
|
||||
keys := make([]string, 0, len(m))
|
||||
for key := range m {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
return keys
|
||||
}
|
||||
|
||||
func sortedFloatKeysInt(m map[string]int) []string {
|
||||
keys := make([]string, 0, len(m))
|
||||
for key := range m {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
return keys
|
||||
}
|
||||
628
pkg/service/server.go
Normal file
628
pkg/service/server.go
Normal file
@ -0,0 +1,628 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"html/template"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// userContext is the identity extracted from auth-proxy request headers.
type userContext struct {
	Name string // authenticated user name
	Groups []string // group memberships parsed from a comma-separated header
}
|
||||
|
||||
// pageData is the template payload rendered by the single-page UI.
type pageData struct {
	State PageState // current application state shown on first paint
	AllowedGroups []string // groups permitted to use the UI (display only)
	DefaultMessage string // NOTE(review): never set by visible code — confirm it is used
	BootJSON template.JS // pre-marshaled state injected as inline JavaScript
}
|
||||
|
||||
// Handler returns the Metis HTTP handler.
//
// Routes fall into three tiers: unauthenticated operational endpoints
// (/healthz, /metrics), internal sentinel push endpoints under
// /internal/, and UI/API routes wrapped in header-based authorization
// via withUIAuth.
func (a *App) Handler() http.Handler {
	mux := http.NewServeMux()
	mux.HandleFunc("/healthz", a.handleHealth)
	mux.HandleFunc("/metrics", a.handleMetrics)
	mux.HandleFunc("/internal/sentinel/snapshot", a.handleInternalSnapshot)
	mux.HandleFunc("/internal/sentinel/watch", a.handleInternalWatch)
	mux.HandleFunc("/api/state", a.withUIAuth(a.handleState))
	mux.HandleFunc("/api/devices", a.withUIAuth(a.handleDevices))
	mux.HandleFunc("/api/jobs/build", a.withUIAuth(a.handleBuild))
	mux.HandleFunc("/api/jobs/replace", a.withUIAuth(a.handleReplace))
	mux.HandleFunc("/api/sentinel/watch", a.withUIAuth(a.handleWatch))
	mux.HandleFunc("/", a.withUIAuth(a.handleIndex))
	return mux
}
|
||||
|
||||
// handleHealth is the liveness endpoint; it always answers 200.
func (a *App) handleHealth(w http.ResponseWriter, _ *http.Request) {
	writeJSON(w, http.StatusOK, map[string]any{"status": "ok", "service": "metis"})
}
|
||||
|
||||
// handleMetrics serves the Prometheus scrape endpoint.
func (a *App) handleMetrics(w http.ResponseWriter, _ *http.Request) {
	// version=0.0.4 is the Prometheus text exposition format identifier.
	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
	a.metrics.Render(w)
}
|
||||
|
||||
func (a *App) handleInternalSnapshot(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
var record SnapshotRecord
|
||||
if err := json.NewDecoder(r.Body).Decode(&record); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if err := a.StoreSnapshot(record); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"status": "ok"})
|
||||
}
|
||||
|
||||
// handleInternalWatch triggers a sentinel watch run from the internal
// (unauthenticated) path. NOTE(review): body duplicates handleWatch —
// consider sharing a single implementation.
func (a *App) handleInternalWatch(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	event, err := a.WatchSentinel()
	if err != nil {
		// WatchSentinel only records successes; record the failure here.
		a.metrics.RecordWatch("error")
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	writeJSON(w, http.StatusOK, event)
}
|
||||
|
||||
// handleState reports the current UI state; an optional ?host= query
// parameter selects which flash host's view to build.
func (a *App) handleState(w http.ResponseWriter, r *http.Request) {
	host := r.URL.Query().Get("host")
	writeJSON(w, http.StatusOK, a.State(host))
}
|
||||
|
||||
func (a *App) handleDevices(w http.ResponseWriter, r *http.Request) {
|
||||
host := r.URL.Query().Get("host")
|
||||
devices, err := a.ListDevices(host)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"devices": devices})
|
||||
}
|
||||
|
||||
func (a *App) handleBuild(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
node := requestValue(r, "node")
|
||||
job, err := a.Build(node)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusAccepted, job)
|
||||
}
|
||||
|
||||
func (a *App) handleReplace(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
node := requestValue(r, "node")
|
||||
host := requestValue(r, "host")
|
||||
device := requestValue(r, "device")
|
||||
job, err := a.Replace(node, host, device)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusAccepted, job)
|
||||
}
|
||||
|
||||
// handleWatch triggers a sentinel watch run from the authenticated UI
// path. NOTE(review): body duplicates handleInternalWatch — consider
// sharing a single implementation.
func (a *App) handleWatch(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	event, err := a.WatchSentinel()
	if err != nil {
		// WatchSentinel only records successes; record the failure here.
		a.metrics.RecordWatch("error")
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	writeJSON(w, http.StatusOK, event)
}
|
||||
|
||||
func (a *App) handleIndex(w http.ResponseWriter, r *http.Request) {
|
||||
state := a.State(a.settings.DefaultFlashHost)
|
||||
payload, _ := json.Marshal(state)
|
||||
data := pageData{
|
||||
State: state,
|
||||
AllowedGroups: append([]string{}, a.settings.AllowedGroups...),
|
||||
BootJSON: template.JS(payload),
|
||||
}
|
||||
_ = metisPage.Execute(w, data)
|
||||
}
|
||||
|
||||
func (a *App) withUIAuth(next http.HandlerFunc) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
user, ok := a.authorize(r)
|
||||
if !ok {
|
||||
http.Error(w, "forbidden", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
if user.Name != "" {
|
||||
w.Header().Set("X-Metis-User", user.Name)
|
||||
}
|
||||
next(w, r)
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) authorize(r *http.Request) (userContext, bool) {
|
||||
user := strings.TrimSpace(r.Header.Get("X-Auth-Request-User"))
|
||||
if user == "" {
|
||||
user = strings.TrimSpace(r.Header.Get("X-Forwarded-User"))
|
||||
}
|
||||
if user == "" {
|
||||
return userContext{}, false
|
||||
}
|
||||
groups := splitHeaderList(r.Header.Get("X-Auth-Request-Groups"))
|
||||
for _, allowedUser := range a.settings.AllowedUsers {
|
||||
if allowedUser == user {
|
||||
return userContext{Name: user, Groups: groups}, true
|
||||
}
|
||||
}
|
||||
for _, group := range groups {
|
||||
for _, allowed := range a.settings.AllowedGroups {
|
||||
if group == allowed {
|
||||
return userContext{Name: user, Groups: groups}, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return userContext{Name: user, Groups: groups}, false
|
||||
}
|
||||
|
||||
// splitHeaderList parses a comma-separated header value into trimmed,
// non-empty items; a blank header yields nil.
func splitHeaderList(raw string) []string {
	if strings.TrimSpace(raw) == "" {
		return nil
	}
	pieces := strings.Split(raw, ",")
	items := make([]string, 0, len(pieces))
	for _, piece := range pieces {
		if trimmed := strings.TrimSpace(piece); trimmed != "" {
			items = append(items, trimmed)
		}
	}
	return items
}
|
||||
|
||||
func requestValue(r *http.Request, key string) string {
|
||||
if err := r.ParseForm(); err == nil {
|
||||
if value := strings.TrimSpace(r.Form.Get(key)); value != "" {
|
||||
return value
|
||||
}
|
||||
}
|
||||
var payload map[string]any
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err == nil {
|
||||
if value, ok := payload[key].(string); ok {
|
||||
return strings.TrimSpace(value)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func writeJSON(w http.ResponseWriter, status int, payload any) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(status)
|
||||
_ = json.NewEncoder(w).Encode(payload)
|
||||
}
|
||||
|
||||
var metisPage = template.Must(template.New("metis").Parse(`<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>Metis Control</title>
|
||||
<style>
|
||||
:root{
|
||||
--ink:#111318;
|
||||
--muted:#616778;
|
||||
--line:rgba(17,19,24,.12);
|
||||
--paper:rgba(255,255,255,.84);
|
||||
--paper-strong:#ffffff;
|
||||
--brand:#1d5f8c;
|
||||
--brand-deep:#153b59;
|
||||
--accent:#d47b37;
|
||||
--success:#1b8f5a;
|
||||
--danger:#a63c35;
|
||||
--shadow:0 20px 60px rgba(17,19,24,.12);
|
||||
}
|
||||
*{box-sizing:border-box}
|
||||
body{
|
||||
margin:0;
|
||||
min-height:100vh;
|
||||
font-family:"Avenir Next","Trebuchet MS","Segoe UI",sans-serif;
|
||||
color:var(--ink);
|
||||
background:
|
||||
radial-gradient(circle at top left, rgba(212,123,55,.18), transparent 30rem),
|
||||
radial-gradient(circle at top right, rgba(29,95,140,.18), transparent 32rem),
|
||||
linear-gradient(180deg, #f8f4ee 0%, #eef2f5 48%, #e4edf2 100%);
|
||||
}
|
||||
.frame{
|
||||
max-width:1280px;
|
||||
margin:0 auto;
|
||||
padding:2rem 1.25rem 3rem;
|
||||
}
|
||||
.mast{
|
||||
display:flex;
|
||||
justify-content:space-between;
|
||||
align-items:flex-end;
|
||||
gap:1.5rem;
|
||||
margin-bottom:1.5rem;
|
||||
}
|
||||
.eyebrow{
|
||||
letter-spacing:.14em;
|
||||
text-transform:uppercase;
|
||||
font-size:.72rem;
|
||||
color:var(--brand-deep);
|
||||
margin-bottom:.35rem;
|
||||
font-weight:700;
|
||||
}
|
||||
h1{
|
||||
margin:0;
|
||||
font-size:clamp(2rem,4vw,3.4rem);
|
||||
line-height:1;
|
||||
}
|
||||
.sub{
|
||||
max-width:54rem;
|
||||
color:var(--muted);
|
||||
margin-top:.7rem;
|
||||
font-size:1rem;
|
||||
}
|
||||
.badge{
|
||||
display:inline-flex;
|
||||
align-items:center;
|
||||
gap:.45rem;
|
||||
padding:.7rem .95rem;
|
||||
background:rgba(255,255,255,.72);
|
||||
border:1px solid rgba(21,59,89,.12);
|
||||
border-radius:999px;
|
||||
box-shadow:var(--shadow);
|
||||
font-size:.9rem;
|
||||
}
|
||||
.grid{
|
||||
display:grid;
|
||||
grid-template-columns:1.2fr .9fr;
|
||||
gap:1rem;
|
||||
}
|
||||
.stack{
|
||||
display:grid;
|
||||
gap:1rem;
|
||||
}
|
||||
.card{
|
||||
background:var(--paper);
|
||||
backdrop-filter:blur(14px);
|
||||
border:1px solid var(--line);
|
||||
border-radius:1.25rem;
|
||||
padding:1.1rem;
|
||||
box-shadow:var(--shadow);
|
||||
}
|
||||
.card h2{
|
||||
margin:0 0 .35rem;
|
||||
font-size:1rem;
|
||||
text-transform:uppercase;
|
||||
letter-spacing:.1em;
|
||||
color:var(--brand-deep);
|
||||
}
|
||||
.hint{
|
||||
color:var(--muted);
|
||||
font-size:.92rem;
|
||||
margin-bottom:1rem;
|
||||
}
|
||||
.form-grid{
|
||||
display:grid;
|
||||
grid-template-columns:repeat(2,minmax(0,1fr));
|
||||
gap:.85rem;
|
||||
}
|
||||
label{
|
||||
display:grid;
|
||||
gap:.35rem;
|
||||
font-weight:600;
|
||||
font-size:.92rem;
|
||||
}
|
||||
select, button{
|
||||
width:100%;
|
||||
border-radius:.85rem;
|
||||
border:1px solid rgba(17,19,24,.14);
|
||||
padding:.85rem .95rem;
|
||||
font:inherit;
|
||||
}
|
||||
button{
|
||||
cursor:pointer;
|
||||
background:linear-gradient(135deg,var(--brand) 0%,var(--brand-deep) 100%);
|
||||
color:#fff;
|
||||
border:none;
|
||||
font-weight:700;
|
||||
letter-spacing:.03em;
|
||||
box-shadow:0 14px 30px rgba(21,59,89,.18);
|
||||
}
|
||||
button.secondary{
|
||||
background:#fff;
|
||||
color:var(--ink);
|
||||
border:1px solid rgba(17,19,24,.14);
|
||||
box-shadow:none;
|
||||
}
|
||||
.actions{
|
||||
display:grid;
|
||||
grid-template-columns:repeat(3,minmax(0,1fr));
|
||||
gap:.7rem;
|
||||
margin-top:.9rem;
|
||||
}
|
||||
.list{
|
||||
display:grid;
|
||||
gap:.7rem;
|
||||
max-height:30rem;
|
||||
overflow:auto;
|
||||
}
|
||||
.item{
|
||||
border:1px solid rgba(17,19,24,.1);
|
||||
border-radius:1rem;
|
||||
padding:.85rem .95rem;
|
||||
background:rgba(255,255,255,.8);
|
||||
}
|
||||
.item-head{
|
||||
display:flex;
|
||||
justify-content:space-between;
|
||||
gap:1rem;
|
||||
margin-bottom:.35rem;
|
||||
font-weight:700;
|
||||
}
|
||||
.meta{
|
||||
color:var(--muted);
|
||||
font-size:.85rem;
|
||||
}
|
||||
.bar{
|
||||
height:.55rem;
|
||||
background:rgba(17,19,24,.08);
|
||||
border-radius:999px;
|
||||
overflow:hidden;
|
||||
margin-top:.7rem;
|
||||
}
|
||||
.bar > span{
|
||||
display:block;
|
||||
height:100%;
|
||||
background:linear-gradient(90deg,var(--accent),var(--brand));
|
||||
}
|
||||
.pill{
|
||||
display:inline-block;
|
||||
padding:.2rem .55rem;
|
||||
border-radius:999px;
|
||||
font-size:.75rem;
|
||||
text-transform:uppercase;
|
||||
letter-spacing:.08em;
|
||||
background:rgba(21,59,89,.08);
|
||||
color:var(--brand-deep);
|
||||
}
|
||||
.pill.done{background:rgba(27,143,90,.12);color:var(--success)}
|
||||
.pill.error{background:rgba(166,60,53,.12);color:var(--danger)}
|
||||
.pill.running{background:rgba(212,123,55,.12);color:#9a5a20}
|
||||
.mini{
|
||||
display:grid;
|
||||
grid-template-columns:repeat(2,minmax(0,1fr));
|
||||
gap:.7rem;
|
||||
}
|
||||
.stat{
|
||||
padding:.8rem .9rem;
|
||||
border-radius:1rem;
|
||||
background:rgba(255,255,255,.72);
|
||||
border:1px solid rgba(17,19,24,.08);
|
||||
}
|
||||
.stat strong{display:block;font-size:1.35rem}
|
||||
code{
|
||||
font-family:"IBM Plex Mono","SFMono-Regular","Menlo",monospace;
|
||||
font-size:.88em;
|
||||
}
|
||||
@media (max-width: 980px){
|
||||
.grid,.form-grid,.actions,.mini{grid-template-columns:1fr}
|
||||
.mast{align-items:flex-start;flex-direction:column}
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<main class="frame">
|
||||
<section class="mast">
|
||||
<div>
|
||||
<div class="eyebrow">Atlas Recovery Plane</div>
|
||||
<h1>Metis Control</h1>
|
||||
<p class="sub">Build replacement node images, verify removable media on the Texas flash host, and keep image templates fresh with sentinel-driven drift tracking.</p>
|
||||
</div>
|
||||
<div class="badge"><strong>Default flash host:</strong> <span id="default-host">{{.State.DefaultFlashHost}}</span></div>
|
||||
</section>
|
||||
|
||||
<section class="grid">
|
||||
<div class="stack">
|
||||
<article class="card">
|
||||
<h2>Replacement Run</h2>
|
||||
<p class="hint">This UI is meant for the one-shot recovery path: build the node image, verify the card on the flash host, then write it and hand off only the physical swap.</p>
|
||||
<div class="form-grid">
|
||||
<label>Target node
|
||||
<select id="node-select"></select>
|
||||
</label>
|
||||
<label>Flash host
|
||||
<select id="host-select"></select>
|
||||
</label>
|
||||
<label style="grid-column:1 / -1">Detected removable media
|
||||
<select id="device-select"></select>
|
||||
</label>
|
||||
</div>
|
||||
<div class="actions">
|
||||
<button class="secondary" id="refresh-devices">Refresh media</button>
|
||||
<button class="secondary" id="build-only">Build image only</button>
|
||||
<button id="replace-run">Build and flash</button>
|
||||
</div>
|
||||
</article>
|
||||
|
||||
<article class="card">
|
||||
<h2>Live Jobs</h2>
|
||||
<p class="hint">Progress updates stream from the running Metis operation. The replacement flow automatically tries to clear the stale Kubernetes node object before the card write.</p>
|
||||
<div id="jobs" class="list"></div>
|
||||
</article>
|
||||
</div>
|
||||
|
||||
<div class="stack">
|
||||
<article class="card">
|
||||
<h2>Sentinel Watch</h2>
|
||||
<p class="hint">Ariadne should hit the internal sentinel watch route on a schedule. You can also run it manually here when you want the latest template recommendations immediately.</p>
|
||||
<div class="mini">
|
||||
<div class="stat">
|
||||
<span class="meta">Tracked nodes</span>
|
||||
<strong id="snapshot-count">0</strong>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<span class="meta">Class targets</span>
|
||||
<strong id="target-count">0</strong>
|
||||
</div>
|
||||
</div>
|
||||
<div class="actions" style="grid-template-columns:1fr">
|
||||
<button id="sentinel-watch">Run sentinel watch now</button>
|
||||
</div>
|
||||
</article>
|
||||
|
||||
<article class="card">
|
||||
<h2>Recent Changes</h2>
|
||||
<p class="hint">This stream keeps the image/template story digestible: builds, flashes, snapshot intake, and sentinel-driven target changes all land here.</p>
|
||||
<div id="events" class="list"></div>
|
||||
</article>
|
||||
</div>
|
||||
</section>
|
||||
</main>
|
||||
<script id="boot" type="application/json">{{.BootJSON}}</script>
|
||||
<script>
|
||||
const boot = JSON.parse(document.getElementById('boot').textContent);
|
||||
let state = boot;
|
||||
const nodeSelect = document.getElementById('node-select');
|
||||
const hostSelect = document.getElementById('host-select');
|
||||
const deviceSelect = document.getElementById('device-select');
|
||||
const jobsEl = document.getElementById('jobs');
|
||||
const eventsEl = document.getElementById('events');
|
||||
const snapshotCountEl = document.getElementById('snapshot-count');
|
||||
const targetCountEl = document.getElementById('target-count');
|
||||
|
||||
function fmtTime(value){
|
||||
if(!value){ return 'pending'; }
|
||||
const date = new Date(value);
|
||||
return isNaN(date.getTime()) ? value : date.toLocaleString();
|
||||
}
|
||||
function fmtBytes(value){
|
||||
if(!value){ return '0 B'; }
|
||||
const units = ['B','KiB','MiB','GiB','TiB'];
|
||||
let size = Number(value);
|
||||
let idx = 0;
|
||||
while(size >= 1024 && idx < units.length - 1){
|
||||
size /= 1024;
|
||||
idx += 1;
|
||||
}
|
||||
return size.toFixed(size >= 10 || idx === 0 ? 0 : 1) + ' ' + units[idx];
|
||||
}
|
||||
function setOptions(select, values, labeler){
|
||||
const current = select.value;
|
||||
select.innerHTML = '';
|
||||
values.forEach((value)=>{
|
||||
const option = document.createElement('option');
|
||||
option.value = value;
|
||||
option.textContent = labeler ? labeler(value) : value;
|
||||
select.appendChild(option);
|
||||
});
|
||||
if(current && values.includes(current)){ select.value = current; }
|
||||
}
|
||||
function render(){
|
||||
setOptions(nodeSelect, state.nodes.map((n)=>n.name));
|
||||
setOptions(hostSelect, state.flash_hosts);
|
||||
if(!hostSelect.value){ hostSelect.value = state.default_flash_host; }
|
||||
setOptions(deviceSelect, state.devices.map((d)=>d.path), (path)=>{
|
||||
const dev = state.devices.find((item)=>item.path === path);
|
||||
if(!dev){ return path; }
|
||||
return dev.path + ' · ' + fmtBytes(dev.size_bytes) + ' · ' + (dev.model || dev.transport || 'removable media');
|
||||
});
|
||||
|
||||
jobsEl.innerHTML = '';
|
||||
const jobs = state.jobs.length ? state.jobs : [{kind:'idle',status:'done',message:'No active or recent Metis jobs yet.',progress_pct:100,started_at:new Date().toISOString(),finished_at:new Date().toISOString()}];
|
||||
jobs.forEach((job)=>{
|
||||
const wrap = document.createElement('div');
|
||||
wrap.className = 'item';
|
||||
const statusClass = job.status === 'error' ? 'error' : (job.status === 'done' ? 'done' : (job.status === 'running' ? 'running' : ''));
|
||||
const title = job.kind.toUpperCase() + (job.node ? ' · ' + job.node : '');
|
||||
const started = fmtTime(job.started_at) + (job.device ? ' · ' + job.device : '') + (job.host ? ' · ' + job.host : '');
|
||||
const progress = job.written_bytes ? (fmtBytes(job.written_bytes) + ' / ' + fmtBytes(job.total_bytes)) : '';
|
||||
const detail = progress + (job.artifact ? ' · ' + job.artifact : '') + (job.error ? ' · ' + job.error : '');
|
||||
wrap.innerHTML =
|
||||
'<div class="item-head">' +
|
||||
'<span>' + title + '</span>' +
|
||||
'<span class="pill ' + statusClass + '">' + job.status + '</span>' +
|
||||
'</div>' +
|
||||
'<div>' + (job.message || job.stage || 'queued') + '</div>' +
|
||||
'<div class="meta">' + started + '</div>' +
|
||||
'<div class="meta">' + detail + '</div>' +
|
||||
'<div class="bar"><span style="width:' + Math.max(0, Math.min(100, job.progress_pct || 0)) + '%"></span></div>';
|
||||
jobsEl.appendChild(wrap);
|
||||
});
|
||||
|
||||
eventsEl.innerHTML = '';
|
||||
state.events.forEach((event)=>{
|
||||
const wrap = document.createElement('div');
|
||||
wrap.className = 'item';
|
||||
wrap.innerHTML =
|
||||
'<div class="item-head">' +
|
||||
'<span>' + event.summary + '</span>' +
|
||||
'<span class="meta">' + fmtTime(event.time) + '</span>' +
|
||||
'</div>' +
|
||||
'<div class="meta"><code>' + event.kind + '</code></div>';
|
||||
eventsEl.appendChild(wrap);
|
||||
});
|
||||
snapshotCountEl.textContent = state.snapshots.length;
|
||||
targetCountEl.textContent = Object.keys(state.targets || {}).length;
|
||||
}
|
||||
async function refreshState(){
|
||||
const host = hostSelect.value || state.default_flash_host;
|
||||
const resp = await fetch('/api/state?host=' + encodeURIComponent(host));
|
||||
if(resp.ok){
|
||||
state = await resp.json();
|
||||
render();
|
||||
}
|
||||
}
|
||||
async function post(path, body){
|
||||
const resp = await fetch(path, {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body: JSON.stringify(body)
|
||||
});
|
||||
if(!resp.ok){
|
||||
const text = await resp.text();
|
||||
throw new Error(text || ('Request failed for ' + path));
|
||||
}
|
||||
return resp.json();
|
||||
}
|
||||
|
||||
document.getElementById('refresh-devices').addEventListener('click', async ()=>{
|
||||
await refreshState();
|
||||
});
|
||||
document.getElementById('build-only').addEventListener('click', async ()=>{
|
||||
await post('/api/jobs/build', {node: nodeSelect.value});
|
||||
await refreshState();
|
||||
});
|
||||
document.getElementById('replace-run').addEventListener('click', async ()=>{
|
||||
await post('/api/jobs/replace', {node: nodeSelect.value, host: hostSelect.value, device: deviceSelect.value});
|
||||
await refreshState();
|
||||
});
|
||||
document.getElementById('sentinel-watch').addEventListener('click', async ()=>{
|
||||
await post('/api/sentinel/watch', {});
|
||||
await refreshState();
|
||||
});
|
||||
hostSelect.addEventListener('change', refreshState);
|
||||
render();
|
||||
setInterval(refreshState, 5000);
|
||||
</script>
|
||||
</body>
|
||||
</html>`))
|
||||
146
pkg/service/server_test.go
Normal file
146
pkg/service/server_test.go
Normal file
@ -0,0 +1,146 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"metis/pkg/sentinel"
|
||||
)
|
||||
|
||||
// TestUIAuthGuardsState verifies the auth guard on /api/state: a request
// without proxy identity headers must be rejected with 403, while a request
// whose group appears in the test settings' AllowedGroups ("admin") succeeds.
func TestUIAuthGuardsState(t *testing.T) {
	app := newTestApp(t)
	handler := app.Handler()

	// No identity headers at all -> access must be refused.
	req := httptest.NewRequest(http.MethodGet, "/api/state", nil)
	resp := httptest.NewRecorder()
	handler.ServeHTTP(resp, req)
	if resp.Code != http.StatusForbidden {
		t.Fatalf("expected forbidden, got %d", resp.Code)
	}

	// Same request carrying a user in an allowed group -> must succeed.
	req = httptest.NewRequest(http.MethodGet, "/api/state", nil)
	req.Header.Set("X-Auth-Request-User", "brad")
	req.Header.Set("X-Auth-Request-Groups", "admin")
	resp = httptest.NewRecorder()
	handler.ServeHTTP(resp, req)
	if resp.Code != http.StatusOK {
		t.Fatalf("expected ok, got %d: %s", resp.Code, resp.Body.String())
	}
}
|
||||
|
||||
// TestInternalSnapshotAndWatch drives the internal sentinel endpoints
// end-to-end: posts a node snapshot, triggers a watch run, checks the watch
// response decodes to an Event of kind "sentinel.watch", and confirms both
// operations are counted in /metrics.
func TestInternalSnapshotAndWatch(t *testing.T) {
	app := newTestApp(t)
	handler := app.Handler()

	// Ingest a snapshot for titan-15 via the internal (unauthenticated) route.
	payload := `{"node":"titan-15","collected_at":"2026-03-31T12:00:00Z","snapshot":{"hostname":"titan-15","kernel":"6.6.63","os_image":"Armbian","k3s_version":"v1.31.5+k3s1","containerd":"2.0.0","package_sample":{"containerd":"2.0.0"}}}`
	req := httptest.NewRequest(http.MethodPost, "/internal/sentinel/snapshot", strings.NewReader(payload))
	req.Header.Set("Content-Type", "application/json")
	resp := httptest.NewRecorder()
	handler.ServeHTTP(resp, req)
	if resp.Code != http.StatusOK {
		t.Fatalf("snapshot failed: %d %s", resp.Code, resp.Body.String())
	}

	// Trigger a manual sentinel watch run.
	req = httptest.NewRequest(http.MethodPost, "/internal/sentinel/watch", nil)
	resp = httptest.NewRecorder()
	handler.ServeHTTP(resp, req)
	if resp.Code != http.StatusOK {
		t.Fatalf("watch failed: %d %s", resp.Code, resp.Body.String())
	}

	// The watch route responds with the event it recorded.
	var event Event
	if err := json.Unmarshal(resp.Body.Bytes(), &event); err != nil {
		t.Fatalf("decode watch response: %v", err)
	}
	if event.Kind != "sentinel.watch" {
		t.Fatalf("unexpected event kind: %s", event.Kind)
	}

	// Both operations must be reflected in the Prometheus metrics output.
	metricsReq := httptest.NewRequest(http.MethodGet, "/metrics", nil)
	metricsResp := httptest.NewRecorder()
	handler.ServeHTTP(metricsResp, metricsReq)
	body := metricsResp.Body.String()
	if !strings.Contains(body, `metis_sentinel_snapshots_total{node="titan-15",status="ok"} 1`) {
		t.Fatalf("missing snapshot metric: %s", body)
	}
	if !strings.Contains(body, `metis_sentinel_watch_total{status="ok"} 1`) {
		t.Fatalf("missing watch metric: %s", body)
	}
}
|
||||
|
||||
// newTestApp builds a fully wired *App backed by temp-dir state files and a
// one-node inventory (titan-15, class rpi4) whose base image is a tiny local
// file with a matching sha256 checksum. A historical snapshot for titan-17 is
// seeded so sentinel-related state is non-empty.
func newTestApp(t *testing.T) *App {
	t.Helper()
	dir := t.TempDir()
	// Fake base image; the checksum below is computed from this exact content.
	baseImage := filepath.Join(dir, "base.img")
	if err := os.WriteFile(baseImage, []byte("test-image"), 0o644); err != nil {
		t.Fatal(err)
	}
	sum := sha256.Sum256([]byte("test-image"))
	inventoryPath := filepath.Join(dir, "inventory.yaml")
	inv := `
classes:
  - name: rpi4
    arch: arm64
    os: armbian
    image: file://` + baseImage + `
    checksum: sha256:` + hex.EncodeToString(sum[:]) + `
    k3s_version: v1.31.5+k3s1
nodes:
  - name: titan-15
    class: rpi4
    hostname: titan-15
    ip: 192.168.22.43
    k3s_role: agent
    k3s_url: https://192.168.22.7:6443
    k3s_token: token
    ssh_user: atlas
`
	if err := os.WriteFile(inventoryPath, []byte(inv), 0o644); err != nil {
		t.Fatal(err)
	}
	// Every persistent path points into the test temp dir; no AllowedUsers, so
	// access in tests is granted purely via AllowedGroups.
	settings := Settings{
		BindAddr:         ":0",
		InventoryPath:    inventoryPath,
		CacheDir:         filepath.Join(dir, "cache"),
		ArtifactDir:      filepath.Join(dir, "artifacts"),
		HistoryPath:      filepath.Join(dir, "history.jsonl"),
		SnapshotsPath:    filepath.Join(dir, "snapshots.json"),
		TargetsPath:      filepath.Join(dir, "targets.json"),
		DefaultFlashHost: "titan-22",
		FlashHosts:       []string{"titan-22"},
		LocalHost:        "titan-22",
		AllowedGroups:    []string{"admin", "maintainer"},
		MaxDeviceBytes:   300000000000,
	}
	app, err := NewApp(settings)
	if err != nil {
		t.Fatalf("new app: %v", err)
	}
	// Seed one pre-existing snapshot so the snapshot store is not empty.
	if err := app.StoreSnapshot(SnapshotRecord{
		Node:        "titan-17",
		CollectedAt: time.Now().UTC().Add(-10 * time.Minute),
		Snapshot:    sentinelSnapshot("titan-17", "6.6.63"),
	}); err != nil {
		t.Fatalf("seed snapshot: %v", err)
	}
	return app
}
|
||||
|
||||
// sentinelSnapshot builds a sentinel.Snapshot fixture for the given hostname
// and kernel, with the remaining fields pinned to the versions used across
// these tests (Armbian, k3s v1.31.5+k3s1, containerd 2.0.0).
func sentinelSnapshot(hostname, kernel string) sentinel.Snapshot {
	return sentinel.Snapshot{
		Hostname:      hostname,
		Kernel:        kernel,
		OSImage:       "Armbian",
		K3sVersion:    "v1.31.5+k3s1",
		Containerd:    "2.0.0",
		PackageSample: map[string]string{"containerd": "2.0.0"},
	}
}
|
||||
91
pkg/service/settings.go
Normal file
91
pkg/service/settings.go
Normal file
@ -0,0 +1,91 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Settings configures the Metis service runtime.
// All fields are populated by FromEnv from METIS_* environment variables,
// with defaults suitable for both local development and in-cluster use.
type Settings struct {
	BindAddr         string   // HTTP listen address (default ":8080")
	InventoryPath    string   // path to the node/class inventory YAML
	CacheDir         string   // cache directory (defaults under METIS_DATA_DIR)
	ArtifactDir      string   // output directory for build artifacts
	HistoryPath      string   // event-history file (JSONL)
	SnapshotsPath    string   // persisted sentinel snapshots (JSON)
	TargetsPath      string   // persisted class targets (JSON)
	DefaultFlashHost string   // flash host preselected when none is chosen
	FlashHosts       []string // hosts available for flashing media
	LocalHost        string   // name of the host this service runs on
	AllowedUsers     []string // user names the auth check accepts directly
	AllowedGroups    []string // group names the auth check accepts
	MaxDeviceBytes   int64    // size ceiling in bytes — presumably a flash-safety guard; confirm usage
}
|
||||
|
||||
// FromEnv builds service settings with sensible defaults for local dev and in-cluster use.
|
||||
func FromEnv() Settings {
|
||||
dataDir := getenvDefault("METIS_DATA_DIR", "/var/lib/metis")
|
||||
localHost := getenvDefault("METIS_LOCAL_HOST", hostnameOr("unknown"))
|
||||
defaultFlashHost := getenvDefault("METIS_DEFAULT_FLASH_HOST", localHost)
|
||||
flashHosts := splitList(getenvDefault("METIS_FLASH_HOSTS", defaultFlashHost))
|
||||
return Settings{
|
||||
BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"),
|
||||
InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"),
|
||||
CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")),
|
||||
ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")),
|
||||
HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")),
|
||||
SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")),
|
||||
TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")),
|
||||
DefaultFlashHost: defaultFlashHost,
|
||||
FlashHosts: flashHosts,
|
||||
LocalHost: localHost,
|
||||
AllowedUsers: splitList(getenvDefault("METIS_ALLOWED_USERS", "")),
|
||||
AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintainer")),
|
||||
MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000),
|
||||
}
|
||||
}
|
||||
|
||||
func getenvDefault(key, fallback string) string {
|
||||
if value := strings.TrimSpace(os.Getenv(key)); value != "" {
|
||||
return value
|
||||
}
|
||||
return fallback
|
||||
}
|
||||
|
||||
func getenvInt64(key string, fallback int64) int64 {
|
||||
raw := strings.TrimSpace(os.Getenv(key))
|
||||
if raw == "" {
|
||||
return fallback
|
||||
}
|
||||
value, err := strconv.ParseInt(raw, 10, 64)
|
||||
if err != nil {
|
||||
return fallback
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
// splitList splits a comma-separated string into trimmed, non-empty items.
// Blank input yields nil; non-blank input with only separators yields an
// empty (non-nil) slice.
func splitList(raw string) []string {
	if strings.TrimSpace(raw) == "" {
		return nil
	}
	pieces := strings.Split(raw, ",")
	items := make([]string, 0, len(pieces))
	for _, piece := range pieces {
		if trimmed := strings.TrimSpace(piece); trimmed != "" {
			items = append(items, trimmed)
		}
	}
	return items
}
|
||||
|
||||
func hostnameOr(fallback string) string {
|
||||
name, err := os.Hostname()
|
||||
if err != nil || strings.TrimSpace(name) == "" {
|
||||
return fallback
|
||||
}
|
||||
return strings.TrimSpace(name)
|
||||
}
|
||||
83
pkg/writer/writer.go
Normal file
83
pkg/writer/writer.go
Normal file
@ -0,0 +1,83 @@
|
||||
package writer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ProgressFunc receives write progress updates.
|
||||
type ProgressFunc func(written int64, total int64)
|
||||
|
||||
// WriteImage writes src into dest using a direct buffered copy so callers can
|
||||
// share the same codepath for files and block devices.
|
||||
func WriteImage(ctx context.Context, src, dest string) error {
|
||||
return WriteImageWithProgress(ctx, src, dest, nil)
|
||||
}
|
||||
|
||||
// WriteImageWithProgress writes src into dest while invoking progress after each chunk.
|
||||
func WriteImageWithProgress(ctx context.Context, src, dest string, progress ProgressFunc) error {
|
||||
if dest == "" {
|
||||
return fmt.Errorf("destination required")
|
||||
}
|
||||
srcInfo, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("source missing: %w", err)
|
||||
}
|
||||
|
||||
return copyFile(ctx, src, dest, srcInfo.Size(), progress)
|
||||
}
|
||||
|
||||
func isDevicePath(path string) bool {
|
||||
return strings.HasPrefix(filepath.Clean(path), "/dev/")
|
||||
}
|
||||
|
||||
func copyFile(ctx context.Context, src, dest string, total int64, progress ProgressFunc) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
out, err := os.Create(dest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
var written int64
|
||||
for {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
nr, readErr := in.Read(buf)
|
||||
if nr > 0 {
|
||||
nw, writeErr := out.Write(buf[:nr])
|
||||
if writeErr != nil {
|
||||
return writeErr
|
||||
}
|
||||
if nw != nr {
|
||||
return io.ErrShortWrite
|
||||
}
|
||||
written += int64(nw)
|
||||
if progress != nil {
|
||||
progress(written, total)
|
||||
}
|
||||
}
|
||||
if readErr != nil {
|
||||
if readErr == io.EOF {
|
||||
break
|
||||
}
|
||||
return readErr
|
||||
}
|
||||
}
|
||||
if err := out.Sync(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
28
pkg/writer/writer_test.go
Normal file
28
pkg/writer/writer_test.go
Normal file
@ -0,0 +1,28 @@
|
||||
package writer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestWriteImageCopiesFile checks the happy path of WriteImage: a regular
// source file is copied byte-for-byte into the destination path.
func TestWriteImageCopiesFile(t *testing.T) {
	dir := t.TempDir()
	src := filepath.Join(dir, "src.img")
	dest := filepath.Join(dir, "dest.img")
	content := []byte("metis-test")
	if err := os.WriteFile(src, content, 0o644); err != nil {
		t.Fatalf("write src: %v", err)
	}
	if err := WriteImage(context.Background(), src, dest); err != nil {
		t.Fatalf("write image: %v", err)
	}
	// The destination must be an exact copy of the source bytes.
	got, err := os.ReadFile(dest)
	if err != nil {
		t.Fatalf("read dest: %v", err)
	}
	if string(got) != string(content) {
		t.Fatalf("expected %q got %q", string(content), string(got))
	}
}
|
||||
BIN
scripts/__pycache__/publish_test_metrics.cpython-314.pyc
Normal file
BIN
scripts/__pycache__/publish_test_metrics.cpython-314.pyc
Normal file
Binary file not shown.
73
scripts/prepare_titan_rpi4_replacement.sh
Executable file
73
scripts/prepare_titan_rpi4_replacement.sh
Executable file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env bash
# Build a node-specific recovery image for a Titan rpi4 Longhorn worker and
# optionally copy it to a remote flashing station.
set -euo pipefail

usage() {
  cat <<'EOF'
Usage: prepare_titan_rpi4_replacement.sh <node> [remote-host]

Build a node-specific recovery image for a Titan rpi4 Longhorn worker and
optionally copy it to a remote flashing station such as `tethys`.

Examples:
  ./scripts/prepare_titan_rpi4_replacement.sh titan-13
  ./scripts/prepare_titan_rpi4_replacement.sh titan-19 tethys
EOF
}

if [ "${1:-}" = "" ] || [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
  usage
  exit 0
fi

node="$1"
remote_host="${2:-}"
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cache_dir="${METIS_CACHE_DIR:-${HOME}/.cache/metis}"
remote_dir="${METIS_REMOTE_DIR:-/tmp/metis-images}"

# Only the known Longhorn replacement targets may be rebuilt by this script.
case "${node}" in
  titan-13|titan-19)
    ;;
  *)
    echo "Refusing unknown replacement target: ${node}" >&2
    exit 1
    ;;
esac

cd "${repo_root}"

# Default base image location, overridable from the environment.
if [ -z "${METIS_IMAGE_RPI4_ARMBIAN_LONGHORN:-}" ]; then
  export METIS_IMAGE_RPI4_ARMBIAN_LONGHORN="file://${HOME}/Downloads/Armbian_25.8.1_Rpi4b_noble_current_6.12.41.img"
fi

if [ -z "${METIS_K3S_TOKEN:-}" ]; then
  # Assign and export separately: `export VAR="$(cmd)"` would mask a failing
  # ssh under `set -e` because export's own exit status wins (ShellCheck SC2155).
  METIS_K3S_TOKEN="$(ssh titan-0a 'sudo cat /var/lib/rancher/k3s/server/node-token')"
  export METIS_K3S_TOKEN
fi

echo "Deleting stale Kubernetes node object for ${node}..."
kubectl delete node "${node}" --ignore-not-found

echo "Building recovery image for ${node}..."
go run ./cmd/metis image \
  --inventory inventory.titan-rpi4.yaml \
  --node "${node}" \
  --cache "${cache_dir}" \
  --output "artifacts/${node}.img"

sha256sum "artifacts/${node}.img"

if [ -n "${remote_host}" ]; then
  echo "Copying artifacts/${node}.img to ${remote_host}:${remote_dir}/ ..."
  ssh "${remote_host}" "mkdir -p '${remote_dir}'"
  scp "artifacts/${node}.img" "${remote_host}:${remote_dir}/${node}.img"
fi

cat <<EOF

Prepared artifacts/${node}.img

Next steps:
  1. Ask for the SD card to be inserted into the flashing station.
  2. Run ./scripts/remote_sd_candidates.sh ${remote_host:-tethys}
  3. Run ./scripts/remote_flash_titan_image.sh ${remote_host:-tethys} ${node} /dev/sdX
EOF
|
||||
115
scripts/publish_test_metrics.py
Normal file
115
scripts/publish_test_metrics.py
Normal file
@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
|
||||
def _escape_label(value: str) -> str:
|
||||
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
|
||||
|
||||
|
||||
def _label_str(labels: dict[str, str]) -> str:
    """Render labels as a Prometheus label block ('{k="v",...}').

    Entries with empty values are dropped; when nothing survives the filter
    an empty string is returned (no braces).
    """
    rendered = [
        f'{name}="{_escape_label(text)}"'
        for name, text in labels.items()
        if text
    ]
    if not rendered:
        return ""
    return "{" + ",".join(rendered) + "}"
|
||||
|
||||
|
||||
def _load_coverage(path: str) -> float:
|
||||
with open(path, "r", encoding="utf-8") as handle:
|
||||
payload = json.load(handle)
|
||||
summary = payload.get("summary") or {}
|
||||
percent = summary.get("percent_covered")
|
||||
if isinstance(percent, (int, float)):
|
||||
return float(percent)
|
||||
raise RuntimeError("coverage summary missing percent_covered")
|
||||
|
||||
|
||||
def _load_junit(path: str) -> dict[str, int]:
|
||||
tree = ET.parse(path)
|
||||
root = tree.getroot()
|
||||
|
||||
def _as_int(node, name: str) -> int:
|
||||
raw = node.attrib.get(name) or "0"
|
||||
try:
|
||||
return int(float(raw))
|
||||
except ValueError:
|
||||
return 0
|
||||
|
||||
suites = []
|
||||
if root.tag == "testsuite":
|
||||
suites = [root]
|
||||
elif root.tag == "testsuites":
|
||||
suites = list(root.findall("testsuite"))
|
||||
|
||||
totals = {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
|
||||
for suite in suites:
|
||||
totals["tests"] += _as_int(suite, "tests")
|
||||
totals["failures"] += _as_int(suite, "failures")
|
||||
totals["errors"] += _as_int(suite, "errors")
|
||||
totals["skipped"] += _as_int(suite, "skipped")
|
||||
return totals
|
||||
|
||||
|
||||
def _post_metrics(url: str, payload: str) -> None:
    """POST the plain-text metrics payload to the metrics import URL.

    Raises RuntimeError on an HTTP status >= 400. NOTE(review): urlopen
    already raises HTTPError for 4xx/5xx responses, so the explicit status
    check is a belt-and-braces guard that rarely fires.
    """
    req = urllib.request.Request(
        url,
        data=payload.encode("utf-8"),
        method="POST",
        headers={"Content-Type": "text/plain"},
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        if resp.status >= 400:
            raise RuntimeError(f"metrics push failed status={resp.status}")
|
||||
|
||||
|
||||
def main() -> int:
    """Collect CI coverage/JUnit results and push them as Prometheus metrics.

    Configuration comes from the environment: VM_IMPORT_URL (push target;
    when unset the push is skipped and 0 is returned), COVERAGE_JSON and
    JUNIT_XML (input file paths), plus CI metadata vars used as labels.

    Returns 0 on success; raises RuntimeError when an input file is missing.
    """
    vm_url = os.getenv("VM_IMPORT_URL", "").strip()
    if not vm_url:
        print("VM_IMPORT_URL not set; skipping metrics push")
        return 0

    coverage_path = os.getenv("COVERAGE_JSON", "build/coverage.json")
    junit_path = os.getenv("JUNIT_XML", "build/junit.xml")

    if not os.path.exists(coverage_path):
        raise RuntimeError(f"missing coverage file {coverage_path}")
    if not os.path.exists(junit_path):
        raise RuntimeError(f"missing junit file {junit_path}")

    coverage = _load_coverage(coverage_path)
    totals = _load_junit(junit_path)
    # Passed = everything not accounted for by a failure/error/skip; clamped
    # at zero in case a producer reports inconsistent totals.
    not_passed = totals["failures"] + totals["errors"] + totals["skipped"]
    passed = max(totals["tests"] - not_passed, 0)

    labels = {
        "job": os.getenv("CI_JOB_NAME", "metis"),
        "branch": os.getenv("BRANCH_NAME", ""),
        "build_number": os.getenv("BUILD_NUMBER", ""),
        "commit": os.getenv("GIT_COMMIT", ""),
        "repo": os.getenv("REPO_NAME", "metis"),
    }

    prefix = os.getenv("METRICS_PREFIX", "ariadne_ci")
    lines = [f"{prefix}_coverage_percent{_label_str(labels)} {coverage:.3f}"]
    for result, count in (
        ("passed", passed),
        ("failed", totals["failures"]),
        ("error", totals["errors"]),
        ("skipped", totals["skipped"]),
    ):
        lines.append(
            f"{prefix}_tests_total{_label_str({**labels, 'result': result})} {count}"
        )
    lines.append(f"{prefix}_build_info{_label_str(labels)} 1")

    _post_metrics(vm_url, "\n".join(lines) + "\n")
    print("metrics push complete")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Any failure becomes a printed message and exit code 1 so CI logs stay
    # readable. SystemExit from sys.exit(main()) is not an Exception and
    # therefore propagates untouched.
    try:
        sys.exit(main())
    except Exception as exc:
        print(f"metrics push failed: {exc}")
        sys.exit(1)
|
||||
86
scripts/remote_flash_titan_image.sh
Executable file
86
scripts/remote_flash_titan_image.sh
Executable file
@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env bash
# Copy a prepared Titan replacement image to a remote flashing station and
# write it to a removable block device there. Multiple safety checks guard
# against flashing the wrong disk: device-path pattern, lsblk identity and
# size limits, removable/hotplug detection, and a post-copy checksum.
set -euo pipefail

usage() {
  cat <<'EOF'
Usage: remote_flash_titan_image.sh <remote-host> <node> <device>

Copy a prepared Titan replacement image to a remote flashing station and write it
to the specified removable block device.

Example:
  ./scripts/remote_flash_titan_image.sh tethys titan-13 /dev/sdd
EOF
}

# -h/--help is a successful exit; missing required arguments is an error
# (previously this exited 0, which would let a broken CI invocation pass).
case "${1:-}" in
  -h|--help)
    usage
    exit 0
    ;;
esac
if [ "${3:-}" = "" ]; then
  usage >&2
  exit 1
fi

remote_host="$1"
node="$2"
device="$3"
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
remote_dir="${METIS_REMOTE_DIR:-/tmp/metis-images}"
image_path="${repo_root}/artifacts/${node}.img"
# Refuse anything bigger than this (bytes) — SD cards, not system disks.
max_bytes="${METIS_SD_MAX_BYTES:-300000000000}"

if [ ! -f "${image_path}" ]; then
  echo "Missing local image: ${image_path}" >&2
  exit 1
fi

# Only whole-disk device paths that look like removable media.
case "${device}" in
  /dev/sd*|/dev/mmcblk*|/dev/nvme*n1)
    ;;
  *)
    echo "Refusing suspicious device path: ${device}" >&2
    exit 1
    ;;
esac

device_info="$(ssh "${remote_host}" "lsblk -b -dn -o NAME,TRAN,RM,HOTPLUG,SIZE '${device}' 2>/dev/null" || true)"
if [ -z "${device_info}" ]; then
  echo "Could not inspect remote device ${device} on ${remote_host}" >&2
  exit 1
fi

read -r remote_name remote_tran remote_rm remote_hotplug remote_size <<<"${device_info}"
if [ "/dev/${remote_name}" != "${device}" ]; then
  echo "Remote device mismatch: expected ${device}, got /dev/${remote_name}" >&2
  exit 1
fi
if [ "${remote_size}" -gt "${max_bytes}" ]; then
  echo "Refusing to flash ${device}: size ${remote_size} is larger than ${max_bytes} bytes" >&2
  exit 1
fi
# Accept the device if ANY removability signal is present (usb transport,
# RM flag, or hotplug flag); refuse only when all three are absent.
if [ "${remote_tran}" != "usb" ] && [ "${remote_rm}" != "1" ] && [ "${remote_hotplug}" != "1" ]; then
  echo "Refusing to flash ${device}: not detected as removable/hotplug media (${device_info})" >&2
  exit 1
fi

echo "Copying ${image_path} to ${remote_host}:${remote_dir}/${node}.img ..."
ssh "${remote_host}" "mkdir -p '${remote_dir}'"
scp "${image_path}" "${remote_host}:${remote_dir}/${node}.img"

# Verify the copy before touching the device. NOTE: the awk field must reach
# the remote shell as a literal $1, so escape it once (\$1). The previous
# \\$1 produced a literal backslash followed by LOCAL expansion of $1 (the
# remote host name), sending remote awk a broken program and failing the
# checksum step every time under `set -e`.
local_sha="$(sha256sum "${image_path}" | awk '{print $1}')"
remote_sha="$(ssh "${remote_host}" "sha256sum '${remote_dir}/${node}.img' | awk '{print \$1}'")"
if [ "${local_sha}" != "${remote_sha}" ]; then
  echo "Checksum mismatch after copy: local=${local_sha} remote=${remote_sha}" >&2
  exit 1
fi

echo "About to flash ${node}.img to ${device} on ${remote_host}."
echo "You will be prompted for the remote sudo password."
# -t allocates a TTY so sudo can prompt; fsync+sync+flushbufs ensure the
# image is fully on the card before the helper pulls it.
ssh -t "${remote_host}" "lsblk '${device}' && sudo dd if='${remote_dir}/${node}.img' of='${device}' bs=4M conv=fsync status=progress && sync && sudo blockdev --flushbufs '${device}'"

cat <<EOF

Flash complete for ${node} on ${remote_host}:${device}

Next steps:
  1. Tell your helper to remove the flashed card and swap it into ${node}.
  2. Tell them to restore power to the Pi.
  3. Watch the node with: kubectl get nodes -w
EOF
|
||||
12
scripts/remote_sd_candidates.sh
Executable file
12
scripts/remote_sd_candidates.sh
Executable file
@ -0,0 +1,12 @@
|
||||
#!/usr/bin/env bash
# List candidate SD-card devices on a remote flashing station: USB-attached,
# hotplug-flagged block devices no larger than METIS_SD_MAX_BYTES.
# Output: one tab-separated line per device: /dev/NAME  SIZE  MODEL  SERIAL.
set -euo pipefail

# First argument is the remote host (default "tethys").
remote_host="${1:-tethys}"
# Upper size bound in bytes; filters out large external disks. Expanded
# LOCALLY into the remote command below — that is intentional.
max_bytes="${METIS_SD_MAX_BYTES:-300000000000}"

# The \" and \$ escapes defer expansion of the loop variables to the REMOTE
# shell; only ${max_bytes} is substituted before the command is sent.
# NOTE(review): lsblk -S restricts output to SCSI(-like)/USB devices, so
# built-in mmcblk card readers will not be listed — presumably intentional
# since we want USB writers; confirm. numfmt falls back to raw bytes when
# unavailable on the remote host.
ssh "${remote_host}" "lsblk -S -b -dn -o NAME,TRAN,RM,HOTPLUG,SIZE,MODEL,SERIAL | while read -r name tran rm hotplug size model serial; do
if [ \"\${tran}\" = usb ] && [ \"\${hotplug}\" = 1 ] && [ \"\${size}\" -le ${max_bytes} ]; then
human=\$(numfmt --to=iec --suffix=B \"\${size}\" 2>/dev/null || printf '%sB' \"\${size}\")
printf '/dev/%s\t%s\t%s\t%s\n' \"\${name}\" \"\${human}\" \"\${model}\" \"\${serial}\"
fi
done"
|
||||
Loading…
x
Reference in New Issue
Block a user