From e8a580ee57f3f9abf29734e73f42fd7c8fd22d75 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 31 Mar 2026 13:54:58 -0300 Subject: [PATCH] longhorn: reconcile astreae and asteria disk tags --- .../longhorn/core/kustomization.yaml | 4 + .../core/longhorn-disk-tags-ensure-job.yaml | 36 +++++++ .../core/scripts/longhorn_disk_tags_ensure.py | 100 ++++++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml create mode 100644 infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py diff --git a/infrastructure/longhorn/core/kustomization.yaml b/infrastructure/longhorn/core/kustomization.yaml index deb5308b..6b0c572e 100644 --- a/infrastructure/longhorn/core/kustomization.yaml +++ b/infrastructure/longhorn/core/kustomization.yaml @@ -8,11 +8,15 @@ resources: - vault-sync-deployment.yaml - helmrelease.yaml - longhorn-settings-ensure-job.yaml + - longhorn-disk-tags-ensure-job.yaml configMapGenerator: - name: longhorn-settings-ensure-script files: - longhorn_settings_ensure.sh=scripts/longhorn_settings_ensure.sh + - name: longhorn-disk-tags-ensure-script + files: + - longhorn_disk_tags_ensure.py=scripts/longhorn_disk_tags_ensure.py generatorOptions: disableNameSuffixHash: true diff --git a/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml b/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml new file mode 100644 index 00000000..ec0bf098 --- /dev/null +++ b/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml @@ -0,0 +1,36 @@ +# infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: longhorn-disk-tags-ensure-1 + namespace: longhorn-system +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 3600 + template: + spec: + serviceAccountName: longhorn-service-account + restartPolicy: Never + volumes: + - name: longhorn-disk-tags-ensure-script + configMap: + name: longhorn-disk-tags-ensure-script + 
defaultMode: 0555 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + - key: node-role.kubernetes.io/worker + operator: Exists + containers: + - name: apply + image: python:3.12.9-alpine3.20 + command: ["python", "/scripts/longhorn_disk_tags_ensure.py"] + volumeMounts: + - name: longhorn-disk-tags-ensure-script + mountPath: /scripts + readOnly: true diff --git a/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py b/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py new file mode 100644 index 00000000..48a41a51 --- /dev/null +++ b/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""Reconcile Longhorn disk tags for the Titan longhorn storage classes. + +The astreae/asteria storageclasses select Longhorn disks by tag. The current +nodes already have the right disk paths, but the tag fields can drift to empty +after node recovery. This job patches the live Longhorn Node CRs back to the +expected tags so PVC provisioning keeps working. 
from __future__ import annotations

import json
import os
import ssl
import sys
import urllib.error
import urllib.request


LONGHORN_NS = "longhorn-system"
LONGHORN_API = "/apis/longhorn.io/v1beta2/namespaces/{namespace}/nodes"
# Disk mount path -> the one tag a disk at that path is expected to carry.
DESIRED_TAGS = {
    "/mnt/astreae": "astreae",
    "/mnt/asteria": "asteria",
}


def api_base() -> str:
    """Return the in-cluster Kubernetes API base URL.

    Reads KUBERNETES_SERVICE_HOST / KUBERNETES_SERVICE_PORT (port defaults to
    "443").

    Raises:
        SystemExit: if KUBERNETES_SERVICE_HOST is unset or empty.
    """
    host = os.environ.get("KUBERNETES_SERVICE_HOST")
    port = os.environ.get("KUBERNETES_SERVICE_PORT", "443")
    if not host:
        raise SystemExit("missing KUBERNETES_SERVICE_HOST")
    # A bare IPv6 literal must be bracketed, otherwise "host:port" is ambiguous
    # and the resulting URL is invalid.
    if ":" in host and not host.startswith("["):
        host = f"[{host}]"
    return f"https://{host}:{port}"


def token() -> str:
    """Return the service-account bearer token, read from its mounted file."""
    path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
    with open(path, "r", encoding="utf-8") as fh:
        return fh.read().strip()


def ca_context() -> ssl.SSLContext:
    """Return an SSL context that trusts the mounted service-account CA file."""
    cafile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
    return ssl.create_default_context(cafile=cafile)


def request_json(method: str, path: str, body: dict | None = None) -> dict:
    """Send one authenticated request to the API server and decode the reply.

    Args:
        method: HTTP verb, e.g. "GET" or "PATCH".
        path: Absolute API path, appended to api_base().
        body: Optional JSON-serialisable payload. When given it is sent as a
            JSON merge patch.

    Returns:
        The decoded JSON response, or an empty dict for an empty response.

    Raises:
        urllib.error.URLError: on connection failures and non-2xx responses
            (HTTPError is a URLError subclass).
    """
    headers = {
        "Authorization": f"Bearer {token()}",
        "Accept": "application/json",
    }
    data = None
    if body is not None:
        # Only declare a content type when there actually is content.
        headers["Content-Type"] = "application/merge-patch+json"
        data = json.dumps(body).encode("utf-8")

    req = urllib.request.Request(
        f"{api_base()}{path}",
        method=method,
        headers=headers,
        data=data,
    )
    with urllib.request.urlopen(req, context=ca_context(), timeout=20) as resp:
        payload = resp.read()
    return json.loads(payload) if payload else {}


def list_nodes() -> list[dict]:
    """Return every Longhorn Node object in LONGHORN_NS as a decoded dict."""
    data = request_json("GET", LONGHORN_API.format(namespace=LONGHORN_NS))
    # "items" may be absent or null; normalise both to an empty list.
    return data.get("items") or []


def patch_disk_tags(node_name: str, disk_name: str, desired_tag: str) -> None:
    """Set the tags of one disk on one Longhorn node to exactly [desired_tag].

    The body is a JSON merge patch, so only the named disk's "tags" field is
    touched; the list itself is replaced wholesale.
    """
    body = {"spec": {"disks": {disk_name: {"tags": [desired_tag]}}}}
    request_json(
        "PATCH",
        f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}",
        body=body,
    )


def desired_tag_for(disk_path: str | None) -> str | None:
    """Return the expected tag for a disk path, or None if it is unmanaged.

    Trailing slashes are ignored so "/mnt/astreae/" matches "/mnt/astreae".
    """
    if not isinstance(disk_path, str) or not disk_path:
        return None
    return DESIRED_TAGS.get(disk_path.rstrip("/"))


def plan_patches(nodes: list[dict]) -> tuple[list[tuple[str, str, str, list, str]], int]:
    """Work out which disks need their tags rewritten. Performs no API calls.

    Args:
        nodes: Longhorn Node objects as returned by list_nodes().

    Returns:
        A pair (patches, skipped). Each patch is a tuple
        (node_name, disk_name, disk_path, current_tags, desired_tag) for a
        managed disk whose tags differ from [desired_tag]. skipped counts the
        managed disks that already carry exactly the expected tags.
    """
    patches: list[tuple[str, str, str, list, str]] = []
    skipped = 0

    for node in nodes:
        name = (node.get("metadata") or {}).get("name", "")
        if not name:
            # Without a name the PATCH URL would address the whole collection.
            print("skipping node without metadata.name", file=sys.stderr)
            continue
        spec_disks = (node.get("spec") or {}).get("disks") or {}
        for disk_name, disk in spec_disks.items():
            disk = disk or {}
            disk_path = disk.get("path")
            desired_tag = desired_tag_for(disk_path)
            if not desired_tag:
                continue
            current_tags = disk.get("tags") or []
            if current_tags == [desired_tag]:
                skipped += 1
                continue
            patches.append((name, disk_name, disk_path, current_tags, desired_tag))

    return patches, skipped


def main() -> int:
    """Reconcile disk tags on all Longhorn nodes.

    Returns:
        0 when every required patch succeeded, 1 when at least one failed.
        A failure to list the nodes is not caught and propagates.
    """
    changed = 0
    failed = 0

    patches, skipped = plan_patches(list_nodes())
    for name, disk_name, disk_path, current_tags, desired_tag in patches:
        print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {[desired_tag]!r}")
        try:
            patch_disk_tags(name, disk_name, desired_tag)
        except (urllib.error.URLError, TimeoutError) as exc:
            # Keep going so one bad node does not leave the rest unreconciled.
            print(f"failed to patch {name}:{disk_name}: {exc}", file=sys.stderr)
            failed += 1
            continue
        changed += 1

    print(f"done: changed={changed} skipped={skipped}")
    if failed:
        print(f"{failed} patch(es) failed", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())