Compare commits

No commits in common. "8e39c6a28b2cf2b8d2e8661f402144fc9fe1a473" and "29d22ba53909e9c2bab39e109bf7fcb5d4becfa9" have entirely different histories.

10 changed files with 127 additions and 424 deletions

View File

@@ -0,0 +1,5 @@
+# Oceanus Cluster Scaffold
+This directory prepares the Flux and Kustomize layout for a future Oceanus-managed cluster.
+Populate `flux-system/` with `gotk-components.yaml` and related manifests after running `flux bootstrap`.
+Define node-specific resources under `infrastructure/modules/profiles/oceanus-validator/` and reference workloads in `applications/` as they come online.
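The README above only names the layout; nothing in this compare creates the directories yet. A minimal sketch of pre-creating that scaffold, assuming the paths are rooted in the repository checkout (whether `infrastructure/` and `applications/` sit under `clusters/oceanus/` or at the repo root is an assumption here), with `.gitkeep` placeholders so the empty directories are trackable:

```python
# Sketch only: pre-create the directories named in the README so that
# `flux bootstrap` output and later Kustomize overlays have a place to land.
# The exact root ("clusters/oceanus/" vs the repo root) is an assumption.
from pathlib import Path

SCAFFOLD = [
    "clusters/oceanus/flux-system",
    "clusters/oceanus/infrastructure/modules/profiles/oceanus-validator",
    "clusters/oceanus/applications",
]

for rel in SCAFFOLD:
    path = Path(rel)
    path.mkdir(parents=True, exist_ok=True)
    (path / ".gitkeep").touch()  # keep the empty directory visible to git
print("scaffold ready; run `flux bootstrap` to populate flux-system/")
```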

View File

@@ -2,14 +2,15 @@
 | Hostname   | Role / Function                | Managed By          | Notes |
 |------------|--------------------------------|---------------------|-------|
-| titan-db   | HA control plane database      | Ansible             | PostgreSQL / etcd backing services |
 | titan-0a   | Kubernetes control-plane       | Flux (atlas cluster)| HA leader, tainted for control only |
 | titan-0b   | Kubernetes control-plane       | Flux (atlas cluster)| Standby control node |
 | titan-0c   | Kubernetes control-plane       | Flux (atlas cluster)| Standby control node |
 | titan-04-19| Raspberry Pi workers           | Flux (atlas cluster)| Workload nodes, labelled per hardware |
 | titan-20&21| NVIDIA Jetson workers          | Flux (atlas cluster)| Workload nodes, labelled per hardware |
 | titan-22   | GPU mini-PC (Jellyfin)         | Flux + Ansible      | NVIDIA runtime managed via `modules/profiles/atlas-ha` |
-| titan-23   | Dedicated SUI validator Oceanus| Manual + Ansible    | Baremetal validator workloads, exposes metrics to atlas |
 | titan-24   | Tethys hybrid node             | Flux + Ansible      | Runs SUI metrics via K8s, validator via Ansible |
-| titan-jh   | Jumphost & bastion & lesavka   | Ansible             | Entry point / future KVM services / custom kvm - lesavaka |
+| titan-db   | HA control plane database      | Ansible             | PostgreSQL / etcd backing services |
+| titan-jh   | Jumphost & bastion             | Ansible             | Entry point / future KVM services |
+| oceanus    | Dedicated SUI validator host   | Ansible / Flux prep | Baremetal validator workloads, exposes metrics to atlas; Kustomize scaffold under `clusters/oceanus/` |
 | styx       | Air-gapped workstation         | Manual / Scripts    | Remains isolated, scripts tracked in `hosts/styx` |
+Use the `clusters/` directory for cluster-scoped state and the `hosts/` directory for baremetal orchestration.

View File

@@ -81,7 +81,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
 WORKER_SUFFIX = f"/{WORKER_TOTAL}"
 CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
 LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
-GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
+GAUGE_WIDTHS = [5, 5, 5, 5, 4]
 CONTROL_WORKLOADS_EXPR = (
     f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
 )
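The `GAUGE_WIDTHS` change tracks the panel removals later in this diff: seven overview panels shrink to five, and both lists still have to fill Grafana's 24-column dashboard grid. A quick standalone sanity check (not part of the generator):

```python
# Both width lists must sum to Grafana's 24-column grid; the x offset of each
# panel is the running sum of the widths before it (what gauge_grid() below does).
OLD_WIDTHS = [4, 3, 3, 4, 3, 3, 4]   # 7 panels
NEW_WIDTHS = [5, 5, 5, 5, 4]         # 5 panels after dropping the uptime and crashloop stats

for widths in (OLD_WIDTHS, NEW_WIDTHS):
    assert sum(widths) == 24, widths
    offsets = [sum(widths[:i]) for i in range(len(widths))]
    print(widths, "->", offsets)

# NEW_WIDTHS -> offsets [0, 5, 10, 15, 20], matching the gridPos "x" values
# rewritten in the dashboard JSON hunks further down in this compare.
```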
@@ -198,18 +198,6 @@ STUCK_TERMINATING_EXPR = (
     ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
     '))'
 )
-UPTIME_WINDOW = "30d"
-UPTIME_AVG_EXPR = f"avg(avg_over_time(up[{UPTIME_WINDOW}]))"
-UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999))"
-UPTIME_THRESHOLDS = {
-    "mode": "absolute",
-    "steps": [
-        {"color": "red", "value": None},
-        {"color": "orange", "value": 2},
-        {"color": "yellow", "value": 3},
-        {"color": "green", "value": 3.5},
-    ],
-}
 PROBLEM_TABLE_EXPR = (
     "(time() - kube_pod_created{pod!=\"\"}) "
     "* on(namespace,pod) group_left(node) kube_pod_info "
@@ -567,88 +555,47 @@ def link_to(uid):
 def build_overview():
     panels = []
-    count_thresholds = {
-        "mode": "absolute",
-        "steps": [
-            {"color": "green", "value": None},
-            {"color": "yellow", "value": 1},
-            {"color": "orange", "value": 2},
-            {"color": "red", "value": 3},
-        ],
-    }
     row1_stats = [
-        {
-            "id": 1,
-            "title": "Workers Ready",
-            "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
-            "kind": "gauge",
-            "max_value": WORKER_TOTAL,
-            "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                    {"color": "red", "value": None},
-                    {"color": "orange", "value": WORKER_TOTAL - 2},
-                    {"color": "yellow", "value": WORKER_TOTAL - 1},
-                    {"color": "green", "value": WORKER_TOTAL},
-                ],
-            },
-        },
-        {
-            "id": 2,
-            "title": "Control Plane Ready",
-            "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
-            "kind": "gauge",
-            "max_value": CONTROL_TOTAL,
-            "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                    {"color": "red", "value": None},
-                    {"color": "green", "value": CONTROL_TOTAL},
-                ],
-            },
-        },
-        {
-            "id": 3,
-            "title": "Control Plane Workloads",
-            "expr": CONTROL_WORKLOADS_EXPR,
-            "kind": "stat",
-            "thresholds": count_thresholds,
-            "links": link_to("atlas-pods"),
-        },
-        {
-            "id": 27,
-            "title": "Atlas Uptime (30d, 9s)",
-            "expr": UPTIME_NINES_EXPR,
-            "kind": "stat",
-            "thresholds": UPTIME_THRESHOLDS,
-            "value_suffix": " 9s",
-            "text_mode": "value",
-        },
-        {
-            "id": 4,
-            "title": "Problem Pods",
-            "expr": PROBLEM_PODS_EXPR,
-            "kind": "stat",
-            "thresholds": count_thresholds,
-            "links": link_to("atlas-pods"),
-        },
-        {
-            "id": 6,
-            "title": "CrashLoop / ImagePull",
-            "expr": CRASHLOOP_EXPR,
-            "kind": "stat",
-            "thresholds": count_thresholds,
-            "links": link_to("atlas-pods"),
-        },
-        {
-            "id": 5,
-            "title": "Stuck Terminating",
-            "expr": STUCK_TERMINATING_EXPR,
-            "kind": "stat",
-            "thresholds": count_thresholds,
-            "links": link_to("atlas-pods"),
-        },
+        (
+            1,
+            "Workers Ready",
+            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
+            WORKER_SUFFIX,
+            WORKER_TOTAL,
+            None,
+        ),
+        (
+            2,
+            "Control Plane Ready",
+            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
+            CONTROL_SUFFIX,
+            CONTROL_TOTAL,
+            None,
+        ),
+        (
+            3,
+            "Control Plane Workloads",
+            CONTROL_WORKLOADS_EXPR,
+            None,
+            4,
+            link_to("atlas-pods"),
+        ),
+        (
+            4,
+            "Problem Pods",
+            PROBLEM_PODS_EXPR,
+            None,
+            1,
+            link_to("atlas-pods"),
+        ),
+        (
+            5,
+            "Stuck Terminating",
+            STUCK_TERMINATING_EXPR,
+            None,
+            1,
+            link_to("atlas-pods"),
+        ),
     ]
     def gauge_grid(idx):
@@ -656,36 +603,74 @@ def build_overview():
         x = sum(GAUGE_WIDTHS[:idx])
         return width, x
-    for idx, item in enumerate(row1_stats):
-        panel_id = item["id"]
+    for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
+        thresholds = None
+        min_value = 0
+        max_value = ok_value or 5
+        if panel_id == 1:
+            max_value = WORKER_TOTAL
+            thresholds = {
+                "mode": "absolute",
+                "steps": [
+                    {"color": "red", "value": None},
+                    {"color": "orange", "value": WORKER_TOTAL - 2},
+                    {"color": "yellow", "value": WORKER_TOTAL - 1},
+                    {"color": "green", "value": WORKER_TOTAL},
+                ],
+            }
+        elif panel_id == 2:
+            max_value = CONTROL_TOTAL
+            thresholds = {
+                "mode": "absolute",
+                "steps": [
+                    {"color": "red", "value": None},
+                    {"color": "green", "value": CONTROL_TOTAL},
+                ],
+            }
+        elif panel_id in (3, 4, 5):
+            max_value = 4
+            thresholds = {
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "yellow", "value": 1},
+                    {"color": "orange", "value": 2},
+                    {"color": "red", "value": 3},
+                ],
+            }
+        else:
+            thresholds = {
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "red", "value": max_value},
+                ],
+            }
         width, x = gauge_grid(idx)
-        grid = {"h": 5, "w": width, "x": x, "y": 0}
-        kind = item.get("kind", "gauge")
-        if kind == "stat":
+        if panel_id in (3, 4, 5):
             panels.append(
                 stat_panel(
                     panel_id,
-                    item["title"],
-                    item["expr"],
-                    grid,
-                    thresholds=item.get("thresholds"),
+                    title,
+                    expr,
+                    {"h": 5, "w": width, "x": x, "y": 0},
+                    thresholds=thresholds,
                     legend=None,
-                    links=item.get("links"),
-                    text_mode=item.get("text_mode", "value"),
-                    value_suffix=item.get("value_suffix"),
+                    links=links,
+                    text_mode="value",
                 )
             )
         else:
             panels.append(
                 gauge_panel(
                     panel_id,
-                    item["title"],
-                    item["expr"],
-                    grid,
-                    min_value=0,
-                    max_value=item.get("max_value", 5),
-                    thresholds=item.get("thresholds"),
-                    links=item.get("links"),
+                    title,
+                    expr,
+                    {"h": 5, "w": width, "x": x, "y": 0},
+                    min_value=min_value,
+                    max_value=max_value,
+                    thresholds=thresholds,
+                    links=links,
                 )
             )
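Taken together, the refactor replaces per-panel dicts with positional tuples `(panel_id, title, expr, suffix, ok_value, links)` and derives thresholds from the panel id. A standalone sketch of the resulting routing and layout, using stub expressions and an assumed worker count in place of the real constants:

```python
# Sketch of the new data flow: panels 1-2 stay gauges sized by their node totals,
# panels 3-5 become stat panels linked to the atlas-pods dashboard. The PromQL
# strings and WORKER_TOTAL below are stand-ins, not values from the repository.
GAUGE_WIDTHS = [5, 5, 5, 5, 4]
WORKER_TOTAL = 18   # assumed, for illustration only
CONTROL_TOTAL = 3   # titan-0a/0b/0c per the host table above

row1_stats = [
    (1, "Workers Ready", "<workers-ready expr>", f"/{WORKER_TOTAL}", WORKER_TOTAL, None),
    (2, "Control Plane Ready", "<control-ready expr>", f"/{CONTROL_TOTAL}", CONTROL_TOTAL, None),
    (3, "Control Plane Workloads", "<workloads expr>", None, 4, "/d/atlas-pods"),
    (4, "Problem Pods", "<problem-pods expr>", None, 1, "/d/atlas-pods"),
    (5, "Stuck Terminating", "<stuck-terminating expr>", None, 1, "/d/atlas-pods"),
]

for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
    width, x = GAUGE_WIDTHS[idx], sum(GAUGE_WIDTHS[:idx])
    kind = "stat" if panel_id in (3, 4, 5) else "gauge"
    print(f"{title:24s} {kind:5s} gridPos h=5 w={width} x={x}  links={links}")
```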

View File

@@ -5,7 +5,7 @@ metadata:
   name: gitea-ingress
   namespace: gitea
   annotations:
-    cert-manager.io/cluster-issuer: letsencrypt
+    cert-manager.io/cluster-issuer: "letsencrypt-prod"
     nginx.ingress.kubernetes.io/ssl-redirect: "true"
 spec:
   tls:

View File

@@ -5,7 +5,7 @@ metadata:
   name: jitsi
   namespace: jitsi
   annotations:
-    cert-manager.io/cluster-issuer: letsencrypt
+    cert-manager.io/cluster-issuer: "letsencrypt-prod"
 spec:
   ingressClassName: traefik
   tls:

View File

@@ -17,7 +17,7 @@
       },
       "gridPos": {
         "h": 5,
-        "w": 4,
+        "w": 5,
         "x": 0,
         "y": 0
       },
@@ -78,8 +78,8 @@
       },
       "gridPos": {
         "h": 5,
-        "w": 3,
-        "x": 4,
+        "w": 5,
+        "x": 5,
        "y": 0
       },
      "targets": [
@@ -131,8 +131,8 @@
       },
       "gridPos": {
         "h": 5,
-        "w": 3,
-        "x": 7,
+        "w": 5,
+        "x": 10,
        "y": 0
       },
      "targets": [
@@ -196,75 +196,6 @@
         }
       ]
     },
-    {
-      "id": 27,
-      "type": "stat",
-      "title": "Atlas Uptime (30d, 9s)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 5,
-        "w": 4,
-        "x": 10,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "-log10(1 - clamp_max(avg(avg_over_time(up[30d])), 0.999999))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "red",
-                "value": null
-              },
-              {
-                "color": "orange",
-                "value": 2
-              },
-              {
-                "color": "yellow",
-                "value": 3
-              },
-              {
-                "color": "green",
-                "value": 3.5
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto",
-            "valueSuffix": " 9s"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
     {
       "id": 4,
       "type": "stat",
@@ -275,8 +206,8 @@
       },
       "gridPos": {
         "h": 5,
-        "w": 3,
-        "x": 14,
+        "w": 5,
+        "x": 15,
        "y": 0
       },
      "targets": [
@@ -340,81 +271,6 @@
         }
       ]
     },
-    {
-      "id": 6,
-      "type": "stat",
-      "title": "CrashLoop / ImagePull",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 5,
-        "w": 3,
-        "x": 17,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 1
-              },
-              {
-                "color": "orange",
-                "value": 2
-              },
-              {
-                "color": "red",
-                "value": 3
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      },
-      "links": [
-        {
-          "title": "Open atlas-pods dashboard",
-          "url": "/d/atlas-pods",
-          "targetBlank": true
-        }
-      ]
-    },
     {
       "id": 5,
       "type": "stat",

View File

@@ -26,7 +26,7 @@ data:
       },
       "gridPos": {
         "h": 5,
-        "w": 4,
+        "w": 5,
        "x": 0,
        "y": 0
       },
@@ -87,8 +87,8 @@ data:
       },
       "gridPos": {
         "h": 5,
-        "w": 3,
-        "x": 4,
+        "w": 5,
+        "x": 5,
        "y": 0
       },
      "targets": [
@@ -140,8 +140,8 @@ data:
       },
       "gridPos": {
         "h": 5,
-        "w": 3,
-        "x": 7,
+        "w": 5,
+        "x": 10,
        "y": 0
       },
      "targets": [
@@ -205,75 +205,6 @@ data:
         }
       ]
     },
-    {
-      "id": 27,
-      "type": "stat",
-      "title": "Atlas Uptime (30d, 9s)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 5,
-        "w": 4,
-        "x": 10,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "-log10(1 - clamp_max(avg(avg_over_time(up[30d])), 0.999999))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "red",
-                "value": null
-              },
-              {
-                "color": "orange",
-                "value": 2
-              },
-              {
-                "color": "yellow",
-                "value": 3
-              },
-              {
-                "color": "green",
-                "value": 3.5
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto",
-            "valueSuffix": " 9s"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
     {
       "id": 4,
       "type": "stat",
@@ -284,8 +215,8 @@ data:
       },
       "gridPos": {
         "h": 5,
-        "w": 3,
-        "x": 14,
+        "w": 5,
+        "x": 15,
        "y": 0
       },
      "targets": [
@@ -349,81 +280,6 @@ data:
         }
       ]
     },
-    {
-      "id": 6,
-      "type": "stat",
-      "title": "CrashLoop / ImagePull",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 5,
-        "w": 3,
-        "x": 17,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 1
-              },
-              {
-                "color": "orange",
-                "value": 2
-              },
-              {
-                "color": "red",
-                "value": 3
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      },
-      "links": [
-        {
-          "title": "Open atlas-pods dashboard",
-          "url": "/d/atlas-pods",
-          "targetBlank": true
-        }
-      ]
-    },
     {
       "id": 5,
       "type": "stat",

View File

@@ -8,7 +8,7 @@ metadata:
     kubernetes.io/ingress.class: traefik
     traefik.ingress.kubernetes.io/router.entrypoints: websecure
     traefik.ingress.kubernetes.io/router.tls: "true"
-    cert-manager.io/cluster-issuer: letsencrypt
+    cert-manager.io/cluster-issuer: letsencrypt-prod
 spec:
   tls:
     - hosts: [ "pegasus.bstein.dev" ]

View File

@@ -8,7 +8,7 @@ spec:
   secretName: vault-server-tls
   issuerRef:
     kind: ClusterIssuer
-    name: letsencrypt
+    name: letsencrypt-prod
   commonName: secret.bstein.dev
   dnsNames:
     - secret.bstein.dev

View File

@@ -5,7 +5,7 @@ metadata:
   name: zot
   namespace: zot
   annotations:
-    cert-manager.io/cluster-issuer: letsencrypt
+    cert-manager.io/cluster-issuer: letsencrypt-prod
     traefik.ingress.kubernetes.io/router.entrypoints: websecure
     traefik.ingress.kubernetes.io/router.tls: "true"
     traefik.ingress.kubernetes.io/router.middlewares: zot-zot-resp-headers@kubernetescrd