Compare commits
2 Commits
29d22ba539
...
8e39c6a28b
| Author | SHA1 | Date | |
|---|---|---|---|
| 8e39c6a28b | |||
| 38ab8e3364 |
@ -1,5 +0,0 @@
|
|||||||
# Oceanus Cluster Scaffold
|
|
||||||
|
|
||||||
This directory prepares the Flux and Kustomize layout for a future Oceanus-managed cluster.
|
|
||||||
Populate `flux-system/` with `gotk-components.yaml` and related manifests after running `flux bootstrap`.
|
|
||||||
Define node-specific resources under `infrastructure/modules/profiles/oceanus-validator/` and reference workloads in `applications/` as they come online.
|
|
||||||
@ -2,15 +2,14 @@
|
|||||||
|
|
||||||
| Hostname | Role / Function | Managed By | Notes |
|
| Hostname | Role / Function | Managed By | Notes |
|
||||||
|------------|--------------------------------|---------------------|-------|
|
|------------|--------------------------------|---------------------|-------|
|
||||||
|
| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
|
||||||
| titan-0a | Kubernetes control-plane | Flux (atlas cluster)| HA leader, tainted for control only |
|
| titan-0a | Kubernetes control-plane | Flux (atlas cluster)| HA leader, tainted for control only |
|
||||||
| titan-0b | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
|
| titan-0b | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
|
||||||
| titan-0c | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
|
| titan-0c | Kubernetes control-plane | Flux (atlas cluster)| Standby control node |
|
||||||
| titan-04-19| Raspberry Pi workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
|
| titan-04-19| Raspberry Pi workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
|
||||||
|
| titan-20&21| NVIDIA Jetson workers | Flux (atlas cluster)| Workload nodes, labelled per hardware |
|
||||||
| titan-22 | GPU mini-PC (Jellyfin) | Flux + Ansible | NVIDIA runtime managed via `modules/profiles/atlas-ha` |
|
| titan-22 | GPU mini-PC (Jellyfin) | Flux + Ansible | NVIDIA runtime managed via `modules/profiles/atlas-ha` |
|
||||||
|
| titan-23 | Dedicated SUI validator Oceanus| Manual + Ansible | Baremetal validator workloads, exposes metrics to atlas |
|
||||||
| titan-24 | Tethys hybrid node | Flux + Ansible | Runs SUI metrics via K8s, validator via Ansible |
|
| titan-24 | Tethys hybrid node | Flux + Ansible | Runs SUI metrics via K8s, validator via Ansible |
|
||||||
| titan-db | HA control plane database | Ansible | PostgreSQL / etcd backing services |
|
| titan-jh | Jumphost & bastion & lesavka | Ansible | Entry point / future KVM services / custom KVM - lesavka |
|
||||||
| titan-jh | Jumphost & bastion | Ansible | Entry point / future KVM services |
|
|
||||||
| oceanus | Dedicated SUI validator host | Ansible / Flux prep | Baremetal validator workloads, exposes metrics to atlas; Kustomize scaffold under `clusters/oceanus/` |
|
|
||||||
| styx | Air-gapped workstation | Manual / Scripts | Remains isolated, scripts tracked in `hosts/styx` |
|
| styx | Air-gapped workstation | Manual / Scripts | Remains isolated, scripts tracked in `hosts/styx` |
|
||||||
|
|
||||||
Use the `clusters/` directory for cluster-scoped state and the `hosts/` directory for baremetal orchestration.
|
|
||||||
|
|||||||
@ -81,7 +81,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
|
|||||||
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
|
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
|
||||||
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
|
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
|
||||||
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
|
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
|
||||||
GAUGE_WIDTHS = [5, 5, 5, 5, 4]
|
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
|
||||||
CONTROL_WORKLOADS_EXPR = (
|
CONTROL_WORKLOADS_EXPR = (
|
||||||
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
|
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
|
||||||
)
|
)
|
||||||
@ -198,6 +198,18 @@ STUCK_TERMINATING_EXPR = (
|
|||||||
' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
|
' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
|
||||||
'))'
|
'))'
|
||||||
)
|
)
|
||||||
|
UPTIME_WINDOW = "30d"
|
||||||
|
UPTIME_AVG_EXPR = f"avg(avg_over_time(up[{UPTIME_WINDOW}]))"
|
||||||
|
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999))"
|
||||||
|
UPTIME_THRESHOLDS = {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": None},
|
||||||
|
{"color": "orange", "value": 2},
|
||||||
|
{"color": "yellow", "value": 3},
|
||||||
|
{"color": "green", "value": 3.5},
|
||||||
|
],
|
||||||
|
}
|
||||||
PROBLEM_TABLE_EXPR = (
|
PROBLEM_TABLE_EXPR = (
|
||||||
"(time() - kube_pod_created{pod!=\"\"}) "
|
"(time() - kube_pod_created{pod!=\"\"}) "
|
||||||
"* on(namespace,pod) group_left(node) kube_pod_info "
|
"* on(namespace,pod) group_left(node) kube_pod_info "
|
||||||
@ -555,61 +567,24 @@ def link_to(uid):
|
|||||||
def build_overview():
|
def build_overview():
|
||||||
panels = []
|
panels = []
|
||||||
|
|
||||||
|
count_thresholds = {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": None},
|
||||||
|
{"color": "yellow", "value": 1},
|
||||||
|
{"color": "orange", "value": 2},
|
||||||
|
{"color": "red", "value": 3},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
row1_stats = [
|
row1_stats = [
|
||||||
(
|
{
|
||||||
1,
|
"id": 1,
|
||||||
"Workers Ready",
|
"title": "Workers Ready",
|
||||||
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
|
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
|
||||||
WORKER_SUFFIX,
|
"kind": "gauge",
|
||||||
WORKER_TOTAL,
|
"max_value": WORKER_TOTAL,
|
||||||
None,
|
"thresholds": {
|
||||||
),
|
|
||||||
(
|
|
||||||
2,
|
|
||||||
"Control Plane Ready",
|
|
||||||
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
|
|
||||||
CONTROL_SUFFIX,
|
|
||||||
CONTROL_TOTAL,
|
|
||||||
None,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
3,
|
|
||||||
"Control Plane Workloads",
|
|
||||||
CONTROL_WORKLOADS_EXPR,
|
|
||||||
None,
|
|
||||||
4,
|
|
||||||
link_to("atlas-pods"),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
4,
|
|
||||||
"Problem Pods",
|
|
||||||
PROBLEM_PODS_EXPR,
|
|
||||||
None,
|
|
||||||
1,
|
|
||||||
link_to("atlas-pods"),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
5,
|
|
||||||
"Stuck Terminating",
|
|
||||||
STUCK_TERMINATING_EXPR,
|
|
||||||
None,
|
|
||||||
1,
|
|
||||||
link_to("atlas-pods"),
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
def gauge_grid(idx):
|
|
||||||
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
|
|
||||||
x = sum(GAUGE_WIDTHS[:idx])
|
|
||||||
return width, x
|
|
||||||
|
|
||||||
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
|
|
||||||
thresholds = None
|
|
||||||
min_value = 0
|
|
||||||
max_value = ok_value or 5
|
|
||||||
if panel_id == 1:
|
|
||||||
max_value = WORKER_TOTAL
|
|
||||||
thresholds = {
|
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{"color": "red", "value": None},
|
{"color": "red", "value": None},
|
||||||
@ -617,60 +592,100 @@ def build_overview():
|
|||||||
{"color": "yellow", "value": WORKER_TOTAL - 1},
|
{"color": "yellow", "value": WORKER_TOTAL - 1},
|
||||||
{"color": "green", "value": WORKER_TOTAL},
|
{"color": "green", "value": WORKER_TOTAL},
|
||||||
],
|
],
|
||||||
}
|
},
|
||||||
elif panel_id == 2:
|
},
|
||||||
max_value = CONTROL_TOTAL
|
{
|
||||||
thresholds = {
|
"id": 2,
|
||||||
|
"title": "Control Plane Ready",
|
||||||
|
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
|
||||||
|
"kind": "gauge",
|
||||||
|
"max_value": CONTROL_TOTAL,
|
||||||
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{"color": "red", "value": None},
|
{"color": "red", "value": None},
|
||||||
{"color": "green", "value": CONTROL_TOTAL},
|
{"color": "green", "value": CONTROL_TOTAL},
|
||||||
],
|
],
|
||||||
}
|
},
|
||||||
elif panel_id in (3, 4, 5):
|
},
|
||||||
max_value = 4
|
{
|
||||||
thresholds = {
|
"id": 3,
|
||||||
"mode": "absolute",
|
"title": "Control Plane Workloads",
|
||||||
"steps": [
|
"expr": CONTROL_WORKLOADS_EXPR,
|
||||||
{"color": "green", "value": None},
|
"kind": "stat",
|
||||||
{"color": "yellow", "value": 1},
|
"thresholds": count_thresholds,
|
||||||
{"color": "orange", "value": 2},
|
"links": link_to("atlas-pods"),
|
||||||
{"color": "red", "value": 3},
|
},
|
||||||
],
|
{
|
||||||
}
|
"id": 27,
|
||||||
else:
|
"title": "Atlas Uptime (30d, 9s)",
|
||||||
thresholds = {
|
"expr": UPTIME_NINES_EXPR,
|
||||||
"mode": "absolute",
|
"kind": "stat",
|
||||||
"steps": [
|
"thresholds": UPTIME_THRESHOLDS,
|
||||||
{"color": "green", "value": None},
|
"value_suffix": " 9s",
|
||||||
{"color": "red", "value": max_value},
|
"text_mode": "value",
|
||||||
],
|
},
|
||||||
}
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Problem Pods",
|
||||||
|
"expr": PROBLEM_PODS_EXPR,
|
||||||
|
"kind": "stat",
|
||||||
|
"thresholds": count_thresholds,
|
||||||
|
"links": link_to("atlas-pods"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "CrashLoop / ImagePull",
|
||||||
|
"expr": CRASHLOOP_EXPR,
|
||||||
|
"kind": "stat",
|
||||||
|
"thresholds": count_thresholds,
|
||||||
|
"links": link_to("atlas-pods"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Stuck Terminating",
|
||||||
|
"expr": STUCK_TERMINATING_EXPR,
|
||||||
|
"kind": "stat",
|
||||||
|
"thresholds": count_thresholds,
|
||||||
|
"links": link_to("atlas-pods"),
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
def gauge_grid(idx):
|
||||||
|
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
|
||||||
|
x = sum(GAUGE_WIDTHS[:idx])
|
||||||
|
return width, x
|
||||||
|
|
||||||
|
for idx, item in enumerate(row1_stats):
|
||||||
|
panel_id = item["id"]
|
||||||
width, x = gauge_grid(idx)
|
width, x = gauge_grid(idx)
|
||||||
if panel_id in (3, 4, 5):
|
grid = {"h": 5, "w": width, "x": x, "y": 0}
|
||||||
|
kind = item.get("kind", "gauge")
|
||||||
|
if kind == "stat":
|
||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
panel_id,
|
panel_id,
|
||||||
title,
|
item["title"],
|
||||||
expr,
|
item["expr"],
|
||||||
{"h": 5, "w": width, "x": x, "y": 0},
|
grid,
|
||||||
thresholds=thresholds,
|
thresholds=item.get("thresholds"),
|
||||||
legend=None,
|
legend=None,
|
||||||
links=links,
|
links=item.get("links"),
|
||||||
text_mode="value",
|
text_mode=item.get("text_mode", "value"),
|
||||||
|
value_suffix=item.get("value_suffix"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
panels.append(
|
panels.append(
|
||||||
gauge_panel(
|
gauge_panel(
|
||||||
panel_id,
|
panel_id,
|
||||||
title,
|
item["title"],
|
||||||
expr,
|
item["expr"],
|
||||||
{"h": 5, "w": width, "x": x, "y": 0},
|
grid,
|
||||||
min_value=min_value,
|
min_value=0,
|
||||||
max_value=max_value,
|
max_value=item.get("max_value", 5),
|
||||||
thresholds=thresholds,
|
thresholds=item.get("thresholds"),
|
||||||
links=links,
|
links=item.get("links"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -5,7 +5,7 @@ metadata:
|
|||||||
name: gitea-ingress
|
name: gitea-ingress
|
||||||
namespace: gitea
|
namespace: gitea
|
||||||
annotations:
|
annotations:
|
||||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
cert-manager.io/cluster-issuer: letsencrypt
|
||||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||||
spec:
|
spec:
|
||||||
tls:
|
tls:
|
||||||
|
|||||||
@ -5,7 +5,7 @@ metadata:
|
|||||||
name: jitsi
|
name: jitsi
|
||||||
namespace: jitsi
|
namespace: jitsi
|
||||||
annotations:
|
annotations:
|
||||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
cert-manager.io/cluster-issuer: letsencrypt
|
||||||
spec:
|
spec:
|
||||||
ingressClassName: traefik
|
ingressClassName: traefik
|
||||||
tls:
|
tls:
|
||||||
|
|||||||
@ -17,7 +17,7 @@
|
|||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 5,
|
"w": 4,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
@ -78,8 +78,8 @@
|
|||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 5,
|
"w": 3,
|
||||||
"x": 5,
|
"x": 4,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
@ -131,8 +131,8 @@
|
|||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 5,
|
"w": 3,
|
||||||
"x": 10,
|
"x": 7,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
@ -196,6 +196,75 @@
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": 27,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Atlas Uptime (30d, 9s)",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 5,
|
||||||
|
"w": 4,
|
||||||
|
"x": 10,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "-log10(1 - clamp_max(avg(avg_over_time(up[30d])), 0.999999))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "orange",
|
||||||
|
"value": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": 3.5
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "none",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto",
|
||||||
|
"valueSuffix": " 9s"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
@ -206,8 +275,8 @@
|
|||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 5,
|
"w": 3,
|
||||||
"x": 15,
|
"x": 14,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
@ -271,6 +340,81 @@
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "CrashLoop / ImagePull",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 5,
|
||||||
|
"w": 3,
|
||||||
|
"x": 17,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "orange",
|
||||||
|
"value": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 3
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "none",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
},
|
||||||
|
"links": [
|
||||||
|
{
|
||||||
|
"title": "Open atlas-pods dashboard",
|
||||||
|
"url": "/d/atlas-pods",
|
||||||
|
"targetBlank": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
|
|||||||
@ -26,7 +26,7 @@ data:
|
|||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 5,
|
"w": 4,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
@ -87,8 +87,8 @@ data:
|
|||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 5,
|
"w": 3,
|
||||||
"x": 5,
|
"x": 4,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
@ -140,8 +140,8 @@ data:
|
|||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 5,
|
"w": 3,
|
||||||
"x": 10,
|
"x": 7,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
@ -205,6 +205,75 @@ data:
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": 27,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Atlas Uptime (30d, 9s)",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 5,
|
||||||
|
"w": 4,
|
||||||
|
"x": 10,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "-log10(1 - clamp_max(avg(avg_over_time(up[30d])), 0.999999))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "orange",
|
||||||
|
"value": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": 3.5
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "none",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto",
|
||||||
|
"valueSuffix": " 9s"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
@ -215,8 +284,8 @@ data:
|
|||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 5,
|
"w": 3,
|
||||||
"x": 15,
|
"x": 14,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
@ -280,6 +349,81 @@ data:
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "CrashLoop / ImagePull",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 5,
|
||||||
|
"w": 3,
|
||||||
|
"x": 17,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "orange",
|
||||||
|
"value": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 3
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "none",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
},
|
||||||
|
"links": [
|
||||||
|
{
|
||||||
|
"title": "Open atlas-pods dashboard",
|
||||||
|
"url": "/d/atlas-pods",
|
||||||
|
"targetBlank": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
|
|||||||
@ -8,7 +8,7 @@ metadata:
|
|||||||
kubernetes.io/ingress.class: traefik
|
kubernetes.io/ingress.class: traefik
|
||||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
traefik.ingress.kubernetes.io/router.tls: "true"
|
||||||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
cert-manager.io/cluster-issuer: letsencrypt
|
||||||
spec:
|
spec:
|
||||||
tls:
|
tls:
|
||||||
- hosts: [ "pegasus.bstein.dev" ]
|
- hosts: [ "pegasus.bstein.dev" ]
|
||||||
|
|||||||
@ -8,7 +8,7 @@ spec:
|
|||||||
secretName: vault-server-tls
|
secretName: vault-server-tls
|
||||||
issuerRef:
|
issuerRef:
|
||||||
kind: ClusterIssuer
|
kind: ClusterIssuer
|
||||||
name: letsencrypt-prod
|
name: letsencrypt
|
||||||
commonName: secret.bstein.dev
|
commonName: secret.bstein.dev
|
||||||
dnsNames:
|
dnsNames:
|
||||||
- secret.bstein.dev
|
- secret.bstein.dev
|
||||||
|
|||||||
@ -5,7 +5,7 @@ metadata:
|
|||||||
name: zot
|
name: zot
|
||||||
namespace: zot
|
namespace: zot
|
||||||
annotations:
|
annotations:
|
||||||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
cert-manager.io/cluster-issuer: letsencrypt
|
||||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
traefik.ingress.kubernetes.io/router.tls: "true"
|
||||||
traefik.ingress.kubernetes.io/router.middlewares: zot-zot-resp-headers@kubernetescrd
|
traefik.ingress.kubernetes.io/router.middlewares: zot-zot-resp-headers@kubernetescrd
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user