Compare commits

..

No commits in common. "616c6308b1e3c3cbb3c7ab31449e699861e07bf8" and "64b4f140184b5c990eee9088c3e6cb59f8b80821" have entirely different histories.

5 changed files with 39 additions and 143 deletions

View File

@ -35,7 +35,6 @@ data:
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
PUBLIC_FOLDER = "overview" PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal" PRIVATE_FOLDER = "atlas-internal"
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
PERCENT_THRESHOLDS = { PERCENT_THRESHOLDS = {
"mode": "absolute", "mode": "absolute",
@ -157,10 +156,6 @@ def root_usage_expr(scope=""):
return filesystem_usage_expr("/", scope) return filesystem_usage_expr("/", scope)
def astraios_usage_expr(scope=""):
return filesystem_usage_expr(ASTRAIOS_MOUNTPOINT, scope)
def astreae_usage_expr(mount): def astreae_usage_expr(mount):
return ( return (
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / " f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
@ -538,7 +533,6 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) ' f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))' f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
) )
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros" ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db" ANANKE_UPS_DB_NODE = "titan-db"
@ -1607,27 +1601,26 @@ def build_overview():
panels.append( panels.append(
bargauge_panel( bargauge_panel(
47, 47,
"PVC Backup Health / Age", "Platform Suite Pass Rate (24h)",
PVC_BACKUP_AGE_HOURS_BY_PVC, PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
{"h": 5, "w": 6, "x": 18, "y": 7}, {"h": 5, "w": 6, "x": 18, "y": 7},
unit="h", unit="percent",
instant=True, instant=True,
legend="{{namespace}}/{{pvc}}", legend="{{suite}}",
sort_order="desc", sort_order="desc",
thresholds={ thresholds={
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{"color": "green", "value": None}, {"color": "red", "value": None},
{"color": "yellow", "value": 6}, {"color": "yellow", "value": 80},
{"color": "orange", "value": 12}, {"color": "green", "value": 95},
{"color": "red", "value": 24},
], ],
}, },
) )
) )
panels[-1]["links"] = link_to("atlas-storage") panels[-1]["links"] = link_to("atlas-jobs")
panels[-1]["description"] = ( panels[-1]["description"] = (
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published." "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
) )
panels.append( panels.append(
@ -1915,8 +1908,8 @@ def build_overview():
panels.append( panels.append(
bargauge_panel( bargauge_panel(
22, 22,
"Nodes Closest to Full Astraios Disks", "Nodes Closest to Full Root Disks",
f"topk(12, {astraios_usage_expr()})", f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 71}, {"h": 16, "w": 12, "x": 12, "y": 71},
unit="percent", unit="percent",
thresholds=PERCENT_THRESHOLDS, thresholds=PERCENT_THRESHOLDS,
@ -2289,19 +2282,6 @@ def build_nodes_dashboard():
time_from="30d", time_from="30d",
) )
) )
panels.append(
timeseries_panel(
9,
"Astraios Usage",
astraios_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
time_from="30d",
)
)
return { return {
"uid": "atlas-nodes", "uid": "atlas-nodes",
"title": "Atlas Nodes", "title": "Atlas Nodes",

View File

@ -584,44 +584,6 @@
} }
}, },
"timeFrom": "30d" "timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
} }
], ],
"time": { "time": {

View File

@ -1957,7 +1957,7 @@
{ {
"id": 47, "id": 47,
"type": "bargauge", "type": "bargauge",
"title": "PVC Backup Health / Age", "title": "Platform Suite Pass Rate (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1970,35 +1970,31 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))", "expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}", "legendFormat": "{{suite}}",
"instant": true "instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "h", "unit": "percent",
"min": 0, "min": 0,
"max": null, "max": 100,
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "green", "color": "red",
"value": null "value": null
}, },
{ {
"color": "yellow", "color": "yellow",
"value": 6 "value": 80
}, },
{ {
"color": "orange", "color": "green",
"value": 12 "value": 95
},
{
"color": "red",
"value": 24
} }
] ]
} }
@ -2029,12 +2025,12 @@
], ],
"links": [ "links": [
{ {
"title": "Open atlas-storage dashboard", "title": "Open atlas-jobs dashboard",
"url": "/d/atlas-storage", "url": "/d/atlas-jobs",
"targetBlank": true "targetBlank": true
} }
], ],
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published." "description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
}, },
{ {
"id": 30, "id": 30,
@ -3176,7 +3172,7 @@
{ {
"id": 22, "id": 22,
"type": "bargauge", "type": "bargauge",
"title": "Nodes Closest to Full Astraios Disks", "title": "Nodes Closest to Full Root Disks",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -3189,7 +3185,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}"
} }

View File

@ -593,44 +593,6 @@ data:
} }
}, },
"timeFrom": "30d" "timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
} }
], ],
"time": { "time": {

View File

@ -1966,7 +1966,7 @@ data:
{ {
"id": 47, "id": 47,
"type": "bargauge", "type": "bargauge",
"title": "PVC Backup Health / Age", "title": "Platform Suite Pass Rate (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1979,35 +1979,31 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))", "expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}", "legendFormat": "{{suite}}",
"instant": true "instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "h", "unit": "percent",
"min": 0, "min": 0,
"max": null, "max": 100,
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "green", "color": "red",
"value": null "value": null
}, },
{ {
"color": "yellow", "color": "yellow",
"value": 6 "value": 80
}, },
{ {
"color": "orange", "color": "green",
"value": 12 "value": 95
},
{
"color": "red",
"value": 24
} }
] ]
} }
@ -2038,12 +2034,12 @@ data:
], ],
"links": [ "links": [
{ {
"title": "Open atlas-storage dashboard", "title": "Open atlas-jobs dashboard",
"url": "/d/atlas-storage", "url": "/d/atlas-jobs",
"targetBlank": true "targetBlank": true
} }
], ],
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published." "description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
}, },
{ {
"id": 30, "id": 30,
@ -3185,7 +3181,7 @@ data:
{ {
"id": 22, "id": 22,
"type": "bargauge", "type": "bargauge",
"title": "Nodes Closest to Full Astraios Disks", "title": "Nodes Closest to Full Root Disks",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -3198,7 +3194,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}"
} }