Compare commits

...

6 Commits

5 changed files with 143 additions and 39 deletions

View File

@ -35,6 +35,7 @@ data:
# Datasource reference; the rendered panels carry this same type/uid pair
# under their "datasource" key.
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
# NOTE(review): presumably the dashboard folder names for public vs. internal
# dashboards -- their use is not visible in this excerpt; confirm against callers.
PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"
# Mountpoint that astraios_usage_expr() passes to filesystem_usage_expr().
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
PERCENT_THRESHOLDS = {
"mode": "absolute",
@ -156,6 +157,10 @@ def root_usage_expr(scope=""):
return filesystem_usage_expr("/", scope)
def astraios_usage_expr(scope=""):
    """Return the filesystem usage expression for the Astraios mountpoint.

    Thin wrapper around ``filesystem_usage_expr``: the mountpoint is fixed to
    ``ASTRAIOS_MOUNTPOINT`` and ``scope`` (default: empty string) is forwarded
    unchanged as the second positional argument.
    """
    mountpoint = ASTRAIOS_MOUNTPOINT
    usage_expr = filesystem_usage_expr(mountpoint, scope)
    return usage_expr
def astreae_usage_expr(mount):
return (
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
@ -533,6 +538,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
)
# Query: maximum pvc_backup_age_hours per (namespace, pvc), sorted descending.
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
# Label matcher selecting series whose job label is "ananke-power".
ANANKE_SELECTOR = 'job="ananke-power"'
# NOTE(review): name/node identifiers associated with the Ananke UPS setup --
# their exact meaning is not visible in this excerpt; confirm against usage.
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
@ -1601,26 +1607,27 @@ def build_overview():
panels.append(
bargauge_panel(
47,
"Platform Suite Pass Rate (24h)",
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
"PVC Backup Health / Age",
PVC_BACKUP_AGE_HOURS_BY_PVC,
{"h": 5, "w": 6, "x": 18, "y": 7},
unit="percent",
unit="h",
instant=True,
legend="{{suite}}",
legend="{{namespace}}/{{pvc}}",
sort_order="desc",
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "yellow", "value": 80},
{"color": "green", "value": 95},
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 12},
{"color": "red", "value": 24},
],
},
)
)
panels[-1]["links"] = link_to("atlas-jobs")
panels[-1]["links"] = link_to("atlas-storage")
panels[-1]["description"] = (
"24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
)
panels.append(
@ -1908,8 +1915,8 @@ def build_overview():
panels.append(
bargauge_panel(
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
"Nodes Closest to Full Astraios Disks",
f"topk(12, {astraios_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 71},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
@ -2282,6 +2289,19 @@ def build_nodes_dashboard():
time_from="30d",
)
)
panels.append(
timeseries_panel(
9,
"Astraios Usage",
astraios_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
time_from="30d",
)
)
return {
"uid": "atlas-nodes",
"title": "Atlas Nodes",

View File

@ -584,6 +584,44 @@
}
},
"timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {

View File

@ -1957,7 +1957,7 @@
{
"id": 47,
"type": "bargauge",
"title": "Platform Suite Pass Rate (24h)",
"title": "PVC Backup Health / Age",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1970,31 +1970,35 @@
},
"targets": [
{
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"refId": "A",
"legendFormat": "{{suite}}",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"unit": "h",
"min": 0,
"max": 100,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 80
"value": 6
},
{
"color": "green",
"value": 95
"color": "orange",
"value": 12
},
{
"color": "red",
"value": 24
}
]
}
@ -2025,12 +2029,12 @@
],
"links": [
{
"title": "Open atlas-jobs dashboard",
"url": "/d/atlas-jobs",
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
},
{
"id": 30,
@ -3172,7 +3176,7 @@
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Root Disks",
"title": "Nodes Closest to Full Astraios Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3185,7 +3189,7 @@
},
"targets": [
{
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -593,6 +593,44 @@ data:
}
},
"timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {

View File

@ -1966,7 +1966,7 @@ data:
{
"id": 47,
"type": "bargauge",
"title": "Platform Suite Pass Rate (24h)",
"title": "PVC Backup Health / Age",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1979,31 +1979,35 @@ data:
},
"targets": [
{
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"refId": "A",
"legendFormat": "{{suite}}",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"unit": "h",
"min": 0,
"max": 100,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 80
"value": 6
},
{
"color": "green",
"value": 95
"color": "orange",
"value": 12
},
{
"color": "red",
"value": 24
}
]
}
@ -2034,12 +2038,12 @@ data:
],
"links": [
{
"title": "Open atlas-jobs dashboard",
"url": "/d/atlas-jobs",
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
},
{
"id": 30,
@ -3181,7 +3185,7 @@ data:
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Root Disks",
"title": "Nodes Closest to Full Astraios Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3194,7 +3198,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}