Compare commits
6 Commits
64b4f14018
...
616c6308b1
| Author | SHA1 | Date | |
|---|---|---|---|
| 616c6308b1 | |||
| d9b30d6c5b | |||
| 7c337ad5a1 | |||
| 3823b68ee2 | |||
| 40de2b59a5 | |||
| 5483c04bb3 |
@ -35,6 +35,7 @@ data:
|
|||||||
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
|
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
|
||||||
PUBLIC_FOLDER = "overview"
|
PUBLIC_FOLDER = "overview"
|
||||||
PRIVATE_FOLDER = "atlas-internal"
|
PRIVATE_FOLDER = "atlas-internal"
|
||||||
|
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
|
||||||
|
|
||||||
PERCENT_THRESHOLDS = {
|
PERCENT_THRESHOLDS = {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
@ -156,6 +157,10 @@ def root_usage_expr(scope=""):
|
|||||||
return filesystem_usage_expr("/", scope)
|
return filesystem_usage_expr("/", scope)
|
||||||
|
|
||||||
|
|
||||||
|
def astraios_usage_expr(scope=""):
|
||||||
|
return filesystem_usage_expr(ASTRAIOS_MOUNTPOINT, scope)
|
||||||
|
|
||||||
|
|
||||||
def astreae_usage_expr(mount):
|
def astreae_usage_expr(mount):
|
||||||
return (
|
return (
|
||||||
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
|
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
|
||||||
@ -533,6 +538,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
|
|||||||
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
|
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
|
||||||
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
|
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
|
||||||
)
|
)
|
||||||
|
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
|
||||||
ANANKE_SELECTOR = 'job="ananke-power"'
|
ANANKE_SELECTOR = 'job="ananke-power"'
|
||||||
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
||||||
ANANKE_UPS_DB_NODE = "titan-db"
|
ANANKE_UPS_DB_NODE = "titan-db"
|
||||||
@ -1601,26 +1607,27 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
bargauge_panel(
|
bargauge_panel(
|
||||||
47,
|
47,
|
||||||
"Platform Suite Pass Rate (24h)",
|
"PVC Backup Health / Age",
|
||||||
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
|
PVC_BACKUP_AGE_HOURS_BY_PVC,
|
||||||
{"h": 5, "w": 6, "x": 18, "y": 7},
|
{"h": 5, "w": 6, "x": 18, "y": 7},
|
||||||
unit="percent",
|
unit="h",
|
||||||
instant=True,
|
instant=True,
|
||||||
legend="{{suite}}",
|
legend="{{namespace}}/{{pvc}}",
|
||||||
sort_order="desc",
|
sort_order="desc",
|
||||||
thresholds={
|
thresholds={
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{"color": "red", "value": None},
|
{"color": "green", "value": None},
|
||||||
{"color": "yellow", "value": 80},
|
{"color": "yellow", "value": 6},
|
||||||
{"color": "green", "value": 95},
|
{"color": "orange", "value": 12},
|
||||||
|
{"color": "red", "value": 24},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels[-1]["links"] = link_to("atlas-jobs")
|
panels[-1]["links"] = link_to("atlas-storage")
|
||||||
panels[-1]["description"] = (
|
panels[-1]["description"] = (
|
||||||
"24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
|
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
)
|
)
|
||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -1908,8 +1915,8 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
bargauge_panel(
|
bargauge_panel(
|
||||||
22,
|
22,
|
||||||
"Nodes Closest to Full Root Disks",
|
"Nodes Closest to Full Astraios Disks",
|
||||||
f"topk(12, {root_usage_expr()})",
|
f"topk(12, {astraios_usage_expr()})",
|
||||||
{"h": 16, "w": 12, "x": 12, "y": 71},
|
{"h": 16, "w": 12, "x": 12, "y": 71},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
thresholds=PERCENT_THRESHOLDS,
|
thresholds=PERCENT_THRESHOLDS,
|
||||||
@ -2282,6 +2289,19 @@ def build_nodes_dashboard():
|
|||||||
time_from="30d",
|
time_from="30d",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
panels.append(
|
||||||
|
timeseries_panel(
|
||||||
|
9,
|
||||||
|
"Astraios Usage",
|
||||||
|
astraios_usage_expr(),
|
||||||
|
{"h": 9, "w": 24, "x": 0, "y": 44},
|
||||||
|
unit="percent",
|
||||||
|
legend="{{node}}",
|
||||||
|
legend_display="table",
|
||||||
|
legend_placement="right",
|
||||||
|
time_from="30d",
|
||||||
|
)
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"uid": "atlas-nodes",
|
"uid": "atlas-nodes",
|
||||||
"title": "Atlas Nodes",
|
"title": "Atlas Nodes",
|
||||||
|
|||||||
@ -584,6 +584,44 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"timeFrom": "30d"
|
"timeFrom": "30d"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Astraios Usage",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 9,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 44
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{node}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "right"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"timeFrom": "30d"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -1957,7 +1957,7 @@
|
|||||||
{
|
{
|
||||||
"id": 47,
|
"id": 47,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Platform Suite Pass Rate (24h)",
|
"title": "PVC Backup Health / Age",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1970,31 +1970,35 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
|
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{suite}}",
|
"legendFormat": "{{namespace}}/{{pvc}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"unit": "percent",
|
"unit": "h",
|
||||||
"min": 0,
|
"min": 0,
|
||||||
"max": 100,
|
"max": null,
|
||||||
"thresholds": {
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "red",
|
"color": "green",
|
||||||
"value": null
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
"value": 80
|
"value": 6
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "orange",
|
||||||
"value": 95
|
"value": 12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 24
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -2025,12 +2029,12 @@
|
|||||||
],
|
],
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Open atlas-jobs dashboard",
|
"title": "Open atlas-storage dashboard",
|
||||||
"url": "/d/atlas-jobs",
|
"url": "/d/atlas-storage",
|
||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
|
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 30,
|
"id": 30,
|
||||||
@ -3172,7 +3176,7 @@
|
|||||||
{
|
{
|
||||||
"id": 22,
|
"id": 22,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes Closest to Full Root Disks",
|
"title": "Nodes Closest to Full Astraios Disks",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -3185,7 +3189,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
|
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}"
|
"legendFormat": "{{node}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -593,6 +593,44 @@ data:
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"timeFrom": "30d"
|
"timeFrom": "30d"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Astraios Usage",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 9,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 44
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{node}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "right"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"timeFrom": "30d"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -1966,7 +1966,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 47,
|
"id": 47,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Platform Suite Pass Rate (24h)",
|
"title": "PVC Backup Health / Age",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1979,31 +1979,35 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
|
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{suite}}",
|
"legendFormat": "{{namespace}}/{{pvc}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"unit": "percent",
|
"unit": "h",
|
||||||
"min": 0,
|
"min": 0,
|
||||||
"max": 100,
|
"max": null,
|
||||||
"thresholds": {
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "red",
|
"color": "green",
|
||||||
"value": null
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
"value": 80
|
"value": 6
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "orange",
|
||||||
"value": 95
|
"value": 12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 24
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -2034,12 +2038,12 @@ data:
|
|||||||
],
|
],
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Open atlas-jobs dashboard",
|
"title": "Open atlas-storage dashboard",
|
||||||
"url": "/d/atlas-jobs",
|
"url": "/d/atlas-storage",
|
||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
|
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 30,
|
"id": 30,
|
||||||
@ -3181,7 +3185,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 22,
|
"id": 22,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes Closest to Full Root Disks",
|
"title": "Nodes Closest to Full Astraios Disks",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -3194,7 +3198,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
|
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}"
|
"legendFormat": "{{node}}"
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user