monitoring: trial overview health timelines

This commit is contained in:
jenkins 2026-05-16 05:08:09 -03:00
parent 2ede953580
commit 5d01b3a60d
6 changed files with 275 additions and 210 deletions

View File

@ -1087,6 +1087,7 @@ def apply_bar_timeseries_style(panel, *, stacked=False, fill_opacity=70):
panel["fieldConfig"]["defaults"]["custom"] = {
"drawStyle": "bars",
"barAlignment": 0,
"barWidthFactor": 0.72,
"lineWidth": 0,
"fillOpacity": fill_opacity,
"gradientMode": "none",
@ -1515,8 +1516,8 @@ DASHBOARD_LINK_TITLES = {
"atlas-storage": "Open Atlas Storage",
"atlas-network": "Open Atlas Network",
"atlas-mail": "Open Atlas Mail",
"atlas-jobs": "Open Atlas Testing",
"atlas-testing": "Open Atlas Testing",
"atlas-jobs": "Atlas Testing",
"atlas-testing": "Atlas Testing",
"atlas-power": "Open Atlas Power",
"atlas-gitops": "Open Atlas GitOps",
"atlas-gpu": "Open Atlas GPU",
@ -1550,10 +1551,18 @@ def build_overview():
climate_temp_series = f"max without ({climate_drop_labels}) (typhon_temperature_celsius != 0)"
climate_humidity_series = f"max without ({climate_drop_labels}) (typhon_relative_humidity_percent != 0)"
climate_pressure_series = f"max without ({climate_drop_labels}) (typhon_vpd_kpa != 0)"
overview_pvc_backup_metric_presence = (
'count({__name__=~"pvc_backup_(count|last_success_timestamp_seconds|health_reason)",driver="restic"})'
)
overview_pvc_backup_missing = (
'label_replace(label_replace(vector(999), "namespace", "maintenance", "__name__", ".*"), '
'"pvc", "backup-telemetry-missing", "__name__", ".*")'
)
overview_pvc_backup_age = (
'max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver="restic"}) / 3600) '
'or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver="restic",reason=~"missing|no_completed|lookup_failed|unknown_timestamp"} > 0) '
'* (pvc_backup_count{driver="restic"} > bool 0)) * 999))) or on() vector(0)'
f'* (pvc_backup_count{{driver="restic"}} > bool 0)) * 999))) or on() '
f'(({overview_pvc_backup_missing}) unless on() (({overview_pvc_backup_metric_presence}) > 0))'
)
def overview_metric_pair_expr(first_expr, first_name, second_expr, second_name):
@ -1822,6 +1831,28 @@ def build_overview():
{"color": dark_blue, "value": 100},
],
}
fan_intensity_thresholds = {
"mode": "absolute",
"steps": [
{"color": dark_blue, "value": None},
{"color": dark_green, "value": 3},
{"color": dark_yellow, "value": 6},
{"color": dark_orange, "value": 8},
{"color": dark_red, "value": 10},
],
}
fan_intensity_expr = (
f'label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="1"}}), "fan", "Outlet", "__name__", ".*") '
f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="2"}}), "fan", "Inlet - Inside", "__name__", ".*") '
f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="3"}}), "fan", "Inlet - Outside", "__name__", ".*") '
f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="4"}}), "fan", "Interior", "__name__", ".*")'
)
gitops_health_history_expr = (
f'label_replace({GITOPS_KUSTOMIZATION_READY_PCT}, "signal", "Kustomizations Ready", "__name__", ".*") '
f'or label_replace({GITOPS_HELM_READY_PCT}, "signal", "HelmReleases Ready", "__name__", ".*") '
f'or label_replace({GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT}, "signal", "Kustomizations Not Suspended", "__name__", ".*") '
f'or label_replace({GITOPS_HELM_NOT_SUSPENDED_PCT}, "signal", "HelmReleases Not Suspended", "__name__", ".*")'
)
compact_current_text = {"titleSize": 11, "valueSize": 20}
perfect_count_thresholds = {
"mode": "absolute",
@ -1891,6 +1922,7 @@ def build_overview():
links=overview_link("atlas-power"),
),
stacked=False,
fill_opacity=55,
)
)
temp_panel = stat_panel(
@ -2049,25 +2081,22 @@ def build_overview():
"showPoints": "never",
"spanNulls": True,
}
panels.append(
timeseries_panel(
141,
"Fan History (0-10)",
None,
{"h": 6, "w": 6, "x": 9, "y": 13},
unit="none",
max_value=10,
targets=[
{"refId": "A", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="1"}})', "legendFormat": "Outlet"},
{"refId": "B", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="2"}})', "legendFormat": "Inlet - Inside"},
{"refId": "C", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="3"}})', "legendFormat": "Inlet - Outside"},
{"refId": "D", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="4"}})', "legendFormat": "Interior"},
],
legend_display="table",
legend_placement="right",
links=overview_link("atlas-power"),
)
fan_panel = state_timeline_panel(
141,
"Fan Intensity History",
fan_intensity_expr,
{"h": 6, "w": 6, "x": 9, "y": 13},
unit="none",
min_value=0,
max_value=10,
legend="{{fan}}",
thresholds=fan_intensity_thresholds,
links=overview_link("atlas-power"),
description="Fan intensity lanes on the 0-10 controller scale. Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder.",
)
fan_panel["options"]["legend"] = {"displayMode": "table", "placement": "right"}
fan_panel["options"]["tooltip"] = {"mode": "multi", "sort": "none"}
panels.append(fan_panel)
flux_source = stat_panel(
140,
"Flux Source",
@ -2118,24 +2147,18 @@ def build_overview():
rail_panel["options"]["text"] = {"titleSize": 10, "valueSize": 19}
panels.append(rail_panel)
panels.append(
bargauge_panel(
state_timeline_panel(
150,
"GitOps Health",
(
f'label_replace({GITOPS_KUSTOMIZATION_READY_PCT}, "signal", "Kustomizations Ready", "__name__", ".*") '
f'or label_replace({GITOPS_HELM_READY_PCT}, "signal", "HelmReleases Ready", "__name__", ".*") '
f'or label_replace({GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT}, "signal", "Kustomizations Not Suspended", "__name__", ".*") '
f'or label_replace({GITOPS_HELM_NOT_SUSPENDED_PCT}, "signal", "HelmReleases Not Suspended", "__name__", ".*")'
),
{"h": 6, "w": 6, "x": 15, "y": 13},
gitops_health_history_expr,
{"h": 6, "w": 6, "x": 15, "y": 7},
unit="percent",
instant=True,
min_value=0,
max_value=100,
legend="{{signal}}",
sort_order="asc",
thresholds=test_success_thresholds,
decimals=0,
links=overview_link("atlas-gitops"),
description="Compact GitOps health: readiness plus suspension health for Kustomizations and HelmReleases.",
description="GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.",
)
)
@ -2174,23 +2197,20 @@ def build_overview():
)
panels.append(apply_bar_timeseries_style(ariadne_volume, stacked=False))
panels.append(
bargauge_panel(
state_timeline_panel(
46,
"Gate Checks Passing by Suite",
PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE,
{"h": 6, "w": 6, "x": 15, "y": 7},
{"h": 6, "w": 6, "x": 15, "y": 13},
unit="percent",
instant=True,
min_value=0,
max_value=100,
legend="{{suite}}",
sort_order="asc",
thresholds=test_success_thresholds,
decimals=0,
links=overview_link("atlas-testing"),
description="Percent of current gate dimensions passing per suite over time. There are seven gate dimensions, so 85.7% means one gate is failing.",
)
)
panels[-1]["description"] = (
"Percent of current gate dimensions passing per suite. There are seven gate dimensions, so 85.7% means one gate is failing."
)
for panel_id, title, metric, x_pos, description in [
(
142,
@ -4773,7 +4793,7 @@ def build_power_dashboard():
panels.append(
timeseries_panel(
6,
"Fan History (0-10)",
"Fan Intensity History",
None,
{"h": 8, "w": 12, "x": 12, "y": 16},
unit="none",

View File

@ -68,19 +68,36 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
panels_by_title = {panel["title"]: panel for panel in flatten_panels(dashboard["panels"])}
assert dashboard["links"] == [
{"title": "Open Atlas Testing", "url": "/d/atlas-testing", "targetBlank": True}
{"title": "Atlas Testing", "url": "/d/atlas-testing", "targetBlank": True}
]
assert "atlas-jobs" not in repr(dashboard)
assert "Platform Test Success Rate" not in panels_by_title
assert panels_by_title["Gate Checks Passing by Suite"]["type"] == "bargauge"
assert panels_by_title["Gate Checks Passing by Suite"]["options"]["displayMode"] == "basic"
assert panels_by_title["Gate Checks Passing by Suite"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 7}
assert panels_by_title["Gate Checks Passing by Suite"]["type"] == "state-timeline"
assert panels_by_title["Gate Checks Passing by Suite"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 13}
assert panels_by_title["Gate Checks Passing by Suite"]["targets"][0]["legendFormat"] == "{{suite}}"
assert panels_by_title["UPS History (Power Draw)"]["gridPos"] == {"h": 6, "w": 6, "x": 3, "y": 7}
assert panels_by_title["Ariadne Run Volume"]["gridPos"] == {"h": 6, "w": 6, "x": 9, "y": 7}
assert panels_by_title["Pyrphoros UPS Current"]["gridPos"]["w"] == 3
assert panels_by_title["Current Enclosure Climate"]["gridPos"]["w"] == 3
assert panels_by_title["UPS History (Power Draw)"]["fieldConfig"]["defaults"]["custom"]["drawStyle"] == "bars"
assert panels_by_title["UPS History (Power Draw)"]["fieldConfig"]["defaults"]["custom"]["barWidthFactor"] == 0.72
ups_overrides = panels_by_title["UPS History (Power Draw)"]["fieldConfig"]["overrides"]
assert any(
override["matcher"]["options"] == "Pyrphoros"
and override["properties"][0]["value"] == {"mode": "fixed", "fixedColor": "dark-blue"}
for override in ups_overrides
)
assert any(
override["matcher"]["options"] == "Statera"
and override["properties"][0]["value"] == {"mode": "fixed", "fixedColor": "dark-yellow"}
for override in ups_overrides
)
assert panels_by_title["Ariadne Run Volume"]["fieldConfig"]["defaults"]["custom"]["drawStyle"] == "bars"
assert "Fan History (0-10)" not in panels_by_title
assert panels_by_title["Fan Intensity History"]["type"] == "state-timeline"
assert panels_by_title["Fan Intensity History"]["gridPos"] == {"h": 6, "w": 6, "x": 9, "y": 13}
assert panels_by_title["Fan Intensity History"]["fieldConfig"]["defaults"]["max"] == 10
assert panels_by_title["Fan Intensity History"]["targets"][0]["legendFormat"] == "{{fan}}"
assert panels_by_title["Flux Source"]["type"] == "stat"
assert panels_by_title["Flux Source"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 7}
@ -88,12 +105,16 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
assert panels_by_title["Run Reliability (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 9}
assert panels_by_title["Fresh Suites (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 13}
assert panels_by_title["LOC Clean Suites"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 17}
assert panels_by_title["GitOps Health"]["type"] == "bargauge"
assert panels_by_title["GitOps Health"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 13}
assert panels_by_title["GitOps Health"]["type"] == "state-timeline"
assert panels_by_title["GitOps Health"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 7}
gitops_expr = panels_by_title["GitOps Health"]["targets"][0]["expr"]
assert "Kustomizations Not Suspended" in gitops_expr
assert "HelmReleases Not Suspended" in gitops_expr
pvc_backup_expr = panels_by_title["PVC Backup Health / Age"]["targets"][0]["expr"]
assert "backup-telemetry-missing" in pvc_backup_expr
assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr
def test_render_configmap_writes(tmp_path):
mod = load_module()

View File

@ -1300,8 +1300,9 @@
"custom": {
"drawStyle": "bars",
"barAlignment": 0,
"barWidthFactor": 0.72,
"lineWidth": 0,
"fillOpacity": 70,
"fillOpacity": 55,
"gradientMode": "none",
"showPoints": "never",
"spanNulls": true
@ -1862,8 +1863,9 @@
},
{
"id": 141,
"type": "timeseries",
"title": "Fan History (0-10)",
"type": "state-timeline",
"title": "Fan Intensity History",
"description": "Fan intensity lanes on the 0-10 controller scale. Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1876,40 +1878,62 @@
},
"targets": [
{
"expr": "label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"}), \"fan\", \"Outlet\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"}), \"fan\", \"Inlet - Inside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"}), \"fan\", \"Inlet - Outside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"}), \"fan\", \"Interior\", \"__name__\", \".*\")",
"refId": "A",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})",
"legendFormat": "Outlet"
},
{
"refId": "B",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})",
"legendFormat": "Inlet - Inside"
},
{
"refId": "C",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})",
"legendFormat": "Inlet - Outside"
},
{
"refId": "D",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})",
"legendFormat": "Interior"
"legendFormat": "{{fan}}"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"unit": "none",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-blue",
"value": null
},
{
"color": "dark-green",
"value": 3
},
{
"color": "dark-yellow",
"value": 6
},
{
"color": "dark-orange",
"value": 8
},
{
"color": "dark-red",
"value": 10
}
]
},
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 10
},
"overrides": []
},
"options": {
"mergeValues": true,
"showValue": "never",
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
"mode": "multi",
"sort": "none"
}
},
"links": [
@ -2409,8 +2433,9 @@
},
{
"id": 150,
"type": "bargauge",
"type": "state-timeline",
"title": "GitOps Health",
"description": "GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -2419,14 +2444,13 @@
"h": 6,
"w": 6,
"x": 15,
"y": 13
"y": 7
},
"targets": [
{
"expr": "sort(label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\"))",
"expr": "label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{signal}}",
"instant": true
"legendFormat": "{{signal}}"
}
],
"fieldConfig": {
@ -2435,8 +2459,6 @@
"mode": "thresholds"
},
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
@ -2462,19 +2484,26 @@
}
]
},
"decimals": 0
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 100
},
"overrides": []
},
"options": {
"displayMode": "basic",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
"mergeValues": true,
"showValue": "never",
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"links": [
@ -2483,18 +2512,6 @@
"url": "/d/atlas-gitops",
"targetBlank": true
}
],
"description": "Compact GitOps health: readiness plus suspension health for Kustomizations and HelmReleases.",
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "asc"
}
}
]
},
{
@ -2617,6 +2634,7 @@
"custom": {
"drawStyle": "bars",
"barAlignment": 0,
"barWidthFactor": 0.72,
"lineWidth": 0,
"fillOpacity": 70,
"gradientMode": "none",
@ -2676,8 +2694,9 @@
},
{
"id": 46,
"type": "bargauge",
"type": "state-timeline",
"title": "Gate Checks Passing by Suite",
"description": "Percent of current gate dimensions passing per suite over time. There are seven gate dimensions, so 85.7% means one gate is failing.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -2686,14 +2705,13 @@
"h": 6,
"w": 6,
"x": 15,
"y": 7
"y": 13
},
"targets": [
{
"expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)))",
"expr": "(100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"} > bool 0))), 1))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
"legendFormat": "{{suite}}"
}
],
"fieldConfig": {
@ -2702,8 +2720,6 @@
"mode": "thresholds"
},
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
@ -2729,19 +2745,26 @@
}
]
},
"decimals": 0
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 100
},
"overrides": []
},
"options": {
"displayMode": "basic",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
"mergeValues": true,
"showValue": "never",
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"links": [
@ -2750,19 +2773,7 @@
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "asc"
}
}
],
"description": "Percent of current gate dimensions passing per suite. There are seven gate dimensions, so 85.7% means one gate is failing."
]
},
{
"id": 142,
@ -2994,7 +3005,7 @@
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() vector(0))",
"expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() ((label_replace(label_replace(vector(999), \"namespace\", \"maintenance\", \"__name__\", \".*\"), \"pvc\", \"backup-telemetry-missing\", \"__name__\", \".*\")) unless on() ((count({__name__=~\"pvc_backup_(count|last_success_timestamp_seconds|health_reason)\",driver=\"restic\"})) > 0)))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
@ -4365,7 +4376,7 @@
"refresh": "1m",
"links": [
{
"title": "Open Atlas Testing",
"title": "Atlas Testing",
"url": "/d/atlas-testing",
"targetBlank": true
}

View File

@ -253,6 +253,7 @@
"custom": {
"drawStyle": "bars",
"barAlignment": 0,
"barWidthFactor": 0.72,
"lineWidth": 0,
"fillOpacity": 70,
"gradientMode": "none",
@ -618,7 +619,7 @@
{
"id": 6,
"type": "timeseries",
"title": "Fan History (0-10)",
"title": "Fan Intensity History",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"

View File

@ -1309,8 +1309,9 @@ data:
"custom": {
"drawStyle": "bars",
"barAlignment": 0,
"barWidthFactor": 0.72,
"lineWidth": 0,
"fillOpacity": 70,
"fillOpacity": 55,
"gradientMode": "none",
"showPoints": "never",
"spanNulls": true
@ -1871,8 +1872,9 @@ data:
},
{
"id": 141,
"type": "timeseries",
"title": "Fan History (0-10)",
"type": "state-timeline",
"title": "Fan Intensity History",
"description": "Fan intensity lanes on the 0-10 controller scale. Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1885,40 +1887,62 @@ data:
},
"targets": [
{
"expr": "label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"}), \"fan\", \"Outlet\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"}), \"fan\", \"Inlet - Inside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"}), \"fan\", \"Inlet - Outside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"}), \"fan\", \"Interior\", \"__name__\", \".*\")",
"refId": "A",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})",
"legendFormat": "Outlet"
},
{
"refId": "B",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})",
"legendFormat": "Inlet - Inside"
},
{
"refId": "C",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})",
"legendFormat": "Inlet - Outside"
},
{
"refId": "D",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})",
"legendFormat": "Interior"
"legendFormat": "{{fan}}"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"unit": "none",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-blue",
"value": null
},
{
"color": "dark-green",
"value": 3
},
{
"color": "dark-yellow",
"value": 6
},
{
"color": "dark-orange",
"value": 8
},
{
"color": "dark-red",
"value": 10
}
]
},
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 10
},
"overrides": []
},
"options": {
"mergeValues": true,
"showValue": "never",
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
"mode": "multi",
"sort": "none"
}
},
"links": [
@ -2418,8 +2442,9 @@ data:
},
{
"id": 150,
"type": "bargauge",
"type": "state-timeline",
"title": "GitOps Health",
"description": "GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -2428,14 +2453,13 @@ data:
"h": 6,
"w": 6,
"x": 15,
"y": 13
"y": 7
},
"targets": [
{
"expr": "sort(label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\"))",
"expr": "label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{signal}}",
"instant": true
"legendFormat": "{{signal}}"
}
],
"fieldConfig": {
@ -2444,8 +2468,6 @@ data:
"mode": "thresholds"
},
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
@ -2471,19 +2493,26 @@ data:
}
]
},
"decimals": 0
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 100
},
"overrides": []
},
"options": {
"displayMode": "basic",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
"mergeValues": true,
"showValue": "never",
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"links": [
@ -2492,18 +2521,6 @@ data:
"url": "/d/atlas-gitops",
"targetBlank": true
}
],
"description": "Compact GitOps health: readiness plus suspension health for Kustomizations and HelmReleases.",
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "asc"
}
}
]
},
{
@ -2626,6 +2643,7 @@ data:
"custom": {
"drawStyle": "bars",
"barAlignment": 0,
"barWidthFactor": 0.72,
"lineWidth": 0,
"fillOpacity": 70,
"gradientMode": "none",
@ -2685,8 +2703,9 @@ data:
},
{
"id": 46,
"type": "bargauge",
"type": "state-timeline",
"title": "Gate Checks Passing by Suite",
"description": "Percent of current gate dimensions passing per suite over time. There are seven gate dimensions, so 85.7% means one gate is failing.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -2695,14 +2714,13 @@ data:
"h": 6,
"w": 6,
"x": 15,
"y": 7
"y": 13
},
"targets": [
{
"expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)))",
"expr": "(100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"} > bool 0))), 1))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
"legendFormat": "{{suite}}"
}
],
"fieldConfig": {
@ -2711,8 +2729,6 @@ data:
"mode": "thresholds"
},
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
@ -2738,19 +2754,26 @@ data:
}
]
},
"decimals": 0
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 100
},
"overrides": []
},
"options": {
"displayMode": "basic",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
"mergeValues": true,
"showValue": "never",
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"links": [
@ -2759,19 +2782,7 @@ data:
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "asc"
}
}
],
"description": "Percent of current gate dimensions passing per suite. There are seven gate dimensions, so 85.7% means one gate is failing."
]
},
{
"id": 142,
@ -3003,7 +3014,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() vector(0))",
"expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() ((label_replace(label_replace(vector(999), \"namespace\", \"maintenance\", \"__name__\", \".*\"), \"pvc\", \"backup-telemetry-missing\", \"__name__\", \".*\")) unless on() ((count({__name__=~\"pvc_backup_(count|last_success_timestamp_seconds|health_reason)\",driver=\"restic\"})) > 0)))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
@ -4374,7 +4385,7 @@ data:
"refresh": "1m",
"links": [
{
"title": "Open Atlas Testing",
"title": "Atlas Testing",
"url": "/d/atlas-testing",
"targetBlank": true
}

View File

@ -262,6 +262,7 @@ data:
"custom": {
"drawStyle": "bars",
"barAlignment": 0,
"barWidthFactor": 0.72,
"lineWidth": 0,
"fillOpacity": 70,
"gradientMode": "none",
@ -627,7 +628,7 @@ data:
{
"id": 6,
"type": "timeseries",
"title": "Fan History (0-10)",
"title": "Fan Intensity History",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"