diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1383a406..a666ec81 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1087,6 +1087,7 @@ def apply_bar_timeseries_style(panel, *, stacked=False, fill_opacity=70): panel["fieldConfig"]["defaults"]["custom"] = { "drawStyle": "bars", "barAlignment": 0, + "barWidthFactor": 0.72, "lineWidth": 0, "fillOpacity": fill_opacity, "gradientMode": "none", @@ -1515,8 +1516,8 @@ DASHBOARD_LINK_TITLES = { "atlas-storage": "Open Atlas Storage", "atlas-network": "Open Atlas Network", "atlas-mail": "Open Atlas Mail", - "atlas-jobs": "Open Atlas Testing", - "atlas-testing": "Open Atlas Testing", + "atlas-jobs": "Atlas Testing", + "atlas-testing": "Atlas Testing", "atlas-power": "Open Atlas Power", "atlas-gitops": "Open Atlas GitOps", "atlas-gpu": "Open Atlas GPU", @@ -1550,10 +1551,18 @@ def build_overview(): climate_temp_series = f"max without ({climate_drop_labels}) (typhon_temperature_celsius != 0)" climate_humidity_series = f"max without ({climate_drop_labels}) (typhon_relative_humidity_percent != 0)" climate_pressure_series = f"max without ({climate_drop_labels}) (typhon_vpd_kpa != 0)" + overview_pvc_backup_metric_presence = ( + 'count({__name__=~"pvc_backup_(count|last_success_timestamp_seconds|health_reason)",driver="restic"})' + ) + overview_pvc_backup_missing = ( + 'label_replace(label_replace(vector(999), "namespace", "maintenance", "__name__", ".*"), ' + '"pvc", "backup-telemetry-missing", "__name__", ".*")' + ) overview_pvc_backup_age = ( 'max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver="restic"}) / 3600) ' 'or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver="restic",reason=~"missing|no_completed|lookup_failed|unknown_timestamp"} > 0) ' - '* (pvc_backup_count{driver="restic"} > bool 0)) * 999))) or on() vector(0)' + f'* (pvc_backup_count{{driver="restic"}} > bool 0)) * 999))) or on() ' + f'(({overview_pvc_backup_missing}) unless on() (({overview_pvc_backup_metric_presence}) > 0))' ) def overview_metric_pair_expr(first_expr, first_name, second_expr, second_name): @@ -1822,6 +1831,28 @@ def build_overview(): {"color": dark_blue, "value": 100}, ], } + fan_intensity_thresholds = { + "mode": "absolute", + "steps": [ + {"color": dark_blue, "value": None}, + {"color": dark_green, "value": 3}, + {"color": dark_yellow, "value": 6}, + {"color": dark_orange, "value": 8}, + {"color": dark_red, "value": 10}, + ], + } + fan_intensity_expr = ( + f'label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="1"}}), "fan", "Outlet", "__name__", ".*") ' + f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="2"}}), "fan", "Inlet - Inside", "__name__", ".*") ' + f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="3"}}), "fan", "Inlet - Outside", "__name__", ".*") ' + f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="4"}}), "fan", "Interior", "__name__", ".*")' + ) + gitops_health_history_expr = ( + f'label_replace({GITOPS_KUSTOMIZATION_READY_PCT}, "signal", "Kustomizations Ready", "__name__", ".*") ' + f'or label_replace({GITOPS_HELM_READY_PCT}, "signal", "HelmReleases Ready", "__name__", ".*") ' + f'or label_replace({GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT}, "signal", "Kustomizations Not Suspended", "__name__", ".*") ' + f'or label_replace({GITOPS_HELM_NOT_SUSPENDED_PCT}, "signal", "HelmReleases Not Suspended", "__name__", ".*")' + ) compact_current_text = {"titleSize": 11, "valueSize": 20} perfect_count_thresholds = { "mode": "absolute", @@ -1891,6 +1922,7 @@ def build_overview(): links=overview_link("atlas-power"), ), stacked=False, + fill_opacity=55, ) ) temp_panel = stat_panel( @@ -2049,25 +2081,22 @@ def build_overview(): "showPoints": "never", "spanNulls": True, } - panels.append( - timeseries_panel( - 141, - "Fan History (0-10)", - None, - {"h": 6, "w": 6, "x": 9, "y": 13}, - unit="none", - max_value=10, - targets=[ - {"refId": "A", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="1"}})', "legendFormat": "Outlet"}, - {"refId": "B", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="2"}})', "legendFormat": "Inlet - Inside"}, - {"refId": "C", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="3"}})', "legendFormat": "Inlet - Outside"}, - {"refId": "D", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="4"}})', "legendFormat": "Interior"}, - ], - legend_display="table", - legend_placement="right", - links=overview_link("atlas-power"), - ) + fan_panel = state_timeline_panel( + 141, + "Fan Intensity History", + fan_intensity_expr, + {"h": 6, "w": 6, "x": 9, "y": 13}, + unit="none", + min_value=0, + max_value=10, + legend="{{fan}}", + thresholds=fan_intensity_thresholds, + links=overview_link("atlas-power"), + description="Fan intensity lanes on the 0-10 controller scale. Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder.", ) + fan_panel["options"]["legend"] = {"displayMode": "table", "placement": "right"} + fan_panel["options"]["tooltip"] = {"mode": "multi", "sort": "none"} + panels.append(fan_panel) flux_source = stat_panel( 140, "Flux Source", @@ -2118,24 +2147,18 @@ def build_overview(): rail_panel["options"]["text"] = {"titleSize": 10, "valueSize": 19} panels.append(rail_panel) panels.append( - bargauge_panel( + state_timeline_panel( 150, "GitOps Health", - ( - f'label_replace({GITOPS_KUSTOMIZATION_READY_PCT}, "signal", "Kustomizations Ready", "__name__", ".*") ' - f'or label_replace({GITOPS_HELM_READY_PCT}, "signal", "HelmReleases Ready", "__name__", ".*") ' - f'or label_replace({GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT}, "signal", "Kustomizations Not Suspended", "__name__", ".*") ' - f'or label_replace({GITOPS_HELM_NOT_SUSPENDED_PCT}, "signal", "HelmReleases Not Suspended", "__name__", ".*")' - ), - {"h": 6, "w": 6, "x": 15, "y": 13}, + gitops_health_history_expr, + {"h": 6, "w": 6, "x": 15, "y": 7}, unit="percent", - instant=True, + min_value=0, + max_value=100, legend="{{signal}}", - sort_order="asc", thresholds=test_success_thresholds, - decimals=0, links=overview_link("atlas-gitops"), - description="Compact GitOps health: readiness plus suspension health for Kustomizations and HelmReleases.", + description="GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.", ) ) @@ -2174,23 +2197,20 @@ def build_overview(): ) panels.append(apply_bar_timeseries_style(ariadne_volume, stacked=False)) panels.append( - bargauge_panel( + state_timeline_panel( 46, "Gate Checks Passing by Suite", PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE, - {"h": 6, "w": 6, "x": 15, "y": 7}, + {"h": 6, "w": 6, "x": 15, "y": 13}, unit="percent", - instant=True, + min_value=0, + max_value=100, legend="{{suite}}", - sort_order="asc", thresholds=test_success_thresholds, - decimals=0, links=overview_link("atlas-testing"), + description="Percent of current gate dimensions passing per suite over time. There are seven gate dimensions, so 85.7% means one gate is failing.", ) ) - panels[-1]["description"] = ( - "Percent of current gate dimensions passing per suite. There are seven gate dimensions, so 85.7% means one gate is failing." - ) for panel_id, title, metric, x_pos, description in [ ( 142, @@ -4773,7 +4793,7 @@ def build_power_dashboard(): panels.append( timeseries_panel( 6, - "Fan History (0-10)", + "Fan Intensity History", None, {"h": 8, "w": 12, "x": 12, "y": 16}, unit="none", diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index d99585fe..daf72338 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -68,19 +68,36 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): panels_by_title = {panel["title"]: panel for panel in flatten_panels(dashboard["panels"])} assert dashboard["links"] == [ - {"title": "Open Atlas Testing", "url": "/d/atlas-testing", "targetBlank": True} + {"title": "Atlas Testing", "url": "/d/atlas-testing", "targetBlank": True} ] assert "atlas-jobs" not in repr(dashboard) assert "Platform Test Success Rate" not in panels_by_title - assert panels_by_title["Gate Checks Passing by Suite"]["type"] == "bargauge" - assert panels_by_title["Gate Checks Passing by Suite"]["options"]["displayMode"] == "basic" - assert panels_by_title["Gate Checks Passing by Suite"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 7} + assert panels_by_title["Gate Checks Passing by Suite"]["type"] == "state-timeline" + assert panels_by_title["Gate Checks Passing by Suite"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 13} + assert panels_by_title["Gate Checks Passing by Suite"]["targets"][0]["legendFormat"] == "{{suite}}" assert panels_by_title["UPS History (Power Draw)"]["gridPos"] == {"h": 6, "w": 6, "x": 3, "y": 7} assert panels_by_title["Ariadne Run Volume"]["gridPos"] == {"h": 6, "w": 6, "x": 9, "y": 7} assert panels_by_title["Pyrphoros UPS Current"]["gridPos"]["w"] == 3 assert panels_by_title["Current Enclosure Climate"]["gridPos"]["w"] == 3 assert panels_by_title["UPS History (Power Draw)"]["fieldConfig"]["defaults"]["custom"]["drawStyle"] == "bars" + assert panels_by_title["UPS History (Power Draw)"]["fieldConfig"]["defaults"]["custom"]["barWidthFactor"] == 0.72 + ups_overrides = panels_by_title["UPS History (Power Draw)"]["fieldConfig"]["overrides"] + assert any( + override["matcher"]["options"] == "Pyrphoros" + and override["properties"][0]["value"] == {"mode": "fixed", "fixedColor": "dark-blue"} + for override in ups_overrides + ) + assert any( + override["matcher"]["options"] == "Statera" + and override["properties"][0]["value"] == {"mode": "fixed", "fixedColor": "dark-yellow"} + for override in ups_overrides + ) assert panels_by_title["Ariadne Run Volume"]["fieldConfig"]["defaults"]["custom"]["drawStyle"] == "bars" + assert "Fan History (0-10)" not in panels_by_title + assert panels_by_title["Fan Intensity History"]["type"] == "state-timeline" + assert panels_by_title["Fan Intensity History"]["gridPos"] == {"h": 6, "w": 6, "x": 9, "y": 13} + assert panels_by_title["Fan Intensity History"]["fieldConfig"]["defaults"]["max"] == 10 + assert panels_by_title["Fan Intensity History"]["targets"][0]["legendFormat"] == "{{fan}}" assert panels_by_title["Flux Source"]["type"] == "stat" assert panels_by_title["Flux Source"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 7} @@ -88,12 +105,16 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): assert panels_by_title["Run Reliability (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 9} assert panels_by_title["Fresh Suites (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 13} assert panels_by_title["LOC Clean Suites"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 17} - assert panels_by_title["GitOps Health"]["type"] == "bargauge" - assert panels_by_title["GitOps Health"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 13} + assert panels_by_title["GitOps Health"]["type"] == "state-timeline" + assert panels_by_title["GitOps Health"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 7} gitops_expr = panels_by_title["GitOps Health"]["targets"][0]["expr"] assert "Kustomizations Not Suspended" in gitops_expr assert "HelmReleases Not Suspended" in gitops_expr + pvc_backup_expr = panels_by_title["PVC Backup Health / Age"]["targets"][0]["expr"] + assert "backup-telemetry-missing" in pvc_backup_expr + assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr + def test_render_configmap_writes(tmp_path): mod = load_module() diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 9b598c14..62b813aa 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1300,8 +1300,9 @@ "custom": { "drawStyle": "bars", "barAlignment": 0, + "barWidthFactor": 0.72, "lineWidth": 0, - "fillOpacity": 70, + "fillOpacity": 55, "gradientMode": "none", "showPoints": "never", "spanNulls": true @@ -1862,8 +1863,9 @@ }, { "id": 141, - "type": "timeseries", - "title": "Fan History (0-10)", + "type": "state-timeline", + "title": "Fan Intensity History", + "description": "Fan intensity lanes on the 0-10 controller scale. Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1876,40 +1878,62 @@ }, "targets": [ { + "expr": "label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"}), \"fan\", \"Outlet\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"}), \"fan\", \"Inlet - Inside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"}), \"fan\", \"Inlet - Outside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"}), \"fan\", \"Interior\", \"__name__\", \".*\")", "refId": "A", - "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})", - "legendFormat": "Outlet" - }, - { - "refId": "B", - "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})", - "legendFormat": "Inlet - Inside" - }, - { - "refId": "C", - "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})", - "legendFormat": "Inlet - Outside" - }, - { - "refId": "D", - "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})", - "legendFormat": "Interior" + "legendFormat": "{{fan}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "none", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-blue", + "value": null + }, + { + "color": "dark-green", + "value": 3 + }, + { + "color": "dark-yellow", + "value": 6 + }, + { + "color": "dark-orange", + "value": 8 + }, + { + "color": "dark-red", + "value": 10 + } + ] + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, "max": 10 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, "links": [ @@ -2409,8 +2433,9 @@ }, { "id": 150, - "type": "bargauge", + "type": "state-timeline", "title": "GitOps Health", + "description": "GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2419,14 +2444,13 @@ "h": 6, "w": 6, "x": 15, - "y": 13 + "y": 7 }, "targets": [ { - "expr": "sort(label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\"))", + "expr": "label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\")", "refId": "A", - "legendFormat": "{{signal}}", - "instant": true + "legendFormat": "{{signal}}" } ], "fieldConfig": { @@ -2435,8 +2459,6 @@ "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -2462,19 +2484,26 @@ } ] }, - "decimals": 0 + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { - "displayMode": "basic", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "mergeValues": true, + "showValue": "never", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" } }, "links": [ @@ -2483,18 +2512,6 @@ "url": "/d/atlas-gitops", "targetBlank": true } - ], - "description": "Compact GitOps health: readiness plus suspension health for Kustomizations and HelmReleases.", - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "asc" - } - } ] }, { @@ -2617,6 +2634,7 @@ "custom": { "drawStyle": "bars", "barAlignment": 0, + "barWidthFactor": 0.72, "lineWidth": 0, "fillOpacity": 70, "gradientMode": "none", @@ -2676,8 +2694,9 @@ }, { "id": 46, - "type": "bargauge", + "type": "state-timeline", "title": "Gate Checks Passing by Suite", + "description": "Percent of current gate dimensions passing per suite over time. There are seven gate dimensions, so 85.7% means one gate is failing.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2686,14 +2705,13 @@ "h": 6, "w": 6, "x": 15, - "y": 7 + "y": 13 }, "targets": [ { - "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)))", + "expr": "(100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"} > bool 0))), 1))", "refId": "A", - "legendFormat": "{{suite}}", - "instant": true + "legendFormat": "{{suite}}" } ], "fieldConfig": { @@ -2702,8 +2720,6 @@ "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -2729,19 +2745,26 @@ } ] }, - "decimals": 0 + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { - "displayMode": "basic", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "mergeValues": true, + "showValue": "never", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" } }, "links": [ @@ -2750,19 +2773,7 @@ "url": "/d/atlas-testing", "targetBlank": true } - ], - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "asc" - } - } - ], - "description": "Percent of current gate dimensions passing per suite. There are seven gate dimensions, so 85.7% means one gate is failing." + ] }, { "id": 142, @@ -2994,7 +3005,7 @@ }, "targets": [ { - "expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() vector(0))", + "expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() ((label_replace(label_replace(vector(999), \"namespace\", \"maintenance\", \"__name__\", \".*\"), \"pvc\", \"backup-telemetry-missing\", \"__name__\", \".*\")) unless on() ((count({__name__=~\"pvc_backup_(count|last_success_timestamp_seconds|health_reason)\",driver=\"restic\"})) > 0)))", "refId": "A", "legendFormat": "{{namespace}}/{{pvc}}", "instant": true @@ -4365,7 +4376,7 @@ "refresh": "1m", "links": [ { - "title": "Open Atlas Testing", + "title": "Atlas Testing", "url": "/d/atlas-testing", "targetBlank": true } diff --git a/services/monitoring/dashboards/atlas-power.json b/services/monitoring/dashboards/atlas-power.json index 412c411c..521a5a4d 100644 --- a/services/monitoring/dashboards/atlas-power.json +++ b/services/monitoring/dashboards/atlas-power.json @@ -253,6 +253,7 @@ "custom": { "drawStyle": "bars", "barAlignment": 0, + "barWidthFactor": 0.72, "lineWidth": 0, "fillOpacity": 70, "gradientMode": "none", @@ -618,7 +619,7 @@ { "id": 6, "type": "timeseries", - "title": "Fan History (0-10)", + "title": "Fan Intensity History", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index a8b4eb51..4f43f56f 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1309,8 +1309,9 @@ data: "custom": { "drawStyle": "bars", "barAlignment": 0, + "barWidthFactor": 0.72, "lineWidth": 0, - "fillOpacity": 70, + "fillOpacity": 55, "gradientMode": "none", "showPoints": "never", "spanNulls": true @@ -1871,8 +1872,9 @@ data: }, { "id": 141, - "type": "timeseries", - "title": "Fan History (0-10)", + "type": "state-timeline", + "title": "Fan Intensity History", + "description": "Fan intensity lanes on the 0-10 controller scale. Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1885,40 +1887,62 @@ data: }, "targets": [ { + "expr": "label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"}), \"fan\", \"Outlet\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"}), \"fan\", \"Inlet - Inside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"}), \"fan\", \"Inlet - Outside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"}), \"fan\", \"Interior\", \"__name__\", \".*\")", "refId": "A", - "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})", - "legendFormat": "Outlet" - }, - { - "refId": "B", - "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})", - "legendFormat": "Inlet - Inside" - }, - { - "refId": "C", - "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})", - "legendFormat": "Inlet - Outside" - }, - { - "refId": "D", - "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})", - "legendFormat": "Interior" + "legendFormat": "{{fan}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "none", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-blue", + "value": null + }, + { + "color": "dark-green", + "value": 3 + }, + { + "color": "dark-yellow", + "value": 6 + }, + { + "color": "dark-orange", + "value": 8 + }, + { + "color": "dark-red", + "value": 10 + } + ] + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, "max": 10 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, "links": [ @@ -2418,8 +2442,9 @@ data: }, { "id": 150, - "type": "bargauge", + "type": "state-timeline", "title": "GitOps Health", + "description": "GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2428,14 +2453,13 @@ data: "h": 6, "w": 6, "x": 15, - "y": 13 + "y": 7 }, "targets": [ { - "expr": "sort(label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\"))", + "expr": "label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\")", "refId": "A", - "legendFormat": "{{signal}}", - "instant": true + "legendFormat": "{{signal}}" } ], "fieldConfig": { @@ -2444,8 +2468,6 @@ data: "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -2471,19 +2493,26 @@ data: } ] }, - "decimals": 0 + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { - "displayMode": "basic", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "mergeValues": true, + "showValue": "never", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" } }, "links": [ @@ -2492,18 +2521,6 @@ data: "url": "/d/atlas-gitops", "targetBlank": true } - ], - "description": "Compact GitOps health: readiness plus suspension health for Kustomizations and HelmReleases.", - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "asc" - } - } ] }, { @@ -2626,6 +2643,7 @@ data: "custom": { "drawStyle": "bars", "barAlignment": 0, + "barWidthFactor": 0.72, "lineWidth": 0, "fillOpacity": 70, "gradientMode": "none", @@ -2685,8 +2703,9 @@ data: }, { "id": 46, - "type": "bargauge", + "type": "state-timeline", "title": "Gate Checks Passing by Suite", + "description": "Percent of current gate dimensions passing per suite over time. There are seven gate dimensions, so 85.7% means one gate is failing.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2695,14 +2714,13 @@ data: "h": 6, "w": 6, "x": 15, - "y": 7 + "y": 13 }, "targets": [ { - "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)))", + "expr": "(100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"} > bool 0))), 1))", "refId": "A", - "legendFormat": "{{suite}}", - "instant": true + "legendFormat": "{{suite}}" } ], "fieldConfig": { @@ -2711,8 +2729,6 @@ data: "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -2738,19 +2754,26 @@ data: } ] }, - "decimals": 0 + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { - "displayMode": "basic", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "mergeValues": true, + "showValue": "never", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" } }, "links": [ @@ -2759,19 +2782,7 @@ data: "url": "/d/atlas-testing", "targetBlank": true } - ], - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "asc" - } - } - ], - "description": "Percent of current gate dimensions passing per suite. There are seven gate dimensions, so 85.7% means one gate is failing." + ] }, { "id": 142, @@ -3003,7 +3014,7 @@ data: }, "targets": [ { - "expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() vector(0))", + "expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() ((label_replace(label_replace(vector(999), \"namespace\", \"maintenance\", \"__name__\", \".*\"), \"pvc\", \"backup-telemetry-missing\", \"__name__\", \".*\")) unless on() ((count({__name__=~\"pvc_backup_(count|last_success_timestamp_seconds|health_reason)\",driver=\"restic\"})) > 0)))", "refId": "A", "legendFormat": "{{namespace}}/{{pvc}}", "instant": true @@ -4374,7 +4385,7 @@ data: "refresh": "1m", "links": [ { - "title": "Open Atlas Testing", + "title": "Atlas Testing", "url": "/d/atlas-testing", "targetBlank": true } diff --git a/services/monitoring/grafana-dashboard-power.yaml b/services/monitoring/grafana-dashboard-power.yaml index cd5f315e..858cf699 100644 --- a/services/monitoring/grafana-dashboard-power.yaml +++ b/services/monitoring/grafana-dashboard-power.yaml @@ -262,6 +262,7 @@ data: "custom": { "drawStyle": "bars", "barAlignment": 0, + "barWidthFactor": 0.72, "lineWidth": 0, "fillOpacity": 70, "gradientMode": "none", @@ -627,7 +628,7 @@ data: { "id": 6, "type": "timeseries", - "title": "Fan History (0-10)", + "title": "Fan Intensity History", "datasource": { "type": "prometheus", "uid": "atlas-vm"