diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index d5ff261e..4e4435f4 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1242,6 +1242,7 @@ def bargauge_panel( instant=False, overrides=None, data_links=None, + include_color=True, ): """Return a bar gauge panel with label-aware reduction.""" cleaned_expr = expr.strip() @@ -1250,6 +1251,26 @@ def bargauge_panel( expr = f"sort_desc({expr})" elif sort_order == "asc": expr = f"sort({expr})" + defaults = {} + if include_color: + defaults["color"] = {"mode": "thresholds"} + defaults.update( + { + "unit": unit, + "min": 0, + "max": 100 if unit == "percent" else None, + "thresholds": thresholds + or { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 50}, + {"color": "orange", "value": 70}, + {"color": "red", "value": 85}, + ], + }, + } + ) panel = { "id": panel_id, "type": "bargauge", @@ -1265,22 +1286,7 @@ def bargauge_panel( } ], "fieldConfig": { - "defaults": { - "color": {"mode": "thresholds"}, - "unit": unit, - "min": 0, - "max": 100 if unit == "percent" else None, - "thresholds": thresholds - or { - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 50}, - {"color": "orange", "value": 70}, - {"color": "red", "value": 85}, - ], - }, - }, + "defaults": defaults, "overrides": [], }, "options": { @@ -1368,6 +1374,11 @@ def link_to(uid): ] +def overview_link_to(uid): + """Return the historical Overview dashboard link label.""" + return [{"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}] + + # --------------------------------------------------------------------------- # Dashboard builders # --------------------------------------------------------------------------- @@ -1375,6 +1386,52 @@ def link_to(uid): def build_overview(): panels = [] + overview_link = overview_link_to + climate_drop_labels = "job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group" + climate_temp_series = f"max without ({climate_drop_labels}) (typhon_temperature_celsius != 0)" + climate_humidity_series = f"max without ({climate_drop_labels}) (typhon_relative_humidity_percent != 0)" + climate_pressure_series = f"max without ({climate_drop_labels}) (typhon_vpd_kpa != 0)" + overview_pvc_backup_age = ( + 'max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver="restic"}) / 3600) ' + 'or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver="restic",reason=~"missing|no_completed|lookup_failed|unknown_timestamp"} > 0) ' + '* (pvc_backup_count{driver="restic"} > bool 0)) * 999))) or on() vector(0)' + ) + + def overview_metric_pair_expr(first_expr, first_name, second_expr, second_name): + return ( + f'label_replace({first_expr}, "metric", "{first_name}", "__name__", ".*") ' + f'or label_replace({second_expr}, "metric", "{second_name}", "__name__", ".*")' + ) + + def overview_platform_test_success_targets(): + suites = [ + ("ariadne", "ariadne"), + ("metis", "metis"), + ("ananke", "ananke"), + ("atlasbot", "atlasbot"), + ("lesavka", "lesavka"), + ("pegasus", "pegasus|pegasus-health|pegasus_health"), + ("soteria", "soteria"), + ("titan-iac", "titan-iac|titan_iac"), + ("bstein-home", "bstein-home|bstein_home"), + ("arcanagon", "arcanagon"), + ("data-prepper", "data-prepper|data_prepper"), + ] + targets = [] + for index, (legend, suite_regex) in enumerate(suites): + total = f'sum(increase(platform_quality_gate_runs_total{{suite=~"{suite_regex}"}}[1h]))' + passed = ( + f'sum(increase(platform_quality_gate_runs_total{{suite=~"{suite_regex}",' + f'status=~"{PLATFORM_TEST_SUCCESS_STATUS}"}}[1h]))' + ) + targets.append( + { + "refId": chr(ord("A") + index), + "expr": f"(100 * ({passed}) / clamp_min(({total}), 1)) and on() (({total}) > 0) or on() vector(0)", + "legendFormat": legend, + } + ) + return targets age_thresholds = { "mode": "absolute", @@ -1415,7 +1472,7 @@ def build_overview(): {"color": "red", "value": 3}, ], }, - "links": link_to("atlas-pods"), + "links": overview_link("atlas-pods"), }, { "id": 5, @@ -1431,7 +1488,7 @@ def build_overview(): {"color": "red", "value": 3}, ], }, - "links": link_to("atlas-pods"), + "links": overview_link("atlas-pods"), }, { "id": 27, @@ -1459,7 +1516,7 @@ def build_overview(): {"color": "red", "value": 3}, ], }, - "links": link_to("atlas-pods"), + "links": overview_link("atlas-pods"), }, { "id": 6, @@ -1475,7 +1532,7 @@ def build_overview(): {"color": "red", "value": 3}, ], }, - "links": link_to("atlas-pods"), + "links": overview_link("atlas-pods"), }, { "id": 1, @@ -1560,7 +1617,7 @@ def build_overview(): text_mode="name_and_value" if is_hottest_panel else "value", legend="{{node}}" if is_hottest_panel else None, instant=is_hottest_panel, - links=link_to("atlas-storage" if panel_id in {23, 24, 25, 26} else "atlas-nodes"), + links=overview_link("atlas-storage" if panel_id in {23, 24, 25, 26} else "atlas-nodes"), ) ) @@ -1591,189 +1648,259 @@ def build_overview(): {"color": "green", "value": 98}, ], } - status_mapping = [ - { - "type": "value", - "options": { - "0": {"text": "⚡ Charging"}, - "1": {"text": "🔋 Discharging"}, - }, - } - ] - - panels.append( - stat_panel( - 40, - "UPS Current Load", + ups_text = {"titleSize": 14, "valueSize": 30} + for panel_id, title, draw_expr, runtime_expr, y_pos in [ + (40, "Pyrphoros UPS Current", ANANKE_UPS_DRAW_WATTS_DB, ANANKE_UPS_RUNTIME_DB, 7), + (144, "Statera UPS Current", ANANKE_UPS_DRAW_WATTS_TETHYS, ANANKE_UPS_RUNTIME_TETHYS, 10), + ]: + panel = stat_panel( + panel_id, + title, None, - {"h": 6, "w": 4, "x": 0, "y": 12}, + {"h": 3, "w": 6, "x": 0, "y": y_pos}, unit="none", - decimals=1, text_mode="name_and_value", targets=[ - {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Draw (W)", "instant": True}, - {"refId": "B", "expr": ANANKE_UPS_RUNTIME_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Discharge", "instant": True}, - {"refId": "C", "expr": ANANKE_UPS_ON_BATTERY_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Status", "instant": True}, - {"refId": "D", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)", "instant": True}, - {"refId": "E", "expr": ANANKE_UPS_RUNTIME_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Discharge", "instant": True}, - {"refId": "F", "expr": ANANKE_UPS_ON_BATTERY_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Status", "instant": True}, + { + "expr": overview_metric_pair_expr(draw_expr, "Draw", runtime_expr, "Runtime"), + "refId": "A", + "legendFormat": "{{metric}}", + "instant": True, + } ], field_overrides=[ - { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Draw (W)"}, - "properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}], - }, - { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)"}, - "properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], - }, - { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Discharge"}, - "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}], - }, - { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Discharge"}, - "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], - }, - { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Status"}, - "properties": [{"id": "mappings", "value": status_mapping}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}], - }, - { - "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Status"}, - "properties": [{"id": "mappings", "value": status_mapping}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], - }, + {"matcher": {"id": "byName", "options": "Draw"}, "properties": [{"id": "unit", "value": "watt"}]}, + {"matcher": {"id": "byName", "options": "Runtime"}, "properties": [{"id": "unit", "value": "s"}]}, ], - orientation="horizontal", - wide_layout=True, - links=link_to("atlas-power"), - description="Per-UPS live snapshot: current draw, discharge, and charging/discharging status.", + links=overview_link("atlas-power"), ) - ) + panel["options"]["text"] = ups_text + panels.append(panel) + panels.append( timeseries_panel( 41, "UPS History (Power Draw)", None, - {"h": 6, "w": 4, "x": 4, "y": 12}, + {"h": 6, "w": 6, "x": 6, "y": 7}, unit="watt", targets=[ {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": ANANKE_UPS_DB_NAME}, {"refId": "B", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, "legendFormat": ANANKE_UPS_TETHYS_NAME}, ], - legend_display="list", - legend_placement="bottom", - links=link_to("atlas-power"), + legend_display="table", + legend_placement="right", + links=overview_link("atlas-power"), ) ) - panels.append( - stat_panel( - 42, - "Current Climate", - None, - {"h": 6, "w": 4, "x": 8, "y": 12}, - unit="none", - decimals=2, - text_mode="value", - targets=[ - {"refId": "A", "expr": CLIMATE_TEMP_MAX, "legendFormat": "Tent Temp (°C)", "instant": True}, - {"refId": "B", "expr": CLIMATE_PRESSURE_CURRENT, "legendFormat": "Tent VPD (kPa)", "instant": True}, - {"refId": "C", "expr": CLIMATE_HUMIDITY_MAX, "legendFormat": "Tent RH (%)", "instant": True}, - {"refId": "D", "expr": CLIMATE_DEWPOINT_CURRENT, "legendFormat": "Dew Point (°C)", "instant": True}, - ], - field_overrides=[ - {"matcher": {"id": "byName", "options": "Tent Temp (°C)"}, "properties": [{"id": "unit", "value": "celsius"}]}, - {"matcher": {"id": "byName", "options": "Tent VPD (kPa)"}, "properties": [{"id": "unit", "value": "suffix:kPa"}]}, - {"matcher": {"id": "byName", "options": "Tent RH (%)"}, "properties": [{"id": "unit", "value": "percent"}]}, - {"matcher": {"id": "byName", "options": "Dew Point (°C)"}, "properties": [{"id": "unit", "value": "celsius"}]}, - ], - links=link_to("atlas-power"), - description="Current tent temperature, humidity, VPD, and dew point.", - orientation="horizontal", - wide_layout=True, - ) + temp_panel = stat_panel( + 42, + "Current Enclosure Temperature", + None, + {"h": 3, "w": 6, "x": 0, "y": 13}, + unit="none", + text_mode="name_and_value", + targets=[ + { + "expr": overview_metric_pair_expr( + f"max({climate_temp_series}) or on() vector(0)", + "°C", + f"max(({climate_temp_series}) * 9 / 5 + 32) or on() vector(0)", + "°F", + ), + "refId": "A", + "legendFormat": "{{metric}}", + "instant": True, + } + ], + field_overrides=[ + {"matcher": {"id": "byName", "options": "°C"}, "properties": [{"id": "unit", "value": "celsius"}]}, + {"matcher": {"id": "byName", "options": "°F"}, "properties": [{"id": "unit", "value": "fahrenheit"}]}, + ], + links=overview_link("atlas-power"), ) + temp_panel["options"]["text"] = ups_text + panels.append(temp_panel) + climate_panel = stat_panel( + 143, + "Current Enclosure Climate", + None, + {"h": 3, "w": 6, "x": 0, "y": 16}, + unit="none", + text_mode="name_and_value", + targets=[ + { + "expr": overview_metric_pair_expr( + f"max({climate_humidity_series}) or on() vector(0)", + "%RH", + f"max({climate_pressure_series}) or on() vector(0)", + "kPa", + ), + "refId": "A", + "legendFormat": "{{metric}}", + "instant": True, + } + ], + field_overrides=[ + {"matcher": {"id": "byName", "options": "%RH"}, "properties": [{"id": "unit", "value": "suffix:%RH"}]}, + {"matcher": {"id": "byName", "options": "kPa"}, "properties": [{"id": "unit", "value": "suffix:kPa"}]}, + ], + links=overview_link("atlas-power"), + ) + climate_panel["options"]["text"] = ups_text + panels.append(climate_panel) panels.append( timeseries_panel( 43, - "Climate History", + "Enclosure Climate History", None, - {"h": 6, "w": 4, "x": 12, "y": 12}, - unit="celsius", + {"h": 6, "w": 6, "x": 6, "y": 13}, + unit="none", targets=[ - {"refId": "A", "expr": CLIMATE_TEMP_SERIES, "legendFormat": "Temperature (°C)"}, - {"refId": "B", "expr": CLIMATE_HUMIDITY_SERIES, "legendFormat": "Humidity (%)"}, - {"refId": "C", "expr": CLIMATE_PRESSURE_SERIES, "legendFormat": "VPD (kPa)"}, - {"refId": "D", "expr": CLIMATE_DEWPOINT_SERIES, "legendFormat": "Dew Point (°C)"}, + {"refId": "A", "expr": climate_temp_series, "legendFormat": "C"}, + {"refId": "B", "expr": climate_humidity_series, "legendFormat": "RH"}, + {"refId": "C", "expr": climate_pressure_series, "legendFormat": "P"}, + {"refId": "D", "expr": f"(min_over_time({climate_temp_series}[$__range]) - 0.08)", "legendFormat": "C bound min"}, + {"refId": "E", "expr": f"(max_over_time({climate_temp_series}[$__range]) + 0.08)", "legendFormat": "C bound max"}, + {"refId": "F", "expr": f"clamp_min((min_over_time({climate_humidity_series}[$__range]) - 0.35), 0)", "legendFormat": "RH bound min"}, + {"refId": "G", "expr": f"clamp_max((max_over_time({climate_humidity_series}[$__range]) + 0.35), 100)", "legendFormat": "RH bound max"}, + {"refId": "H", "expr": f"clamp_min((min_over_time({climate_pressure_series}[$__range]) - 0.03), 0)", "legendFormat": "P bound min"}, + {"refId": "I", "expr": f"(max_over_time({climate_pressure_series}[$__range]) + 0.03)", "legendFormat": "P bound max"}, ], field_overrides=[ { - "matcher": {"id": "byName", "options": "Humidity (%)"}, + "matcher": {"id": "byName", "options": "C"}, "properties": [ - {"id": "unit", "value": "percent"}, + {"id": "unit", "value": "suffix:°C"}, + {"id": "decimals", "value": 2}, + {"id": "custom.axisPlacement", "value": "left"}, + {"id": "custom.axisCenteredZero", "value": False}, ], }, { - "matcher": {"id": "byName", "options": "VPD (kPa)"}, + "matcher": {"id": "byRegexp", "options": "C bound .*"}, "properties": [ - {"id": "unit", "value": "none"}, - {"id": "custom.axisPlacement", "value": "right"}, - {"id": "custom.axisLabel", "value": "kPa"}, - {"id": "decimals", "value": 2}, + {"id": "unit", "value": "suffix:°C"}, + {"id": "custom.axisPlacement", "value": "left"}, + {"id": "custom.axisCenteredZero", "value": False}, + {"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}}, + {"id": "custom.lineWidth", "value": 0}, + {"id": "custom.fillOpacity", "value": 0}, + {"id": "custom.showPoints", "value": "never"}, + {"id": "color", "value": {"mode": "fixed", "fixedColor": "transparent"}}, ], - } + }, + { + "matcher": {"id": "byName", "options": "RH"}, + "properties": [ + {"id": "unit", "value": "suffix:%"}, + {"id": "decimals", "value": 2}, + {"id": "custom.axisPlacement", "value": "right"}, + {"id": "custom.axisCenteredZero", "value": False}, + ], + }, + { + "matcher": {"id": "byRegexp", "options": "RH bound .*"}, + "properties": [ + {"id": "unit", "value": "suffix:%"}, + {"id": "custom.axisPlacement", "value": "right"}, + {"id": "custom.axisCenteredZero", "value": False}, + {"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}}, + {"id": "custom.lineWidth", "value": 0}, + {"id": "custom.fillOpacity", "value": 0}, + {"id": "custom.showPoints", "value": "never"}, + {"id": "color", "value": {"mode": "fixed", "fixedColor": "transparent"}}, + ], + }, + { + "matcher": {"id": "byName", "options": "P"}, + "properties": [ + {"id": "unit", "value": "suffix:kPa"}, + {"id": "custom.axisPlacement", "value": "right"}, + {"id": "decimals", "value": 2}, + {"id": "custom.axisCenteredZero", "value": False}, + ], + }, + { + "matcher": {"id": "byRegexp", "options": "P bound .*"}, + "properties": [ + {"id": "unit", "value": "suffix:kPa"}, + {"id": "custom.axisPlacement", "value": "right"}, + {"id": "custom.axisCenteredZero", "value": False}, + {"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}}, + {"id": "custom.lineWidth", "value": 0}, + {"id": "custom.fillOpacity", "value": 0}, + {"id": "custom.showPoints", "value": "never"}, + {"id": "color", "value": {"mode": "fixed", "fixedColor": "transparent"}}, + ], + }, ], legend_display="list", legend_placement="bottom", - links=link_to("atlas-power"), - description="Two-axis chart: tent temperature/humidity/dew point (left axis) and VPD in kPa (right axis).", + links=overview_link("atlas-power"), + description="Temperature on left axis, humidity and pressure on right axis with dynamic bound series so small swings remain visible.", ) ) - panels.append( - stat_panel( - 140, - "Fan Activity", - None, - {"h": 6, "w": 4, "x": 16, "y": 12}, - unit="none", - decimals=0, - text_mode="name_and_value", - targets=[ - {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Inside Outlet", "instant": True}, - {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True}, - {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True}, - {"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior Fans", "instant": True}, + panels[-1]["fieldConfig"]["defaults"]["custom"] = { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "never", + "spanNulls": True, + } + fan_current = ( + f'label_replace((round(max({climate_drop_labels and "max without (" + climate_drop_labels + ")"} (typhon_fan_speed_level{{port="1"}})) or on() vector(0))), "metric", "Outlet", "__name__", ".*") ' + f'or label_replace((round(max({"max without (" + climate_drop_labels + ")"} (typhon_fan_speed_level{{port="2"}})) or on() vector(0))), "metric", "Inlet - In", "__name__", ".*") ' + f'or label_replace((round(max({"max without (" + climate_drop_labels + ")"} (typhon_fan_speed_level{{port="3"}})) or on() vector(0))), "metric", "Inlet - Out", "__name__", ".*") ' + f'or label_replace((round(max({"max without (" + climate_drop_labels + ")"} (typhon_fan_speed_level{{port="4"}})) or on() vector(0))), "metric", "Interior", "__name__", ".*")' + ) + fan_panel = stat_panel( + 140, + "Fan Activity", + None, + {"h": 6, "w": 6, "x": 12, "y": 13}, + unit="none", + decimals=0, + text_mode="name_and_value", + targets=[{"expr": fan_current, "refId": "A", "legendFormat": "{{metric}}", "instant": True}], + field_overrides=[ + {"matcher": {"id": "byName", "options": "Outlet"}, "properties": [{"id": "decimals", "value": 0}]}, + {"matcher": {"id": "byName", "options": "Inlet - In"}, "properties": [{"id": "decimals", "value": 0}]}, + {"matcher": {"id": "byName", "options": "Inlet - Out"}, "properties": [{"id": "decimals", "value": 0}]}, + {"matcher": {"id": "byName", "options": "Interior"}, "properties": [{"id": "decimals", "value": 0}]}, + ], + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 7}, + {"color": "red", "value": 9}, ], - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 7}, - {"color": "red", "value": 9}, - ], - }, - orientation="horizontal", - wide_layout=True, - links=link_to("atlas-power"), - ) + }, + orientation="vertical", + wide_layout=False, + links=overview_link("atlas-power"), ) + fan_panel["options"]["text"] = {"valueSize": 26} + panels.append(fan_panel) panels.append( timeseries_panel( 141, "Fan History (0-10)", None, - {"h": 6, "w": 4, "x": 20, "y": 12}, + {"h": 6, "w": 6, "x": 18, "y": 13}, unit="none", max_value=10, targets=[ - {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Inside Outlet"}, - {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"}, - {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"}, - {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior Fans"}, + {"refId": "A", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="1"}})', "legendFormat": "Outlet"}, + {"refId": "B", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="2"}})', "legendFormat": "Inlet - Inside"}, + {"refId": "C", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="3"}})', "legendFormat": "Inlet - Outside"}, + {"refId": "D", "expr": f'max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="4"}})', "legendFormat": "Interior"}, ], - legend_display="list", - legend_placement="bottom", - links=link_to("atlas-power"), + legend_display="table", + legend_placement="right", + links=overview_link("atlas-power"), ) ) @@ -1782,14 +1909,15 @@ def build_overview(): 44, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, - {"h": 5, "w": 6, "x": 0, "y": 7}, + {"h": 5, "w": 8, "x": 0, "y": 32}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", thresholds=age_thresholds, limit=12, decimals=2, - links=link_to("atlas-jobs"), + links=overview_link("atlas-jobs"), + include_color=False, ) ) panels.append( @@ -1798,10 +1926,10 @@ def build_overview(): "type": "timeseries", "title": "Ariadne Attempts / Failures", "datasource": PROM_DS, - "gridPos": {"h": 5, "w": 6, "x": 6, "y": 7}, + "gridPos": {"h": 6, "w": 6, "x": 12, "y": 7}, "targets": [ - {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, + {"expr": f"{ARIADNE_TASK_ATTEMPTS_SERIES} or on() vector(0)", "refId": "A", "legendFormat": "Attempts"}, + {"expr": f"{ARIADNE_TASK_FAILURES_SERIES} or on() vector(0)", "refId": "B", "legendFormat": "Failures"}, ], "fieldConfig": { "defaults": {"unit": "none"}, @@ -1824,20 +1952,20 @@ def build_overview(): "legend": {"displayMode": "table", "placement": "right"}, "tooltip": {"mode": "multi"}, }, - "links": link_to("atlas-jobs"), + "links": overview_link("atlas-jobs"), } ) test_success = timeseries_panel( 46, "Platform Test Success Rate", None, - {"h": 5, "w": 6, "x": 12, "y": 7}, + {"h": 6, "w": 6, "x": 18, "y": 7}, unit="percent", - targets=PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS, + targets=overview_platform_test_success_targets(), legend_display="table", legend_placement="right", legend_calcs=["lastNotNull"], - links=link_to("atlas-jobs"), + links=overview_link("atlas-testing"), ) test_success["fieldConfig"]["defaults"]["min"] = 0 test_success["fieldConfig"]["defaults"]["max"] = 100 @@ -1855,12 +1983,94 @@ def build_overview(): "Per-run interval pass points (0-100) for each software suite over the last 7 days. Points are connected to show trend; missing-run intervals are ignored." ) panels.append(test_success) + for panel_id, title, metric, x_pos, description in [ + ( + 142, + "Jenkins Last Success (h, newest first)", + "ariadne_jenkins_build_weather_job_last_success_timestamp_seconds", + 8, + "Top 6 most recent Jenkins successes by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list.", + ), + ( + 243, + "Jenkins Last Failure (h, newest first)", + "ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds", + 12, + "Top 6 most recent Jenkins failures by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list.", + ), + ]: + base_expr = f"min by (exported_job,job_url,weather_icon) ((time() - {metric}) / 3600)" + topk_expr = f"sort(bottomk(6, {base_expr}))" + success_expr = ( + f'label_replace(({topk_expr}) and on(exported_job,job_url,weather_icon) ' + '(max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) == 1), ' + '"run_state", "ok", "exported_job", ".*")' + ) + failure_expr = ( + f'label_replace(({topk_expr}) and on(exported_job,job_url,weather_icon) ' + '(max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) != 1), ' + '"run_state", "bad", "exported_job", ".*")' + ) + panels.append( + { + "id": panel_id, + "type": "stat", + "title": title, + "datasource": PROM_DS, + "gridPos": {"h": 5, "w": 4, "x": x_pos, "y": 32}, + "targets": [ + { + "refId": "A", + "expr": f"sort(({success_expr}) or ({failure_expr}))", + "instant": True, + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "decimals": 1, + "min": 0, + "displayName": "${__field.labels.weather_icon} ${__field.labels.exported_job}", + "links": [ + { + "title": "Open Jenkins job", + "url": "https://ci.bstein.dev/job/${__field.labels.exported_job}/", + "targetBlank": True, + } + ], + }, + "overrides": [ + { + "matcher": {"id": "byRegexp", "options": '.*run_state="ok".*'}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}], + }, + { + "matcher": {"id": "byRegexp", "options": '.*run_state="bad".*'}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], + }, + ], + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "left", + "orientation": "horizontal", + "wideLayout": True, + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + "textMode": "name_and_value", + "text": {"titleSize": 11, "valueSize": 11}, + }, + "transformations": [{"id": "sortBy", "options": {"fields": ["Value"], "order": "asc"}}], + "links": overview_link("atlas-jobs"), + "description": description, + } + ) panels.append( bargauge_panel( 47, "PVC Backup Health / Age", - PVC_BACKUP_AGE_HOURS_BY_PVC, - {"h": 5, "w": 6, "x": 18, "y": 7}, + overview_pvc_backup_age, + {"h": 5, "w": 8, "x": 16, "y": 32}, unit="h", instant=True, legend="{{namespace}}/{{pvc}}", @@ -1874,11 +2084,12 @@ def build_overview(): {"color": "red", "value": 50}, ], }, + include_color=False, ) ) - panels[-1]["links"] = link_to("atlas-storage") + panels[-1]["links"] = overview_link("atlas-storage") panels[-1]["description"] = ( - "Oldest successful backup age in hours by PVC with nightly cadence thresholds (green <=20h, yellow <40h, orange <50h, red >=50h). PVCs with missing or unhealthy backup state are forced to 999h so critical bars stay visible." + "Backup age in hours computed from last-success timestamps for restic-managed PVCs (nightly target: <=20h green, <40h yellow, <50h orange, >=50h red). PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility." ) panels.append( @@ -1886,9 +2097,9 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 2, "w": 4, "x": 0, "y": 18}, + {"h": 2, "w": 4, "x": 0, "y": 19}, unit="none", - links=link_to("atlas-mail"), + links=overview_link("atlas-mail"), ) ) panels.append( @@ -1897,7 +2108,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 2, "w": 4, "x": 8, "y": 18}, + "gridPos": {"h": 2, "w": 4, "x": 8, "y": 19}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1935,7 +2146,7 @@ def build_overview(): "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "textMode": "name_and_value", }, - "links": link_to("atlas-mail"), + "links": overview_link("atlas-mail"), } ) panels.append( @@ -1943,11 +2154,11 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 2, "w": 4, "x": 4, "y": 18}, + {"h": 2, "w": 4, "x": 4, "y": 19}, unit="percent", thresholds=mail_success_thresholds, decimals=1, - links=link_to("atlas-mail"), + links=overview_link("atlas-mail"), ) ) panels.append( @@ -1955,11 +2166,11 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 2, "w": 4, "x": 12, "y": 18}, + {"h": 2, "w": 4, "x": 12, "y": 19}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, - links=link_to("atlas-mail"), + links=overview_link("atlas-mail"), ) ) panels.append( @@ -1967,7 +2178,7 @@ def build_overview(): 34, "Postgres Connections Used", POSTGRES_CONN_USED, - {"h": 2, "w": 4, "x": 16, "y": 18}, + {"h": 2, "w": 4, "x": 16, "y": 19}, decimals=0, text_mode="name_and_value", legend="{{conn}}", @@ -1979,7 +2190,7 @@ def build_overview(): 35, "Postgres Hottest Connections", POSTGRES_CONN_HOTTEST, - {"h": 2, "w": 4, "x": 20, "y": 18}, + {"h": 2, "w": 4, "x": 20, "y": 19}, unit="none", decimals=0, text_mode="name_and_value", @@ -2029,13 +2240,13 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 39}, + {"h": 12, "w": 12, "x": 0, "y": 44}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", - links=link_to("atlas-nodes"), + links=overview_link("atlas-nodes"), ) ) panels.append( @@ -2043,13 +2254,13 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 39}, + {"h": 12, "w": 12, "x": 12, "y": 44}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", - links=link_to("atlas-nodes"), + links=overview_link("atlas-nodes"), ) ) @@ -2058,7 +2269,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 51}, + {"h": 10, "w": 12, "x": 0, "y": 56}, unit="percent", legend="{{node}}", legend_display="table", @@ -2070,7 +2281,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 51}, + {"h": 10, "w": 12, "x": 12, "y": 56}, unit="percent", legend="{{node}}", legend_display="table", @@ -2083,7 +2294,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 61}, + {"h": 10, "w": 12, "x": 0, "y": 66}, ) ) panels.append( @@ -2091,7 +2302,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 61}, + {"h": 10, "w": 12, "x": 12, "y": 66}, unit="none", limit=12, decimals=0, @@ -2105,6 +2316,7 @@ def build_overview(): ], }, instant=True, + include_color=False, ) ) @@ -2113,12 +2325,12 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 32}, + {"h": 7, "w": 8, "x": 0, "y": 37}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", legend_placement="bottom", - links=link_to("atlas-network"), + links=overview_link("atlas-network"), ) ) panels.append( @@ -2126,12 +2338,12 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 32}, + {"h": 7, "w": 8, "x": 8, "y": 37}, unit="Bps", legend="Egress (Traefik)", legend_display="list", legend_placement="bottom", - links=link_to("atlas-network"), + links=overview_link("atlas-network"), ) ) panels.append( @@ -2139,12 +2351,12 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 32}, + {"h": 7, "w": 8, "x": 16, "y": 37}, unit="Bps", legend="Internal traffic", legend_display="list", legend_placement="bottom", - links=link_to("atlas-network"), + links=overview_link("atlas-network"), ) ) @@ -2153,14 +2365,14 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 71}, + {"h": 16, "w": 12, "x": 0, "y": 76}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", time_from="30d", - links=link_to("atlas-storage"), + links=overview_link("atlas-storage"), ) ) panels.append( @@ -2168,14 +2380,14 @@ def build_overview(): 22, "Nodes Closest to Full Astraios Disks", astraios_usage_expr(), - {"h": 16, "w": 12, "x": 12, "y": 71}, + {"h": 16, "w": 12, "x": 12, "y": 76}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", time_from="1w", - links=link_to("atlas-storage"), + links=overview_link("atlas-storage"), ) ) return { @@ -2197,13 +2409,7 @@ def build_overview(): }, "time": {"from": "now-1h", "to": "now"}, "refresh": "1m", - "links": [ - { - "title": "Atlas Testing (Internal)", - "url": "/d/atlas-jobs", - "targetBlank": False, - } - ], + "links": [], } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 9bd48211..a20bff4c 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -479,7 +479,7 @@ "overrides": [] }, "options": { - "displayMode": "basic", + "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": [ diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index a5738409..419b2839 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -488,7 +488,7 @@ data: "overrides": [] }, "options": { - "displayMode": "basic", + "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": [