monitoring: bound atlas availability query
This commit is contained in:
parent
521eda1c00
commit
32ffe30145
@ -312,7 +312,7 @@ STUCK_TERMINATING_EXPR = (
|
|||||||
')) '
|
')) '
|
||||||
"or on() vector(0)"
|
"or on() vector(0)"
|
||||||
)
|
)
|
||||||
UPTIME_WINDOW = "365d"
|
UPTIME_WINDOW = "30d"
|
||||||
# Keep the subquery step coarse so we don't request an excessive number of points.
|
# Keep the subquery step coarse so we don't request an excessive number of points.
|
||||||
UPTIME_STEP = "1h"
|
UPTIME_STEP = "1h"
|
||||||
TRAEFIK_READY_EXPR = (
|
TRAEFIK_READY_EXPR = (
|
||||||
@ -1433,13 +1433,15 @@ def build_overview():
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 27,
|
"id": 27,
|
||||||
"title": "Atlas Availability",
|
"title": "Atlas Availability (30d)",
|
||||||
"expr": UPTIME_PERCENT_EXPR,
|
"expr": UPTIME_PERCENT_EXPR,
|
||||||
"kind": "stat",
|
"kind": "stat",
|
||||||
"thresholds": UPTIME_PERCENT_THRESHOLDS,
|
"thresholds": UPTIME_PERCENT_THRESHOLDS,
|
||||||
"unit": "percentunit",
|
"unit": "percentunit",
|
||||||
"decimals": 4,
|
"decimals": 4,
|
||||||
"text_mode": "value",
|
"text_mode": "value",
|
||||||
|
"instant": True,
|
||||||
|
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime.",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
@ -1509,14 +1511,16 @@ def build_overview():
|
|||||||
item["expr"],
|
item["expr"],
|
||||||
grid,
|
grid,
|
||||||
thresholds=item.get("thresholds"),
|
thresholds=item.get("thresholds"),
|
||||||
legend=None,
|
legend=None,
|
||||||
links=item.get("links"),
|
links=item.get("links"),
|
||||||
text_mode=item.get("text_mode", "value"),
|
text_mode=item.get("text_mode", "value"),
|
||||||
value_suffix=item.get("value_suffix"),
|
value_suffix=item.get("value_suffix"),
|
||||||
unit=item.get("unit", "none"),
|
unit=item.get("unit", "none"),
|
||||||
decimals=item.get("decimals"),
|
decimals=item.get("decimals"),
|
||||||
)
|
instant=item.get("instant", False),
|
||||||
)
|
description=item.get("description"),
|
||||||
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
panels.append(
|
panels.append(
|
||||||
gauge_panel(
|
gauge_panel(
|
||||||
|
|||||||
@ -50,6 +50,18 @@ def test_node_filter_and_expr_helpers():
|
|||||||
assert "node_memory_MemAvailable_bytes" in mem_expr
|
assert "node_memory_MemAvailable_bytes" in mem_expr
|
||||||
|
|
||||||
|
|
||||||
|
def test_overview_availability_panel_is_recent_and_instant():
|
||||||
|
mod = load_module()
|
||||||
|
dashboard = mod.build_overview()
|
||||||
|
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
|
||||||
|
|
||||||
|
assert panel["title"] == "Atlas Availability (30d)"
|
||||||
|
assert "[30d:1h]" in panel["targets"][0]["expr"]
|
||||||
|
assert "365d" not in panel["targets"][0]["expr"]
|
||||||
|
assert panel["targets"][0]["instant"] is True
|
||||||
|
assert "pre-metric history" in panel["description"]
|
||||||
|
|
||||||
|
|
||||||
def test_render_configmap_writes(tmp_path):
|
def test_render_configmap_writes(tmp_path):
|
||||||
mod = load_module()
|
mod = load_module()
|
||||||
mod.DASHBOARD_DIR = tmp_path / "dash"
|
mod.DASHBOARD_DIR = tmp_path / "dash"
|
||||||
|
|||||||
@ -213,7 +213,7 @@
|
|||||||
{
|
{
|
||||||
"id": 27,
|
"id": 27,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Atlas Availability",
|
"title": "Atlas Availability (30d)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -226,8 +226,9 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])",
|
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
|
||||||
"refId": "A"
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
@ -281,7 +282,8 @@
|
|||||||
"values": false
|
"values": false
|
||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
}
|
},
|
||||||
|
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
|
|||||||
@ -222,7 +222,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 27,
|
"id": 27,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Atlas Availability",
|
"title": "Atlas Availability (30d)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -235,8 +235,9 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])",
|
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
|
||||||
"refId": "A"
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
@ -290,7 +291,8 @@ data:
|
|||||||
"values": false
|
"values": false
|
||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
}
|
},
|
||||||
|
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user