feature/atlas-monitoring #3
@ -165,22 +165,22 @@ def node_io_expr(scope=""):
|
|||||||
return scoped_node_expr(base, scope)
|
return scoped_node_expr(base, scope)
|
||||||
|
|
||||||
|
|
||||||
def namespace_cpu_share_expr():
|
def namespace_share_expr(resource_expr):
|
||||||
selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
|
||||||
total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
|
total = f"clamp_min(sum( {resource_expr} ), 1)"
|
||||||
return f"100 * ( {selected} ) / {total}"
|
return f"100 * ( {selected} ) / {total}"
|
||||||
|
|
||||||
|
|
||||||
|
def namespace_cpu_share_expr():
|
||||||
|
return namespace_share_expr(NAMESPACE_CPU_RAW)
|
||||||
|
|
||||||
|
|
||||||
def namespace_ram_share_expr():
|
def namespace_ram_share_expr():
|
||||||
selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
return namespace_share_expr(NAMESPACE_RAM_RAW)
|
||||||
total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
|
|
||||||
return f"100 * ( {selected} ) / {total}"
|
|
||||||
|
|
||||||
|
|
||||||
def namespace_gpu_share_expr():
|
def namespace_gpu_share_expr():
|
||||||
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
return namespace_share_expr(NAMESPACE_GPU_RAW)
|
||||||
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
|
|
||||||
return f"100 * ( {selected} ) / {total}"
|
|
||||||
|
|
||||||
|
|
||||||
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||||
@ -228,35 +228,47 @@ NAMESPACE_GPU_ALLOC = (
|
|||||||
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
|
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
|
||||||
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
|
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
|
||||||
)
|
)
|
||||||
NAMESPACE_GPU_USAGE = (
|
NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
|
||||||
'sum(rate(container_accelerator_duty_cycle{namespace!="",accelerator="nvidia.com/gpu"}[5m])) by (namespace)'
|
|
||||||
)
|
|
||||||
NAMESPACE_GPU_RAW = (
|
NAMESPACE_GPU_RAW = (
|
||||||
"("
|
"("
|
||||||
+ NAMESPACE_GPU_USAGE
|
+ NAMESPACE_GPU_USAGE
|
||||||
+ ") or on(namespace) ("
|
+ ") or on(namespace) ("
|
||||||
+ NAMESPACE_GPU_ALLOC
|
+ NAMESPACE_CPU_RAW
|
||||||
+ " * 0)"
|
+ " * 0)"
|
||||||
)
|
)
|
||||||
NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC
|
NAMESPACE_GPU_WEIGHT = (
|
||||||
NAMESPACE_COMBINED_FILTER = (
|
"("
|
||||||
'topk(10, ('
|
+ NAMESPACE_GPU_ALLOC
|
||||||
|
+ ") or on(namespace) ("
|
||||||
+ NAMESPACE_CPU_RAW
|
+ NAMESPACE_CPU_RAW
|
||||||
+ ") + ("
|
+ " * 0)"
|
||||||
+ NAMESPACE_RAM_RAW
|
|
||||||
+ ' / 1e9) + ('
|
|
||||||
+ NAMESPACE_GPU_WEIGHT
|
|
||||||
+ " * 10))"
|
|
||||||
)
|
)
|
||||||
|
NAMESPACE_ACTIVITY_SCORE = (
|
||||||
|
"( "
|
||||||
|
+ NAMESPACE_CPU_RAW
|
||||||
|
+ " ) + ("
|
||||||
|
+ NAMESPACE_RAM_RAW
|
||||||
|
+ " / 1e9) + ("
|
||||||
|
+ NAMESPACE_GPU_WEIGHT
|
||||||
|
+ " * 100)"
|
||||||
|
)
|
||||||
|
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
|
||||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||||
NET_INGRESS_EXPR = (
|
TRAEFIK_NET_INGRESS = (
|
||||||
'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
|
||||||
" or on() vector(0)"
|
" or on() vector(0)"
|
||||||
)
|
)
|
||||||
NET_EGRESS_EXPR = (
|
TRAEFIK_NET_EGRESS = (
|
||||||
|
'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
|
||||||
|
" or on() vector(0)"
|
||||||
|
)
|
||||||
|
NET_TOTAL_EXPR = (
|
||||||
'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
||||||
" or on() vector(0)"
|
" or on() vector(0)"
|
||||||
)
|
)
|
||||||
|
NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS
|
||||||
|
NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS
|
||||||
|
NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - ({TRAEFIK_NET_EGRESS}), 0)"
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Panel factories
|
# Panel factories
|
||||||
@ -438,10 +450,20 @@ def pie_panel(panel_id, title, expr, grid):
|
|||||||
"datasource": PROM_DS,
|
"datasource": PROM_DS,
|
||||||
"gridPos": grid,
|
"gridPos": grid,
|
||||||
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
|
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
|
||||||
"fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"color": {"mode": "palette-classic"},
|
||||||
|
},
|
||||||
|
"overrides": [],
|
||||||
|
},
|
||||||
"options": {
|
"options": {
|
||||||
"legend": {"displayMode": "list", "placement": "right"},
|
"legend": {"displayMode": "list", "placement": "right"},
|
||||||
"pieType": "pie",
|
"pieType": "pie",
|
||||||
|
"displayLabels": ["percent"],
|
||||||
|
"tooltip": {"mode": "single"},
|
||||||
|
"colorScheme": "interpolateSpectral",
|
||||||
|
"colorBy": "value",
|
||||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@ -511,7 +533,6 @@ def build_overview():
|
|||||||
1,
|
1,
|
||||||
link_to("atlas-pods"),
|
link_to("atlas-pods"),
|
||||||
),
|
),
|
||||||
(6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
|
|
||||||
]
|
]
|
||||||
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
|
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
|
||||||
thresholds = None
|
thresholds = None
|
||||||
@ -591,12 +612,31 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
storage_panels = [
|
||||||
|
(23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||||
|
(24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||||
|
(25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
||||||
|
(26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
||||||
|
]
|
||||||
|
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
||||||
|
panels.append(
|
||||||
|
stat_panel(
|
||||||
|
panel_id,
|
||||||
|
title,
|
||||||
|
expr,
|
||||||
|
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
|
||||||
|
unit=unit,
|
||||||
|
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
||||||
|
links=link_to("atlas-storage"),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
11,
|
11,
|
||||||
"Namespace CPU share",
|
"Namespace CPU share",
|
||||||
namespace_cpu_share_expr(),
|
namespace_cpu_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 0, "y": 10},
|
{"h": 9, "w": 8, "x": 0, "y": 16},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -604,7 +644,7 @@ def build_overview():
|
|||||||
12,
|
12,
|
||||||
"Namespace GPU share",
|
"Namespace GPU share",
|
||||||
namespace_gpu_share_expr(),
|
namespace_gpu_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 8, "y": 10},
|
{"h": 9, "w": 8, "x": 8, "y": 16},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -612,7 +652,7 @@ def build_overview():
|
|||||||
13,
|
13,
|
||||||
"Namespace RAM share",
|
"Namespace RAM share",
|
||||||
namespace_ram_share_expr(),
|
namespace_ram_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 16, "y": 10},
|
{"h": 9, "w": 8, "x": 16, "y": 16},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -622,7 +662,7 @@ def build_overview():
|
|||||||
14,
|
14,
|
||||||
"Worker node CPU",
|
"Worker node CPU",
|
||||||
node_cpu_expr(worker_filter),
|
node_cpu_expr(worker_filter),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 19},
|
{"h": 8, "w": 12, "x": 0, "y": 25},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_calcs=["last"],
|
legend_calcs=["last"],
|
||||||
@ -636,7 +676,7 @@ def build_overview():
|
|||||||
15,
|
15,
|
||||||
"Worker node RAM",
|
"Worker node RAM",
|
||||||
node_mem_expr(worker_filter),
|
node_mem_expr(worker_filter),
|
||||||
{"h": 8, "w": 12, "x": 12, "y": 19},
|
{"h": 8, "w": 12, "x": 12, "y": 25},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_calcs=["last"],
|
legend_calcs=["last"],
|
||||||
@ -651,7 +691,7 @@ def build_overview():
|
|||||||
16,
|
16,
|
||||||
"Control plane CPU",
|
"Control plane CPU",
|
||||||
node_cpu_expr(CONTROL_REGEX),
|
node_cpu_expr(CONTROL_REGEX),
|
||||||
{"h": 7, "w": 12, "x": 0, "y": 27},
|
{"h": 7, "w": 12, "x": 0, "y": 33},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_display="table",
|
legend_display="table",
|
||||||
@ -663,7 +703,7 @@ def build_overview():
|
|||||||
17,
|
17,
|
||||||
"Control plane RAM",
|
"Control plane RAM",
|
||||||
node_mem_expr(CONTROL_REGEX),
|
node_mem_expr(CONTROL_REGEX),
|
||||||
{"h": 7, "w": 12, "x": 12, "y": 27},
|
{"h": 7, "w": 12, "x": 12, "y": 33},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_display="table",
|
legend_display="table",
|
||||||
@ -676,9 +716,9 @@ def build_overview():
|
|||||||
18,
|
18,
|
||||||
"Cluster ingress throughput",
|
"Cluster ingress throughput",
|
||||||
NET_INGRESS_EXPR,
|
NET_INGRESS_EXPR,
|
||||||
{"h": 7, "w": 12, "x": 0, "y": 34},
|
{"h": 7, "w": 8, "x": 0, "y": 40},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
legend="Ingress",
|
legend="Ingress (Traefik)",
|
||||||
legend_display="list",
|
legend_display="list",
|
||||||
legend_placement="bottom",
|
legend_placement="bottom",
|
||||||
links=link_to("atlas-network"),
|
links=link_to("atlas-network"),
|
||||||
@ -689,9 +729,22 @@ def build_overview():
|
|||||||
19,
|
19,
|
||||||
"Cluster egress throughput",
|
"Cluster egress throughput",
|
||||||
NET_EGRESS_EXPR,
|
NET_EGRESS_EXPR,
|
||||||
{"h": 7, "w": 12, "x": 12, "y": 34},
|
{"h": 7, "w": 8, "x": 8, "y": 40},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
legend="Egress",
|
legend="Egress (Traefik)",
|
||||||
|
legend_display="list",
|
||||||
|
legend_placement="bottom",
|
||||||
|
links=link_to("atlas-network"),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
panels.append(
|
||||||
|
timeseries_panel(
|
||||||
|
20,
|
||||||
|
"Intra-cluster throughput",
|
||||||
|
NET_INTERNAL_EXPR,
|
||||||
|
{"h": 7, "w": 8, "x": 16, "y": 40},
|
||||||
|
unit="Bps",
|
||||||
|
legend="Internal traffic",
|
||||||
legend_display="list",
|
legend_display="list",
|
||||||
legend_placement="bottom",
|
legend_placement="bottom",
|
||||||
links=link_to("atlas-network"),
|
links=link_to("atlas-network"),
|
||||||
@ -700,10 +753,10 @@ def build_overview():
|
|||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
20,
|
21,
|
||||||
"Root filesystem usage",
|
"Root filesystem usage",
|
||||||
root_usage_expr(),
|
root_usage_expr(),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 41},
|
{"h": 8, "w": 12, "x": 0, "y": 47},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_calcs=["last"],
|
legend_calcs=["last"],
|
||||||
@ -715,11 +768,11 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
{
|
{
|
||||||
"id": 21,
|
"id": 22,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes closest to full root disks",
|
"title": "Nodes closest to full root disks",
|
||||||
"datasource": PROM_DS,
|
"datasource": PROM_DS,
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 47},
|
||||||
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
|
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
@ -744,28 +797,10 @@ def build_overview():
|
|||||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||||
},
|
},
|
||||||
"links": link_to("atlas-storage"),
|
"links": link_to("atlas-storage"),
|
||||||
|
"transformations": [{"id": "labelsToFields", "options": {}}],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
storage_panels = [
|
|
||||||
(21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
|
||||||
(22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
|
||||||
(23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
|
||||||
(24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
|
||||||
]
|
|
||||||
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
|
||||||
panels.append(
|
|
||||||
stat_panel(
|
|
||||||
panel_id,
|
|
||||||
title,
|
|
||||||
expr,
|
|
||||||
{"h": 6, "w": 6, "x": 6 * idx, "y": 49},
|
|
||||||
unit=unit,
|
|
||||||
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
|
||||||
links=link_to("atlas-storage"),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"uid": "atlas-overview",
|
"uid": "atlas-overview",
|
||||||
"title": "Atlas Overview",
|
"title": "Atlas Overview",
|
||||||
@ -1110,12 +1145,15 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
|
stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
|
||||||
)
|
)
|
||||||
|
panels.append(
|
||||||
|
stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps")
|
||||||
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
3,
|
4,
|
||||||
"Top router req/s",
|
"Top router req/s",
|
||||||
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
|
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
|
||||||
{"h": 4, "w": 8, "x": 16, "y": 0},
|
{"h": 4, "w": 8, "x": 0, "y": 4},
|
||||||
unit="req/s",
|
unit="req/s",
|
||||||
legend="{{router}}",
|
legend="{{router}}",
|
||||||
instant=True,
|
instant=True,
|
||||||
@ -1123,10 +1161,10 @@ def build_network_dashboard():
|
|||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
4,
|
5,
|
||||||
"Per-node throughput",
|
"Per-node throughput",
|
||||||
node_net_expr(),
|
node_net_expr(),
|
||||||
{"h": 8, "w": 24, "x": 0, "y": 4},
|
{"h": 8, "w": 24, "x": 0, "y": 8},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_display="table",
|
legend_display="table",
|
||||||
@ -1135,32 +1173,32 @@ def build_network_dashboard():
|
|||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
table_panel(
|
table_panel(
|
||||||
5,
|
6,
|
||||||
"Top namespaces",
|
"Top namespaces",
|
||||||
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
|
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
|
||||||
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
|
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
|
||||||
{"h": 9, "w": 12, "x": 0, "y": 12},
|
{"h": 9, "w": 12, "x": 0, "y": 16},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
table_panel(
|
table_panel(
|
||||||
6,
|
7,
|
||||||
"Top pods",
|
"Top pods",
|
||||||
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
|
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
|
||||||
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
|
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
|
||||||
{"h": 9, "w": 12, "x": 12, "y": 12},
|
{"h": 9, "w": 12, "x": 12, "y": 16},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
7,
|
8,
|
||||||
"Traefik routers (req/s)",
|
"Traefik routers (req/s)",
|
||||||
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
|
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
|
||||||
{"h": 9, "w": 12, "x": 0, "y": 21},
|
{"h": 9, "w": 12, "x": 0, "y": 25},
|
||||||
unit="req/s",
|
unit="req/s",
|
||||||
legend="{{router}}",
|
legend="{{router}}",
|
||||||
legend_display="table",
|
legend_display="table",
|
||||||
@ -1169,10 +1207,10 @@ def build_network_dashboard():
|
|||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
8,
|
9,
|
||||||
"Traefik entrypoints (req/s)",
|
"Traefik entrypoints (req/s)",
|
||||||
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
|
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
|
||||||
{"h": 9, "w": 12, "x": 12, "y": 21},
|
{"h": 9, "w": 12, "x": 12, "y": 25},
|
||||||
unit="req/s",
|
unit="req/s",
|
||||||
legend="{{entrypoint}}",
|
legend="{{entrypoint}}",
|
||||||
legend_display="table",
|
legend_display="table",
|
||||||
|
|||||||
@ -20,7 +20,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -80,7 +80,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -127,7 +127,7 @@
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Top router req/s",
|
"title": "Intra-cluster traffic",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -138,6 +138,66 @@
|
|||||||
"x": 16,
|
"x": 16,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "rgba(115, 115, 115, 1)",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "Bps",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Top router req/s",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 4,
|
||||||
|
"w": 8,
|
||||||
|
"x": 0,
|
||||||
|
"y": 4
|
||||||
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
|
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
|
||||||
@ -187,7 +247,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 5,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Per-node throughput",
|
"title": "Per-node throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -198,7 +258,7 @@
|
|||||||
"h": 8,
|
"h": 8,
|
||||||
"w": 24,
|
"w": 24,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 4
|
"y": 8
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
@ -224,7 +284,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 5,
|
"id": 6,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top namespaces",
|
"title": "Top namespaces",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -235,7 +295,7 @@
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 12
|
"y": 16
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
@ -260,7 +320,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 6,
|
"id": 7,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top pods",
|
"title": "Top pods",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -271,7 +331,7 @@
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 12,
|
"x": 12,
|
||||||
"y": 12
|
"y": 16
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
@ -296,7 +356,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 7,
|
"id": 8,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Traefik routers (req/s)",
|
"title": "Traefik routers (req/s)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -307,7 +367,7 @@
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 21
|
"y": 25
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
@ -333,7 +393,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 8,
|
"id": 9,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Traefik entrypoints (req/s)",
|
"title": "Traefik entrypoints (req/s)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -344,7 +404,7 @@
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 12,
|
"x": 12,
|
||||||
"y": 21
|
"y": 25
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
74
services/monitoring/dcgm-exporter.yaml
Normal file
74
services/monitoring/dcgm-exporter.yaml
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
# services/monitoring/dcgm-exporter.yaml
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: dcgm-exporter
|
||||||
|
namespace: monitoring
|
||||||
|
labels:
|
||||||
|
app: dcgm-exporter
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: dcgm-exporter
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: dcgm-exporter
|
||||||
|
annotations:
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
prometheus.io/port: "9400"
|
||||||
|
spec:
|
||||||
|
serviceAccountName: default
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: kubernetes.io/hostname
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- titan-20
|
||||||
|
- titan-21
|
||||||
|
- titan-22
|
||||||
|
- titan-24
|
||||||
|
tolerations:
|
||||||
|
- operator: Exists
|
||||||
|
containers:
|
||||||
|
- name: dcgm-exporter
|
||||||
|
image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-1
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
containerPort: 9400
|
||||||
|
env:
|
||||||
|
- name: DCGM_EXPORTER_KUBERNETES
|
||||||
|
value: "true"
|
||||||
|
securityContext:
|
||||||
|
privileged: true
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 50m
|
||||||
|
memory: 64Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: pod-resources
|
||||||
|
mountPath: /var/lib/kubelet/pod-resources
|
||||||
|
volumes:
|
||||||
|
- name: pod-resources
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/kubelet/pod-resources
|
||||||
|
type: Directory
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: dcgm-exporter
|
||||||
|
namespace: monitoring
|
||||||
|
labels:
|
||||||
|
app: dcgm-exporter
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app: dcgm-exporter
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
port: 9400
|
||||||
|
targetPort: metrics
|
||||||
@ -29,7 +29,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -89,7 +89,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -136,7 +136,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Top router req/s",
|
"title": "Intra-cluster traffic",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -147,6 +147,66 @@ data:
|
|||||||
"x": 16,
|
"x": 16,
|
||||||
"y": 0
|
"y": 0
|
||||||
},
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "rgba(115, 115, 115, 1)",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "Bps",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"textMode": "value"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Top router req/s",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 4,
|
||||||
|
"w": 8,
|
||||||
|
"x": 0,
|
||||||
|
"y": 4
|
||||||
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
|
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
|
||||||
@ -196,7 +256,7 @@ data:
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 5,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Per-node throughput",
|
"title": "Per-node throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -207,7 +267,7 @@ data:
|
|||||||
"h": 8,
|
"h": 8,
|
||||||
"w": 24,
|
"w": 24,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 4
|
"y": 8
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
@ -233,7 +293,7 @@ data:
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 5,
|
"id": 6,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top namespaces",
|
"title": "Top namespaces",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -244,7 +304,7 @@ data:
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 12
|
"y": 16
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
@ -269,7 +329,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 6,
|
"id": 7,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top pods",
|
"title": "Top pods",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -280,7 +340,7 @@ data:
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 12,
|
"x": 12,
|
||||||
"y": 12
|
"y": 16
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
@ -305,7 +365,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 7,
|
"id": 8,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Traefik routers (req/s)",
|
"title": "Traefik routers (req/s)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -316,7 +376,7 @@ data:
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 21
|
"y": 25
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
@ -342,7 +402,7 @@ data:
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 8,
|
"id": 9,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Traefik entrypoints (req/s)",
|
"title": "Traefik entrypoints (req/s)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -353,7 +413,7 @@ data:
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 12,
|
"x": 12,
|
||||||
"y": 21
|
"y": 25
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -10,5 +10,6 @@ resources:
|
|||||||
- grafana-dashboard-nodes.yaml
|
- grafana-dashboard-nodes.yaml
|
||||||
- grafana-dashboard-storage.yaml
|
- grafana-dashboard-storage.yaml
|
||||||
- grafana-dashboard-network.yaml
|
- grafana-dashboard-network.yaml
|
||||||
|
- dcgm-exporter.yaml
|
||||||
- grafana-folders.yaml
|
- grafana-folders.yaml
|
||||||
- helmrelease.yaml
|
- helmrelease.yaml
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user