monitoring: refresh overview dashboards
This commit is contained in:
parent
8e6c0a3cfe
commit
ff056551c7
@ -165,22 +165,22 @@ def node_io_expr(scope=""):
|
||||
return scoped_node_expr(base, scope)
|
||||
|
||||
|
||||
def namespace_cpu_share_expr():
|
||||
selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||
total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
|
||||
def namespace_share_expr(resource_expr):
|
||||
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
|
||||
total = f"clamp_min(sum( {resource_expr} ), 1)"
|
||||
return f"100 * ( {selected} ) / {total}"
|
||||
|
||||
|
||||
def namespace_cpu_share_expr():
|
||||
return namespace_share_expr(NAMESPACE_CPU_RAW)
|
||||
|
||||
|
||||
def namespace_ram_share_expr():
|
||||
selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||
total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
|
||||
return f"100 * ( {selected} ) / {total}"
|
||||
return namespace_share_expr(NAMESPACE_RAM_RAW)
|
||||
|
||||
|
||||
def namespace_gpu_share_expr():
|
||||
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
|
||||
return f"100 * ( {selected} ) / {total}"
|
||||
return namespace_share_expr(NAMESPACE_GPU_RAW)
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||
@ -228,35 +228,47 @@ NAMESPACE_GPU_ALLOC = (
|
||||
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
|
||||
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
|
||||
)
|
||||
NAMESPACE_GPU_USAGE = (
|
||||
'sum(rate(container_accelerator_duty_cycle{namespace!="",accelerator="nvidia.com/gpu"}[5m])) by (namespace)'
|
||||
)
|
||||
NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
|
||||
NAMESPACE_GPU_RAW = (
|
||||
"("
|
||||
+ NAMESPACE_GPU_USAGE
|
||||
+ ") or on(namespace) ("
|
||||
+ NAMESPACE_GPU_ALLOC
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC
|
||||
NAMESPACE_COMBINED_FILTER = (
|
||||
'topk(10, ('
|
||||
NAMESPACE_GPU_WEIGHT = (
|
||||
"("
|
||||
+ NAMESPACE_GPU_ALLOC
|
||||
+ ") or on(namespace) ("
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ ") + ("
|
||||
+ NAMESPACE_RAM_RAW
|
||||
+ ' / 1e9) + ('
|
||||
+ NAMESPACE_GPU_WEIGHT
|
||||
+ " * 10))"
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_ACTIVITY_SCORE = (
|
||||
"( "
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " ) + ("
|
||||
+ NAMESPACE_RAM_RAW
|
||||
+ " / 1e9) + ("
|
||||
+ NAMESPACE_GPU_WEIGHT
|
||||
+ " * 100)"
|
||||
)
|
||||
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
|
||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||
NET_INGRESS_EXPR = (
|
||||
'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
||||
TRAEFIK_NET_INGRESS = (
|
||||
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
|
||||
" or on() vector(0)"
|
||||
)
|
||||
NET_EGRESS_EXPR = (
|
||||
TRAEFIK_NET_EGRESS = (
|
||||
'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
|
||||
" or on() vector(0)"
|
||||
)
|
||||
NET_TOTAL_EXPR = (
|
||||
'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
||||
" or on() vector(0)"
|
||||
)
|
||||
NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS
|
||||
NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS
|
||||
NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - ({TRAEFIK_NET_EGRESS}), 0)"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Panel factories
|
||||
@ -438,10 +450,20 @@ def pie_panel(panel_id, title, expr, grid):
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": grid,
|
||||
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
|
||||
"fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"color": {"mode": "palette-classic"},
|
||||
},
|
||||
"overrides": [],
|
||||
},
|
||||
"options": {
|
||||
"legend": {"displayMode": "list", "placement": "right"},
|
||||
"pieType": "pie",
|
||||
"displayLabels": ["percent"],
|
||||
"tooltip": {"mode": "single"},
|
||||
"colorScheme": "interpolateSpectral",
|
||||
"colorBy": "value",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
},
|
||||
}
|
||||
@ -511,7 +533,6 @@ def build_overview():
|
||||
1,
|
||||
link_to("atlas-pods"),
|
||||
),
|
||||
(6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
|
||||
]
|
||||
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
|
||||
thresholds = None
|
||||
@ -591,12 +612,31 @@ def build_overview():
|
||||
)
|
||||
)
|
||||
|
||||
storage_panels = [
|
||||
(23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||
(24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||
(25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
||||
(26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
||||
]
|
||||
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
||||
panels.append(
|
||||
stat_panel(
|
||||
panel_id,
|
||||
title,
|
||||
expr,
|
||||
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
|
||||
unit=unit,
|
||||
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
||||
links=link_to("atlas-storage"),
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
pie_panel(
|
||||
11,
|
||||
"Namespace CPU share",
|
||||
namespace_cpu_share_expr(),
|
||||
{"h": 9, "w": 8, "x": 0, "y": 10},
|
||||
{"h": 9, "w": 8, "x": 0, "y": 16},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -604,7 +644,7 @@ def build_overview():
|
||||
12,
|
||||
"Namespace GPU share",
|
||||
namespace_gpu_share_expr(),
|
||||
{"h": 9, "w": 8, "x": 8, "y": 10},
|
||||
{"h": 9, "w": 8, "x": 8, "y": 16},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -612,7 +652,7 @@ def build_overview():
|
||||
13,
|
||||
"Namespace RAM share",
|
||||
namespace_ram_share_expr(),
|
||||
{"h": 9, "w": 8, "x": 16, "y": 10},
|
||||
{"h": 9, "w": 8, "x": 16, "y": 16},
|
||||
)
|
||||
)
|
||||
|
||||
@ -622,7 +662,7 @@ def build_overview():
|
||||
14,
|
||||
"Worker node CPU",
|
||||
node_cpu_expr(worker_filter),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 19},
|
||||
{"h": 8, "w": 12, "x": 0, "y": 25},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
@ -636,7 +676,7 @@ def build_overview():
|
||||
15,
|
||||
"Worker node RAM",
|
||||
node_mem_expr(worker_filter),
|
||||
{"h": 8, "w": 12, "x": 12, "y": 19},
|
||||
{"h": 8, "w": 12, "x": 12, "y": 25},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
@ -651,7 +691,7 @@ def build_overview():
|
||||
16,
|
||||
"Control plane CPU",
|
||||
node_cpu_expr(CONTROL_REGEX),
|
||||
{"h": 7, "w": 12, "x": 0, "y": 27},
|
||||
{"h": 7, "w": 12, "x": 0, "y": 33},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_display="table",
|
||||
@ -663,7 +703,7 @@ def build_overview():
|
||||
17,
|
||||
"Control plane RAM",
|
||||
node_mem_expr(CONTROL_REGEX),
|
||||
{"h": 7, "w": 12, "x": 12, "y": 27},
|
||||
{"h": 7, "w": 12, "x": 12, "y": 33},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_display="table",
|
||||
@ -676,9 +716,9 @@ def build_overview():
|
||||
18,
|
||||
"Cluster ingress throughput",
|
||||
NET_INGRESS_EXPR,
|
||||
{"h": 7, "w": 12, "x": 0, "y": 34},
|
||||
{"h": 7, "w": 8, "x": 0, "y": 40},
|
||||
unit="Bps",
|
||||
legend="Ingress",
|
||||
legend="Ingress (Traefik)",
|
||||
legend_display="list",
|
||||
legend_placement="bottom",
|
||||
links=link_to("atlas-network"),
|
||||
@ -689,9 +729,22 @@ def build_overview():
|
||||
19,
|
||||
"Cluster egress throughput",
|
||||
NET_EGRESS_EXPR,
|
||||
{"h": 7, "w": 12, "x": 12, "y": 34},
|
||||
{"h": 7, "w": 8, "x": 8, "y": 40},
|
||||
unit="Bps",
|
||||
legend="Egress",
|
||||
legend="Egress (Traefik)",
|
||||
legend_display="list",
|
||||
legend_placement="bottom",
|
||||
links=link_to("atlas-network"),
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
20,
|
||||
"Intra-cluster throughput",
|
||||
NET_INTERNAL_EXPR,
|
||||
{"h": 7, "w": 8, "x": 16, "y": 40},
|
||||
unit="Bps",
|
||||
legend="Internal traffic",
|
||||
legend_display="list",
|
||||
legend_placement="bottom",
|
||||
links=link_to("atlas-network"),
|
||||
@ -700,10 +753,10 @@ def build_overview():
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
20,
|
||||
21,
|
||||
"Root filesystem usage",
|
||||
root_usage_expr(),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 41},
|
||||
{"h": 8, "w": 12, "x": 0, "y": 47},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
@ -715,11 +768,11 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 21,
|
||||
"id": 22,
|
||||
"type": "bargauge",
|
||||
"title": "Nodes closest to full root disks",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 47},
|
||||
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
@ -744,28 +797,10 @@ def build_overview():
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
},
|
||||
"links": link_to("atlas-storage"),
|
||||
"transformations": [{"id": "labelsToFields", "options": {}}],
|
||||
}
|
||||
)
|
||||
|
||||
storage_panels = [
|
||||
(21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||
(22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||
(23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
||||
(24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
||||
]
|
||||
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
||||
panels.append(
|
||||
stat_panel(
|
||||
panel_id,
|
||||
title,
|
||||
expr,
|
||||
{"h": 6, "w": 6, "x": 6 * idx, "y": 49},
|
||||
unit=unit,
|
||||
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
||||
links=link_to("atlas-storage"),
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"uid": "atlas-overview",
|
||||
"title": "Atlas Overview",
|
||||
@ -1110,12 +1145,15 @@ def build_network_dashboard():
|
||||
panels.append(
|
||||
stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps")
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
3,
|
||||
4,
|
||||
"Top router req/s",
|
||||
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
|
||||
{"h": 4, "w": 8, "x": 16, "y": 0},
|
||||
{"h": 4, "w": 8, "x": 0, "y": 4},
|
||||
unit="req/s",
|
||||
legend="{{router}}",
|
||||
instant=True,
|
||||
@ -1123,10 +1161,10 @@ def build_network_dashboard():
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
4,
|
||||
5,
|
||||
"Per-node throughput",
|
||||
node_net_expr(),
|
||||
{"h": 8, "w": 24, "x": 0, "y": 4},
|
||||
{"h": 8, "w": 24, "x": 0, "y": 8},
|
||||
unit="Bps",
|
||||
legend="{{node}}",
|
||||
legend_display="table",
|
||||
@ -1135,32 +1173,32 @@ def build_network_dashboard():
|
||||
)
|
||||
panels.append(
|
||||
table_panel(
|
||||
5,
|
||||
6,
|
||||
"Top namespaces",
|
||||
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
|
||||
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
|
||||
{"h": 9, "w": 12, "x": 0, "y": 12},
|
||||
{"h": 9, "w": 12, "x": 0, "y": 16},
|
||||
unit="Bps",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
table_panel(
|
||||
6,
|
||||
7,
|
||||
"Top pods",
|
||||
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
|
||||
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
|
||||
{"h": 9, "w": 12, "x": 12, "y": 12},
|
||||
{"h": 9, "w": 12, "x": 12, "y": 16},
|
||||
unit="Bps",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
7,
|
||||
8,
|
||||
"Traefik routers (req/s)",
|
||||
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
|
||||
{"h": 9, "w": 12, "x": 0, "y": 21},
|
||||
{"h": 9, "w": 12, "x": 0, "y": 25},
|
||||
unit="req/s",
|
||||
legend="{{router}}",
|
||||
legend_display="table",
|
||||
@ -1169,10 +1207,10 @@ def build_network_dashboard():
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
8,
|
||||
9,
|
||||
"Traefik entrypoints (req/s)",
|
||||
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
|
||||
{"h": 9, "w": 12, "x": 12, "y": 21},
|
||||
{"h": 9, "w": 12, "x": 12, "y": 25},
|
||||
unit="req/s",
|
||||
legend="{{entrypoint}}",
|
||||
legend_display="table",
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -80,7 +80,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -127,7 +127,7 @@
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Top router req/s",
|
||||
"title": "Intra-cluster traffic",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -138,6 +138,66 @@
|
||||
"x": 16,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "Bps",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Top router req/s",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
|
||||
@ -187,7 +247,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Per-node throughput",
|
||||
"datasource": {
|
||||
@ -198,7 +258,7 @@
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
"y": 8
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -224,7 +284,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"id": 6,
|
||||
"type": "table",
|
||||
"title": "Top namespaces",
|
||||
"datasource": {
|
||||
@ -235,7 +295,7 @@
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -260,7 +320,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"id": 7,
|
||||
"type": "table",
|
||||
"title": "Top pods",
|
||||
"datasource": {
|
||||
@ -271,7 +331,7 @@
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 12
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -296,7 +356,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"id": 8,
|
||||
"type": "timeseries",
|
||||
"title": "Traefik routers (req/s)",
|
||||
"datasource": {
|
||||
@ -307,7 +367,7 @@
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
"y": 25
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -333,7 +393,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"id": 9,
|
||||
"type": "timeseries",
|
||||
"title": "Traefik entrypoints (req/s)",
|
||||
"datasource": {
|
||||
@ -344,7 +404,7 @@
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
"y": 25
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
74
services/monitoring/dcgm-exporter.yaml
Normal file
74
services/monitoring/dcgm-exporter.yaml
Normal file
@ -0,0 +1,74 @@
|
||||
# services/monitoring/dcgm-exporter.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: dcgm-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: dcgm-exporter
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: dcgm-exporter
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: dcgm-exporter
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9400"
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: In
|
||||
values:
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
containers:
|
||||
- name: dcgm-exporter
|
||||
image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-1
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9400
|
||||
env:
|
||||
- name: DCGM_EXPORTER_KUBERNETES
|
||||
value: "true"
|
||||
securityContext:
|
||||
privileged: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 64Mi
|
||||
volumeMounts:
|
||||
- name: pod-resources
|
||||
mountPath: /var/lib/kubelet/pod-resources
|
||||
volumes:
|
||||
- name: pod-resources
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/pod-resources
|
||||
type: Directory
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: dcgm-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: dcgm-exporter
|
||||
spec:
|
||||
selector:
|
||||
app: dcgm-exporter
|
||||
ports:
|
||||
- name: metrics
|
||||
port: 9400
|
||||
targetPort: metrics
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -89,7 +89,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -136,7 +136,7 @@ data:
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Top router req/s",
|
||||
"title": "Intra-cluster traffic",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -147,6 +147,66 @@ data:
|
||||
"x": 16,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "Bps",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Top router req/s",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
|
||||
@ -196,7 +256,7 @@ data:
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Per-node throughput",
|
||||
"datasource": {
|
||||
@ -207,7 +267,7 @@ data:
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
"y": 8
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -233,7 +293,7 @@ data:
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"id": 6,
|
||||
"type": "table",
|
||||
"title": "Top namespaces",
|
||||
"datasource": {
|
||||
@ -244,7 +304,7 @@ data:
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -269,7 +329,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"id": 7,
|
||||
"type": "table",
|
||||
"title": "Top pods",
|
||||
"datasource": {
|
||||
@ -280,7 +340,7 @@ data:
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 12
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -305,7 +365,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"id": 8,
|
||||
"type": "timeseries",
|
||||
"title": "Traefik routers (req/s)",
|
||||
"datasource": {
|
||||
@ -316,7 +376,7 @@ data:
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
"y": 25
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -342,7 +402,7 @@ data:
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"id": 9,
|
||||
"type": "timeseries",
|
||||
"title": "Traefik entrypoints (req/s)",
|
||||
"datasource": {
|
||||
@ -353,7 +413,7 @@ data:
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
"y": 25
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -10,5 +10,6 @@ resources:
|
||||
- grafana-dashboard-nodes.yaml
|
||||
- grafana-dashboard-storage.yaml
|
||||
- grafana-dashboard-network.yaml
|
||||
- dcgm-exporter.yaml
|
||||
- grafana-folders.yaml
|
||||
- helmrelease.yaml
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user