monitoring: polish dashboards and folders
This commit is contained in:
parent
eed67b3db0
commit
a3dc9391ee
@ -32,7 +32,7 @@ data:
|
|||||||
)
|
)
|
||||||
|
|
||||||
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
|
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
|
||||||
PUBLIC_FOLDER = "atlas-overview"
|
PUBLIC_FOLDER = "overview"
|
||||||
PRIVATE_FOLDER = "atlas-internal"
|
PRIVATE_FOLDER = "atlas-internal"
|
||||||
|
|
||||||
PERCENT_THRESHOLDS = {
|
PERCENT_THRESHOLDS = {
|
||||||
@ -231,10 +231,13 @@ NAMESPACE_GPU_ALLOC = (
|
|||||||
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
|
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
|
||||||
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
|
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
|
||||||
)
|
)
|
||||||
NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
|
NAMESPACE_GPU_USAGE_SHARE = (
|
||||||
|
'avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)'
|
||||||
|
)
|
||||||
|
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
|
||||||
NAMESPACE_GPU_RAW = (
|
NAMESPACE_GPU_RAW = (
|
||||||
"("
|
"("
|
||||||
+ NAMESPACE_GPU_USAGE
|
+ NAMESPACE_GPU_USAGE_SHARE
|
||||||
+ ") or on(namespace) ("
|
+ ") or on(namespace) ("
|
||||||
+ NAMESPACE_CPU_RAW
|
+ NAMESPACE_CPU_RAW
|
||||||
+ " * 0)"
|
+ " * 0)"
|
||||||
@ -519,7 +522,7 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
|
|||||||
"orientation": "horizontal",
|
"orientation": "horizontal",
|
||||||
"reduceOptions": {
|
"reduceOptions": {
|
||||||
"calcs": ["lastNotNull"],
|
"calcs": ["lastNotNull"],
|
||||||
"fields": "/.*/",
|
"fields": "Value",
|
||||||
"values": False,
|
"values": False,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -555,7 +558,7 @@ def build_overview():
|
|||||||
row1_stats = [
|
row1_stats = [
|
||||||
(
|
(
|
||||||
1,
|
1,
|
||||||
"Workers ready",
|
"Workers Ready",
|
||||||
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
|
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
|
||||||
WORKER_SUFFIX,
|
WORKER_SUFFIX,
|
||||||
WORKER_TOTAL,
|
WORKER_TOTAL,
|
||||||
@ -563,7 +566,7 @@ def build_overview():
|
|||||||
),
|
),
|
||||||
(
|
(
|
||||||
2,
|
2,
|
||||||
"Control plane ready",
|
"Control Plane Ready",
|
||||||
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
|
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
|
||||||
CONTROL_SUFFIX,
|
CONTROL_SUFFIX,
|
||||||
CONTROL_TOTAL,
|
CONTROL_TOTAL,
|
||||||
@ -571,7 +574,7 @@ def build_overview():
|
|||||||
),
|
),
|
||||||
(
|
(
|
||||||
3,
|
3,
|
||||||
"Control plane workloads",
|
"Control Plane Workloads",
|
||||||
CONTROL_WORKLOADS_EXPR,
|
CONTROL_WORKLOADS_EXPR,
|
||||||
None,
|
None,
|
||||||
4,
|
4,
|
||||||
@ -579,7 +582,7 @@ def build_overview():
|
|||||||
),
|
),
|
||||||
(
|
(
|
||||||
4,
|
4,
|
||||||
"Problem pods",
|
"Problem Pods",
|
||||||
PROBLEM_PODS_EXPR,
|
PROBLEM_PODS_EXPR,
|
||||||
None,
|
None,
|
||||||
1,
|
1,
|
||||||
@ -587,7 +590,7 @@ def build_overview():
|
|||||||
),
|
),
|
||||||
(
|
(
|
||||||
5,
|
5,
|
||||||
"Stuck terminating",
|
"Stuck Terminating",
|
||||||
STUCK_TERMINATING_EXPR,
|
STUCK_TERMINATING_EXPR,
|
||||||
None,
|
None,
|
||||||
1,
|
1,
|
||||||
@ -644,7 +647,7 @@ def build_overview():
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
width, x = gauge_grid(idx)
|
width, x = gauge_grid(idx)
|
||||||
if panel_id == 3:
|
if panel_id in (3, 4, 5):
|
||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
panel_id,
|
panel_id,
|
||||||
@ -654,6 +657,7 @@ def build_overview():
|
|||||||
thresholds=thresholds,
|
thresholds=thresholds,
|
||||||
legend=None,
|
legend=None,
|
||||||
links=links,
|
links=links,
|
||||||
|
text_mode="value",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@ -693,10 +697,10 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
|
|
||||||
storage_panels = [
|
storage_panels = [
|
||||||
(23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||||
(24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||||
(25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
(25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
||||||
(26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
(26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
||||||
]
|
]
|
||||||
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -714,7 +718,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
11,
|
11,
|
||||||
"Namespace CPU share",
|
"Namespace CPU Share",
|
||||||
namespace_cpu_share_expr(),
|
namespace_cpu_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 0, "y": 16},
|
{"h": 9, "w": 8, "x": 0, "y": 16},
|
||||||
)
|
)
|
||||||
@ -722,7 +726,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
12,
|
12,
|
||||||
"Namespace GPU share",
|
"Namespace GPU Share",
|
||||||
namespace_gpu_share_expr(),
|
namespace_gpu_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 8, "y": 16},
|
{"h": 9, "w": 8, "x": 8, "y": 16},
|
||||||
)
|
)
|
||||||
@ -730,7 +734,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
13,
|
13,
|
||||||
"Namespace RAM share",
|
"Namespace RAM Share",
|
||||||
namespace_ram_share_expr(),
|
namespace_ram_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 16, "y": 16},
|
{"h": 9, "w": 8, "x": 16, "y": 16},
|
||||||
)
|
)
|
||||||
@ -740,7 +744,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
14,
|
14,
|
||||||
"Worker node CPU",
|
"Worker Node CPU",
|
||||||
node_cpu_expr(worker_filter),
|
node_cpu_expr(worker_filter),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 32},
|
{"h": 8, "w": 12, "x": 0, "y": 32},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -754,7 +758,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
15,
|
15,
|
||||||
"Worker node RAM",
|
"Worker Node RAM",
|
||||||
node_mem_expr(worker_filter),
|
node_mem_expr(worker_filter),
|
||||||
{"h": 8, "w": 12, "x": 12, "y": 32},
|
{"h": 8, "w": 12, "x": 12, "y": 32},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -794,7 +798,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
18,
|
18,
|
||||||
"Cluster ingress throughput",
|
"Cluster Ingress Throughput",
|
||||||
NET_INGRESS_EXPR,
|
NET_INGRESS_EXPR,
|
||||||
{"h": 7, "w": 8, "x": 0, "y": 25},
|
{"h": 7, "w": 8, "x": 0, "y": 25},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
@ -807,7 +811,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
19,
|
19,
|
||||||
"Cluster egress throughput",
|
"Cluster Egress Throughput",
|
||||||
NET_EGRESS_EXPR,
|
NET_EGRESS_EXPR,
|
||||||
{"h": 7, "w": 8, "x": 8, "y": 25},
|
{"h": 7, "w": 8, "x": 8, "y": 25},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
@ -820,7 +824,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
20,
|
20,
|
||||||
"Intra-cluster throughput",
|
"Intra-Cluster Throughput",
|
||||||
NET_INTERNAL_EXPR,
|
NET_INTERNAL_EXPR,
|
||||||
{"h": 7, "w": 8, "x": 16, "y": 25},
|
{"h": 7, "w": 8, "x": 16, "y": 25},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
@ -834,7 +838,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
21,
|
21,
|
||||||
"Root filesystem usage",
|
"Root Filesystem Usage",
|
||||||
root_usage_expr(),
|
root_usage_expr(),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 47},
|
{"h": 8, "w": 12, "x": 0, "y": 47},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -849,7 +853,7 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
bargauge_panel(
|
bargauge_panel(
|
||||||
22,
|
22,
|
||||||
"Nodes closest to full root disks",
|
"Nodes Closest to Full Root Disks",
|
||||||
f"topk(8, {root_usage_expr()})",
|
f"topk(8, {root_usage_expr()})",
|
||||||
{"h": 8, "w": 12, "x": 12, "y": 47},
|
{"h": 8, "w": 12, "x": 12, "y": 47},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -868,7 +872,8 @@ def build_overview():
|
|||||||
"style": "dark",
|
"style": "dark",
|
||||||
"tags": ["atlas", "overview"],
|
"tags": ["atlas", "overview"],
|
||||||
"templating": {"list": []},
|
"templating": {"list": []},
|
||||||
"time": {"from": "now-12h", "to": "now"},
|
"time": {"from": "now-1h", "to": "now"},
|
||||||
|
"refresh": "1m",
|
||||||
"links": [
|
"links": [
|
||||||
{"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
|
{"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
|
||||||
{"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
|
{"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
|
||||||
@ -884,7 +889,7 @@ def build_pods_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
1,
|
1,
|
||||||
"Problem pods",
|
"Problem Pods",
|
||||||
PROBLEM_PODS_EXPR,
|
PROBLEM_PODS_EXPR,
|
||||||
{"h": 4, "w": 6, "x": 0, "y": 0},
|
{"h": 4, "w": 6, "x": 0, "y": 0},
|
||||||
thresholds={
|
thresholds={
|
||||||
@ -914,7 +919,7 @@ def build_pods_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
3,
|
3,
|
||||||
"Stuck terminating (>10m)",
|
"Stuck Terminating (>10m)",
|
||||||
STUCK_TERMINATING_EXPR,
|
STUCK_TERMINATING_EXPR,
|
||||||
{"h": 4, "w": 6, "x": 12, "y": 0},
|
{"h": 4, "w": 6, "x": 12, "y": 0},
|
||||||
thresholds={
|
thresholds={
|
||||||
@ -929,7 +934,7 @@ def build_pods_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
4,
|
4,
|
||||||
"Control plane workloads",
|
"Control Plane Workloads",
|
||||||
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
|
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
|
||||||
{"h": 4, "w": 6, "x": 18, "y": 0},
|
{"h": 4, "w": 6, "x": 18, "y": 0},
|
||||||
thresholds={
|
thresholds={
|
||||||
@ -945,7 +950,7 @@ def build_pods_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
table_panel(
|
table_panel(
|
||||||
5,
|
5,
|
||||||
"Pods not running",
|
"Pods Not Running",
|
||||||
PROBLEM_TABLE_EXPR,
|
PROBLEM_TABLE_EXPR,
|
||||||
{"h": 10, "w": 24, "x": 0, "y": 4},
|
{"h": 10, "w": 24, "x": 0, "y": 4},
|
||||||
unit="s",
|
unit="s",
|
||||||
@ -994,7 +999,7 @@ def build_nodes_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
1,
|
1,
|
||||||
"Worker nodes ready",
|
"Worker Nodes Ready",
|
||||||
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
|
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
|
||||||
{"h": 4, "w": 8, "x": 0, "y": 0},
|
{"h": 4, "w": 8, "x": 0, "y": 0},
|
||||||
value_suffix=WORKER_SUFFIX,
|
value_suffix=WORKER_SUFFIX,
|
||||||
@ -1003,7 +1008,7 @@ def build_nodes_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
2,
|
2,
|
||||||
"Control plane ready",
|
"Control Plane Ready",
|
||||||
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
|
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
|
||||||
{"h": 4, "w": 8, "x": 8, "y": 0},
|
{"h": 4, "w": 8, "x": 8, "y": 0},
|
||||||
value_suffix=CONTROL_SUFFIX,
|
value_suffix=CONTROL_SUFFIX,
|
||||||
@ -1012,7 +1017,7 @@ def build_nodes_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
3,
|
3,
|
||||||
"Control plane workloads",
|
"Control Plane Workloads",
|
||||||
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
|
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
|
||||||
{"h": 4, "w": 8, "x": 16, "y": 0},
|
{"h": 4, "w": 8, "x": 16, "y": 0},
|
||||||
)
|
)
|
||||||
@ -1046,7 +1051,7 @@ def build_nodes_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
6,
|
6,
|
||||||
"Control plane (incl. titan-db) CPU",
|
"Control Plane (incl. titan-db) CPU",
|
||||||
node_cpu_expr(CONTROL_ALL_REGEX),
|
node_cpu_expr(CONTROL_ALL_REGEX),
|
||||||
{"h": 9, "w": 12, "x": 0, "y": 22},
|
{"h": 9, "w": 12, "x": 0, "y": 22},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1058,7 +1063,7 @@ def build_nodes_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
7,
|
7,
|
||||||
"Control plane (incl. titan-db) RAM",
|
"Control Plane (incl. titan-db) RAM",
|
||||||
node_mem_expr(CONTROL_ALL_REGEX),
|
node_mem_expr(CONTROL_ALL_REGEX),
|
||||||
{"h": 9, "w": 12, "x": 12, "y": 22},
|
{"h": 9, "w": 12, "x": 12, "y": 22},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1070,7 +1075,7 @@ def build_nodes_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
8,
|
8,
|
||||||
"Root filesystem usage",
|
"Root Filesystem Usage",
|
||||||
root_usage_expr(),
|
root_usage_expr(),
|
||||||
{"h": 9, "w": 24, "x": 0, "y": 31},
|
{"h": 9, "w": 24, "x": 0, "y": 31},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1099,7 +1104,7 @@ def build_storage_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
1,
|
1,
|
||||||
"Astreae usage",
|
"Astreae Usage",
|
||||||
astreae_usage_expr("/mnt/astreae"),
|
astreae_usage_expr("/mnt/astreae"),
|
||||||
{"h": 5, "w": 6, "x": 0, "y": 0},
|
{"h": 5, "w": 6, "x": 0, "y": 0},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1109,7 +1114,7 @@ def build_storage_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
2,
|
2,
|
||||||
"Asteria usage",
|
"Asteria Usage",
|
||||||
astreae_usage_expr("/mnt/asteria"),
|
astreae_usage_expr("/mnt/asteria"),
|
||||||
{"h": 5, "w": 6, "x": 6, "y": 0},
|
{"h": 5, "w": 6, "x": 6, "y": 0},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1119,7 +1124,7 @@ def build_storage_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
3,
|
3,
|
||||||
"Astreae free",
|
"Astreae Free",
|
||||||
astreae_free_expr("/mnt/astreae"),
|
astreae_free_expr("/mnt/astreae"),
|
||||||
{"h": 5, "w": 6, "x": 12, "y": 0},
|
{"h": 5, "w": 6, "x": 12, "y": 0},
|
||||||
unit="decbytes",
|
unit="decbytes",
|
||||||
@ -1128,7 +1133,7 @@ def build_storage_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
4,
|
4,
|
||||||
"Asteria free",
|
"Asteria Free",
|
||||||
astreae_free_expr("/mnt/asteria"),
|
astreae_free_expr("/mnt/asteria"),
|
||||||
{"h": 5, "w": 6, "x": 18, "y": 0},
|
{"h": 5, "w": 6, "x": 18, "y": 0},
|
||||||
unit="decbytes",
|
unit="decbytes",
|
||||||
@ -1137,7 +1142,7 @@ def build_storage_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
5,
|
5,
|
||||||
"Astreae per-node usage",
|
"Astreae Per-Node Usage",
|
||||||
filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
|
filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
|
||||||
{"h": 9, "w": 12, "x": 0, "y": 5},
|
{"h": 9, "w": 12, "x": 0, "y": 5},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1150,7 +1155,7 @@ def build_storage_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
6,
|
6,
|
||||||
"Asteria per-node usage",
|
"Asteria Per-Node Usage",
|
||||||
filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
|
filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
|
||||||
{"h": 9, "w": 12, "x": 12, "y": 5},
|
{"h": 9, "w": 12, "x": 12, "y": 5},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1163,7 +1168,7 @@ def build_storage_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
7,
|
7,
|
||||||
"Astreae usage history",
|
"Astreae Usage History",
|
||||||
astreae_usage_expr("/mnt/astreae"),
|
astreae_usage_expr("/mnt/astreae"),
|
||||||
{"h": 9, "w": 12, "x": 0, "y": 14},
|
{"h": 9, "w": 12, "x": 0, "y": 14},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1173,7 +1178,7 @@ def build_storage_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
8,
|
8,
|
||||||
"Asteria usage history",
|
"Asteria Usage History",
|
||||||
astreae_usage_expr("/mnt/asteria"),
|
astreae_usage_expr("/mnt/asteria"),
|
||||||
{"h": 9, "w": 12, "x": 12, "y": 14},
|
{"h": 9, "w": 12, "x": 12, "y": 14},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1199,7 +1204,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
1,
|
1,
|
||||||
"Ingress traffic",
|
"Ingress Traffic",
|
||||||
NET_INGRESS_EXPR,
|
NET_INGRESS_EXPR,
|
||||||
{"h": 4, "w": 8, "x": 0, "y": 0},
|
{"h": 4, "w": 8, "x": 0, "y": 0},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
@ -1208,7 +1213,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
2,
|
2,
|
||||||
"Egress traffic",
|
"Egress Traffic",
|
||||||
NET_EGRESS_EXPR,
|
NET_EGRESS_EXPR,
|
||||||
{"h": 4, "w": 8, "x": 8, "y": 0},
|
{"h": 4, "w": 8, "x": 8, "y": 0},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
@ -1217,7 +1222,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
3,
|
3,
|
||||||
"Intra-cluster traffic",
|
"Intra-Cluster Traffic",
|
||||||
NET_INTERNAL_EXPR,
|
NET_INTERNAL_EXPR,
|
||||||
{"h": 4, "w": 8, "x": 16, "y": 0},
|
{"h": 4, "w": 8, "x": 16, "y": 0},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
@ -1226,7 +1231,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
stat_panel(
|
stat_panel(
|
||||||
4,
|
4,
|
||||||
"Top router req/s",
|
"Top Router req/s",
|
||||||
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
|
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
|
||||||
{"h": 4, "w": 8, "x": 0, "y": 4},
|
{"h": 4, "w": 8, "x": 0, "y": 4},
|
||||||
unit="req/s",
|
unit="req/s",
|
||||||
@ -1236,7 +1241,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
5,
|
5,
|
||||||
"Per-node throughput",
|
"Per-Node Throughput",
|
||||||
f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
|
f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
|
||||||
{"h": 8, "w": 24, "x": 0, "y": 8},
|
{"h": 8, "w": 24, "x": 0, "y": 8},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
@ -1248,7 +1253,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
table_panel(
|
table_panel(
|
||||||
6,
|
6,
|
||||||
"Top namespaces",
|
"Top Namespaces",
|
||||||
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
|
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
|
||||||
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
|
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
|
||||||
{"h": 9, "w": 12, "x": 0, "y": 16},
|
{"h": 9, "w": 12, "x": 0, "y": 16},
|
||||||
@ -1259,7 +1264,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
table_panel(
|
table_panel(
|
||||||
7,
|
7,
|
||||||
"Top pods",
|
"Top Pods",
|
||||||
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
|
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
|
||||||
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
|
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
|
||||||
{"h": 9, "w": 12, "x": 12, "y": 16},
|
{"h": 9, "w": 12, "x": 12, "y": 16},
|
||||||
@ -1270,7 +1275,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
8,
|
8,
|
||||||
"Traefik routers (req/s)",
|
"Traefik Routers (req/s)",
|
||||||
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
|
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
|
||||||
{"h": 9, "w": 12, "x": 0, "y": 25},
|
{"h": 9, "w": 12, "x": 0, "y": 25},
|
||||||
unit="req/s",
|
unit="req/s",
|
||||||
@ -1282,7 +1287,7 @@ def build_network_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
9,
|
9,
|
||||||
"Traefik entrypoints (req/s)",
|
"Traefik Entrypoints (req/s)",
|
||||||
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
|
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
|
||||||
{"h": 9, "w": 12, "x": 12, "y": 25},
|
{"h": 9, "w": 12, "x": 12, "y": 25},
|
||||||
unit="req/s",
|
unit="req/s",
|
||||||
@ -1310,7 +1315,7 @@ def build_gpu_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
1,
|
1,
|
||||||
"Namespace GPU share",
|
"Namespace GPU Share",
|
||||||
namespace_gpu_share_expr(),
|
namespace_gpu_share_expr(),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 0},
|
{"h": 8, "w": 12, "x": 0, "y": 0},
|
||||||
)
|
)
|
||||||
@ -1318,8 +1323,8 @@ def build_gpu_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
2,
|
2,
|
||||||
"GPU util by namespace",
|
"GPU Util by Namespace",
|
||||||
NAMESPACE_GPU_USAGE,
|
NAMESPACE_GPU_USAGE_INSTANT,
|
||||||
{"h": 8, "w": 12, "x": 12, "y": 0},
|
{"h": 8, "w": 12, "x": 12, "y": 0},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{namespace}}",
|
legend="{{namespace}}",
|
||||||
@ -1330,7 +1335,7 @@ def build_gpu_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
3,
|
3,
|
||||||
"GPU util by node",
|
"GPU Util by Node",
|
||||||
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
|
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 8},
|
{"h": 8, "w": 12, "x": 0, "y": 8},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
@ -1342,7 +1347,7 @@ def build_gpu_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
table_panel(
|
table_panel(
|
||||||
4,
|
4,
|
||||||
"Top pods by GPU util",
|
"Top Pods by GPU Util",
|
||||||
'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
|
'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
|
||||||
{"h": 8, "w": 12, "x": 12, "y": 8},
|
{"h": 8, "w": 12, "x": 12, "y": 8},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU share",
|
"title": "Namespace GPU Share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -20,7 +20,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
"expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -60,7 +60,7 @@
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "GPU util by namespace",
|
"title": "GPU Util by Namespace",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -97,7 +97,7 @@
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "GPU util by node",
|
"title": "GPU Util by Node",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -134,7 +134,7 @@
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top pods by GPU util",
|
"title": "Top Pods by GPU Util",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Ingress traffic",
|
"title": "Ingress Traffic",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -67,7 +67,7 @@
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Egress traffic",
|
"title": "Egress Traffic",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -127,7 +127,7 @@
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Intra-cluster traffic",
|
"title": "Intra-Cluster Traffic",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -187,7 +187,7 @@
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Top router req/s",
|
"title": "Top Router req/s",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -248,7 +248,7 @@
|
|||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Per-node throughput",
|
"title": "Per-Node Throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -285,7 +285,7 @@
|
|||||||
{
|
{
|
||||||
"id": 6,
|
"id": 6,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top namespaces",
|
"title": "Top Namespaces",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -321,7 +321,7 @@
|
|||||||
{
|
{
|
||||||
"id": 7,
|
"id": 7,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top pods",
|
"title": "Top Pods",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -357,7 +357,7 @@
|
|||||||
{
|
{
|
||||||
"id": 8,
|
"id": 8,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Traefik routers (req/s)",
|
"title": "Traefik Routers (req/s)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -394,7 +394,7 @@
|
|||||||
{
|
{
|
||||||
"id": 9,
|
"id": 9,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Traefik entrypoints (req/s)",
|
"title": "Traefik Entrypoints (req/s)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Worker nodes ready",
|
"title": "Worker Nodes Ready",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -68,7 +68,7 @@
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Control plane ready",
|
"title": "Control Plane Ready",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -129,7 +129,7 @@
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Control plane workloads",
|
"title": "Control Plane Workloads",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -269,7 +269,7 @@
|
|||||||
{
|
{
|
||||||
"id": 6,
|
"id": 6,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Control plane (incl. titan-db) CPU",
|
"title": "Control Plane (incl. titan-db) CPU",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -306,7 +306,7 @@
|
|||||||
{
|
{
|
||||||
"id": 7,
|
"id": 7,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Control plane (incl. titan-db) RAM",
|
"title": "Control Plane (incl. titan-db) RAM",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -343,7 +343,7 @@
|
|||||||
{
|
{
|
||||||
"id": 8,
|
"id": 8,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Root filesystem usage",
|
"title": "Root Filesystem Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"uid": "atlas-overview",
|
"uid": "atlas-overview",
|
||||||
"title": "Atlas Overview",
|
"title": "Atlas Overview",
|
||||||
"folderUid": "atlas-overview",
|
"folderUid": "overview",
|
||||||
"editable": false,
|
"editable": false,
|
||||||
"annotations": {
|
"annotations": {
|
||||||
"list": []
|
"list": []
|
||||||
@ -10,7 +10,7 @@
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "gauge",
|
"type": "gauge",
|
||||||
"title": "Workers ready",
|
"title": "Workers Ready",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -71,7 +71,7 @@
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "gauge",
|
"type": "gauge",
|
||||||
"title": "Control plane ready",
|
"title": "Control Plane Ready",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -124,7 +124,7 @@
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Control plane workloads",
|
"title": "Control Plane Workloads",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -198,8 +198,8 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "gauge",
|
"type": "stat",
|
||||||
"title": "Problem pods",
|
"title": "Problem Pods",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -218,8 +218,10 @@
|
|||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"min": 0,
|
"color": {
|
||||||
"max": 4,
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
"thresholds": {
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
@ -240,11 +242,18 @@
|
|||||||
"value": 3
|
"value": 3
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"unit": "none",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"overrides": []
|
"overrides": []
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
"reduceOptions": {
|
"reduceOptions": {
|
||||||
"calcs": [
|
"calcs": [
|
||||||
"lastNotNull"
|
"lastNotNull"
|
||||||
@ -252,9 +261,7 @@
|
|||||||
"fields": "",
|
"fields": "",
|
||||||
"values": false
|
"values": false
|
||||||
},
|
},
|
||||||
"orientation": "auto",
|
"textMode": "value"
|
||||||
"showThresholdMarkers": false,
|
|
||||||
"showThresholdLabels": false
|
|
||||||
},
|
},
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
@ -266,8 +273,8 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "gauge",
|
"type": "stat",
|
||||||
"title": "Stuck terminating",
|
"title": "Stuck Terminating",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -286,8 +293,10 @@
|
|||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"min": 0,
|
"color": {
|
||||||
"max": 4,
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
"thresholds": {
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
@ -308,11 +317,18 @@
|
|||||||
"value": 3
|
"value": 3
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"unit": "none",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"overrides": []
|
"overrides": []
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
"reduceOptions": {
|
"reduceOptions": {
|
||||||
"calcs": [
|
"calcs": [
|
||||||
"lastNotNull"
|
"lastNotNull"
|
||||||
@ -320,9 +336,7 @@
|
|||||||
"fields": "",
|
"fields": "",
|
||||||
"values": false
|
"values": false
|
||||||
},
|
},
|
||||||
"orientation": "auto",
|
"textMode": "value"
|
||||||
"showThresholdMarkers": false,
|
|
||||||
"showThresholdLabels": false
|
|
||||||
},
|
},
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
@ -619,7 +633,7 @@
|
|||||||
{
|
{
|
||||||
"id": 23,
|
"id": 23,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae usage",
|
"title": "Astreae Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -690,7 +704,7 @@
|
|||||||
{
|
{
|
||||||
"id": 24,
|
"id": 24,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria usage",
|
"title": "Asteria Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -761,7 +775,7 @@
|
|||||||
{
|
{
|
||||||
"id": 25,
|
"id": 25,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae free",
|
"title": "Astreae Free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -828,7 +842,7 @@
|
|||||||
{
|
{
|
||||||
"id": 26,
|
"id": 26,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria free",
|
"title": "Asteria Free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -895,7 +909,7 @@
|
|||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace CPU share",
|
"title": "Namespace CPU Share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -948,7 +962,7 @@
|
|||||||
{
|
{
|
||||||
"id": 12,
|
"id": 12,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU share",
|
"title": "Namespace GPU Share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -961,7 +975,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
"expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -1001,7 +1015,7 @@
|
|||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace RAM share",
|
"title": "Namespace RAM Share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1054,7 +1068,7 @@
|
|||||||
{
|
{
|
||||||
"id": 14,
|
"id": 14,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Worker node CPU",
|
"title": "Worker Node CPU",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1101,7 +1115,7 @@
|
|||||||
{
|
{
|
||||||
"id": 15,
|
"id": 15,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Worker node RAM",
|
"title": "Worker Node RAM",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1222,7 +1236,7 @@
|
|||||||
{
|
{
|
||||||
"id": 18,
|
"id": 18,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Cluster ingress throughput",
|
"title": "Cluster Ingress Throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1266,7 +1280,7 @@
|
|||||||
{
|
{
|
||||||
"id": 19,
|
"id": 19,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Cluster egress throughput",
|
"title": "Cluster Egress Throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1310,7 +1324,7 @@
|
|||||||
{
|
{
|
||||||
"id": 20,
|
"id": 20,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Intra-cluster throughput",
|
"title": "Intra-Cluster Throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1354,7 +1368,7 @@
|
|||||||
{
|
{
|
||||||
"id": 21,
|
"id": 21,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Root filesystem usage",
|
"title": "Root Filesystem Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1402,7 +1416,7 @@
|
|||||||
{
|
{
|
||||||
"id": 22,
|
"id": 22,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes closest to full root disks",
|
"title": "Nodes Closest to Full Root Disks",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1456,7 +1470,7 @@
|
|||||||
"calcs": [
|
"calcs": [
|
||||||
"lastNotNull"
|
"lastNotNull"
|
||||||
],
|
],
|
||||||
"fields": "/.*/",
|
"fields": "Value",
|
||||||
"values": false
|
"values": false
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -1479,9 +1493,10 @@
|
|||||||
"list": []
|
"list": []
|
||||||
},
|
},
|
||||||
"time": {
|
"time": {
|
||||||
"from": "now-12h",
|
"from": "now-1h",
|
||||||
"to": "now"
|
"to": "now"
|
||||||
},
|
},
|
||||||
|
"refresh": "1m",
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Atlas Pods",
|
"title": "Atlas Pods",
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Problem pods",
|
"title": "Problem Pods",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -127,7 +127,7 @@
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Stuck terminating (>10m)",
|
"title": "Stuck Terminating (>10m)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -187,7 +187,7 @@
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Control plane workloads",
|
"title": "Control Plane Workloads",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -247,7 +247,7 @@
|
|||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Pods not running",
|
"title": "Pods Not Running",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae usage",
|
"title": "Astreae Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -71,7 +71,7 @@
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria usage",
|
"title": "Asteria Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -135,7 +135,7 @@
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae free",
|
"title": "Astreae Free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -195,7 +195,7 @@
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria free",
|
"title": "Asteria Free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -255,7 +255,7 @@
|
|||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Astreae per-node usage",
|
"title": "Astreae Per-Node Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -293,7 +293,7 @@
|
|||||||
{
|
{
|
||||||
"id": 6,
|
"id": 6,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Asteria per-node usage",
|
"title": "Asteria Per-Node Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -331,7 +331,7 @@
|
|||||||
{
|
{
|
||||||
"id": 7,
|
"id": 7,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Astreae usage history",
|
"title": "Astreae Usage History",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -368,7 +368,7 @@
|
|||||||
{
|
{
|
||||||
"id": 8,
|
"id": 8,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Asteria usage history",
|
"title": "Asteria Usage History",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -16,7 +16,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU share",
|
"title": "Namespace GPU Share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -29,7 +29,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
"expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -69,7 +69,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "GPU util by namespace",
|
"title": "GPU Util by Namespace",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -106,7 +106,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "GPU util by node",
|
"title": "GPU Util by Node",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -143,7 +143,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top pods by GPU util",
|
"title": "Top Pods by GPU Util",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -16,7 +16,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Ingress traffic",
|
"title": "Ingress Traffic",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -76,7 +76,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Egress traffic",
|
"title": "Egress Traffic",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -136,7 +136,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Intra-cluster traffic",
|
"title": "Intra-Cluster Traffic",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -196,7 +196,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Top router req/s",
|
"title": "Top Router req/s",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -257,7 +257,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Per-node throughput",
|
"title": "Per-Node Throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -294,7 +294,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 6,
|
"id": 6,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top namespaces",
|
"title": "Top Namespaces",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -330,7 +330,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 7,
|
"id": 7,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top pods",
|
"title": "Top Pods",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -366,7 +366,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 8,
|
"id": 8,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Traefik routers (req/s)",
|
"title": "Traefik Routers (req/s)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -403,7 +403,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 9,
|
"id": 9,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Traefik entrypoints (req/s)",
|
"title": "Traefik Entrypoints (req/s)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -16,7 +16,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Worker nodes ready",
|
"title": "Worker Nodes Ready",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -77,7 +77,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Control plane ready",
|
"title": "Control Plane Ready",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -138,7 +138,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Control plane workloads",
|
"title": "Control Plane Workloads",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -278,7 +278,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 6,
|
"id": 6,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Control plane (incl. titan-db) CPU",
|
"title": "Control Plane (incl. titan-db) CPU",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -315,7 +315,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 7,
|
"id": 7,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Control plane (incl. titan-db) RAM",
|
"title": "Control Plane (incl. titan-db) RAM",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -352,7 +352,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 8,
|
"id": 8,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Root filesystem usage",
|
"title": "Root Filesystem Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -10,7 +10,7 @@ data:
|
|||||||
{
|
{
|
||||||
"uid": "atlas-overview",
|
"uid": "atlas-overview",
|
||||||
"title": "Atlas Overview",
|
"title": "Atlas Overview",
|
||||||
"folderUid": "atlas-overview",
|
"folderUid": "overview",
|
||||||
"editable": false,
|
"editable": false,
|
||||||
"annotations": {
|
"annotations": {
|
||||||
"list": []
|
"list": []
|
||||||
@ -19,7 +19,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "gauge",
|
"type": "gauge",
|
||||||
"title": "Workers ready",
|
"title": "Workers Ready",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -80,7 +80,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "gauge",
|
"type": "gauge",
|
||||||
"title": "Control plane ready",
|
"title": "Control Plane Ready",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -133,7 +133,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Control plane workloads",
|
"title": "Control Plane Workloads",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -207,8 +207,8 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "gauge",
|
"type": "stat",
|
||||||
"title": "Problem pods",
|
"title": "Problem Pods",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -227,8 +227,10 @@ data:
|
|||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"min": 0,
|
"color": {
|
||||||
"max": 4,
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
"thresholds": {
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
@ -249,11 +251,18 @@ data:
|
|||||||
"value": 3
|
"value": 3
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"unit": "none",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"overrides": []
|
"overrides": []
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
"reduceOptions": {
|
"reduceOptions": {
|
||||||
"calcs": [
|
"calcs": [
|
||||||
"lastNotNull"
|
"lastNotNull"
|
||||||
@ -261,9 +270,7 @@ data:
|
|||||||
"fields": "",
|
"fields": "",
|
||||||
"values": false
|
"values": false
|
||||||
},
|
},
|
||||||
"orientation": "auto",
|
"textMode": "value"
|
||||||
"showThresholdMarkers": false,
|
|
||||||
"showThresholdLabels": false
|
|
||||||
},
|
},
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
@ -275,8 +282,8 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "gauge",
|
"type": "stat",
|
||||||
"title": "Stuck terminating",
|
"title": "Stuck Terminating",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -295,8 +302,10 @@ data:
|
|||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"min": 0,
|
"color": {
|
||||||
"max": 4,
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
"thresholds": {
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
@ -317,11 +326,18 @@ data:
|
|||||||
"value": 3
|
"value": 3
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"unit": "none",
|
||||||
|
"custom": {
|
||||||
|
"displayMode": "auto"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"overrides": []
|
"overrides": []
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "center",
|
||||||
"reduceOptions": {
|
"reduceOptions": {
|
||||||
"calcs": [
|
"calcs": [
|
||||||
"lastNotNull"
|
"lastNotNull"
|
||||||
@ -329,9 +345,7 @@ data:
|
|||||||
"fields": "",
|
"fields": "",
|
||||||
"values": false
|
"values": false
|
||||||
},
|
},
|
||||||
"orientation": "auto",
|
"textMode": "value"
|
||||||
"showThresholdMarkers": false,
|
|
||||||
"showThresholdLabels": false
|
|
||||||
},
|
},
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
@ -628,7 +642,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 23,
|
"id": 23,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae usage",
|
"title": "Astreae Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -699,7 +713,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 24,
|
"id": 24,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria usage",
|
"title": "Asteria Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -770,7 +784,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 25,
|
"id": 25,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae free",
|
"title": "Astreae Free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -837,7 +851,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 26,
|
"id": 26,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria free",
|
"title": "Asteria Free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -904,7 +918,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace CPU share",
|
"title": "Namespace CPU Share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -957,7 +971,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 12,
|
"id": 12,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU share",
|
"title": "Namespace GPU Share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -970,7 +984,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
"expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -1010,7 +1024,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace RAM share",
|
"title": "Namespace RAM Share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1063,7 +1077,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 14,
|
"id": 14,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Worker node CPU",
|
"title": "Worker Node CPU",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1110,7 +1124,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 15,
|
"id": 15,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Worker node RAM",
|
"title": "Worker Node RAM",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1231,7 +1245,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 18,
|
"id": 18,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Cluster ingress throughput",
|
"title": "Cluster Ingress Throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1275,7 +1289,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 19,
|
"id": 19,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Cluster egress throughput",
|
"title": "Cluster Egress Throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1319,7 +1333,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 20,
|
"id": 20,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Intra-cluster throughput",
|
"title": "Intra-Cluster Throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1363,7 +1377,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 21,
|
"id": 21,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Root filesystem usage",
|
"title": "Root Filesystem Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1411,7 +1425,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 22,
|
"id": 22,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes closest to full root disks",
|
"title": "Nodes Closest to Full Root Disks",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1465,7 +1479,7 @@ data:
|
|||||||
"calcs": [
|
"calcs": [
|
||||||
"lastNotNull"
|
"lastNotNull"
|
||||||
],
|
],
|
||||||
"fields": "/.*/",
|
"fields": "Value",
|
||||||
"values": false
|
"values": false
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -1488,9 +1502,10 @@ data:
|
|||||||
"list": []
|
"list": []
|
||||||
},
|
},
|
||||||
"time": {
|
"time": {
|
||||||
"from": "now-12h",
|
"from": "now-1h",
|
||||||
"to": "now"
|
"to": "now"
|
||||||
},
|
},
|
||||||
|
"refresh": "1m",
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Atlas Pods",
|
"title": "Atlas Pods",
|
||||||
|
|||||||
@ -16,7 +16,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Problem pods",
|
"title": "Problem Pods",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -136,7 +136,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Stuck terminating (>10m)",
|
"title": "Stuck Terminating (>10m)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -196,7 +196,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Control plane workloads",
|
"title": "Control Plane Workloads",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -256,7 +256,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Pods not running",
|
"title": "Pods Not Running",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -16,7 +16,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae usage",
|
"title": "Astreae Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -80,7 +80,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria usage",
|
"title": "Asteria Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -144,7 +144,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae free",
|
"title": "Astreae Free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -204,7 +204,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria free",
|
"title": "Asteria Free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -264,7 +264,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Astreae per-node usage",
|
"title": "Astreae Per-Node Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -302,7 +302,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 6,
|
"id": 6,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Asteria per-node usage",
|
"title": "Asteria Per-Node Usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -340,7 +340,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 7,
|
"id": 7,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Astreae usage history",
|
"title": "Astreae Usage History",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -377,7 +377,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 8,
|
"id": 8,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Asteria usage history",
|
"title": "Asteria Usage History",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
|
|||||||
@ -10,8 +10,8 @@ data:
|
|||||||
folders.yaml: |
|
folders.yaml: |
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
folders:
|
folders:
|
||||||
- uid: atlas-overview
|
- uid: overview
|
||||||
title: Atlas Overview
|
title: Overview
|
||||||
permissions:
|
permissions:
|
||||||
- role: Viewer
|
- role: Viewer
|
||||||
permission: View
|
permission: View
|
||||||
@ -26,3 +26,10 @@ data:
|
|||||||
permission: View
|
permission: View
|
||||||
- role: Admin
|
- role: Admin
|
||||||
permission: Admin
|
permission: Admin
|
||||||
|
- uid: oceanus-internal
|
||||||
|
title: Oceanus Internal
|
||||||
|
permissions:
|
||||||
|
- role: Editor
|
||||||
|
permission: View
|
||||||
|
- role: Admin
|
||||||
|
permission: Admin
|
||||||
|
|||||||
@ -256,6 +256,8 @@ spec:
|
|||||||
server:
|
server:
|
||||||
domain: metrics.bstein.dev
|
domain: metrics.bstein.dev
|
||||||
root_url: https://metrics.bstein.dev/
|
root_url: https://metrics.bstein.dev/
|
||||||
|
dashboards:
|
||||||
|
default_home_dashboard_path: /var/lib/grafana/dashboards/overview/atlas-overview.json
|
||||||
auth.anonymous:
|
auth.anonymous:
|
||||||
hide_version: true
|
hide_version: true
|
||||||
users:
|
users:
|
||||||
@ -290,7 +292,7 @@ spec:
|
|||||||
providers:
|
providers:
|
||||||
- name: overview
|
- name: overview
|
||||||
orgId: 1
|
orgId: 1
|
||||||
folder: Atlas Overview
|
folder: Overview
|
||||||
type: file
|
type: file
|
||||||
disableDeletion: false
|
disableDeletion: false
|
||||||
editable: false
|
editable: false
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user