monitoring(gpu): add process-level utilization attribution
This commit is contained in:
parent
5513608b1a
commit
fd3da0e2ae
@ -261,7 +261,7 @@ def namespace_ram_raw(scope_var):
|
||||
|
||||
|
||||
def namespace_gpu_usage_instant(scope_var):
    """Return the PromQL expression for instantaneous GPU usage per namespace.

    Delegates to ``nvidia_process_gpu_usage_by_namespace`` so the value is the
    process-level attribution rather than the earlier reservation-based
    ``gpu_usage_by_namespace`` expression. The earlier return statement was
    left in front of this one, which made the process-level query unreachable.

    Args:
        scope_var: Scope selector forwarded unchanged to the underlying
            expression builder.

    Returns:
        The PromQL expression string.
    """
    return nvidia_process_gpu_usage_by_namespace(scope_var)
|
||||
|
||||
|
||||
def jetson_gpu_util_by_node():
|
||||
@ -343,21 +343,51 @@ def namespace_ram_share_expr(scope_var):
|
||||
return namespace_share_expr(namespace_ram_raw(scope_var))
|
||||
|
||||
|
||||
def current_gpu_claim_count(scope_var):
    """Return PromQL counting the namespaces whose GPU requests are above zero.

    The expression falls back to ``vector(0)`` so it always yields a sample,
    even when no namespace matches.

    Args:
        scope_var: Scope selector forwarded to ``gpu_requests_by_namespace``.

    Returns:
        The PromQL expression string.
    """
    claiming = f"({gpu_requests_by_namespace(scope_var)}) > 0"
    return f"(count({claiming}) or on() vector(0))"
|
||||
def nvidia_process_gpu_usage_by_namespace(scope_var):
    """Return PromQL summing ``nvidia_namespace_gpu_sm_util_percent`` by namespace.

    Args:
        scope_var: Scope selector passed to ``namespace_gpu_selector`` to
            build the label matchers placed inside the metric's braces.

    Returns:
        The PromQL expression string.
    """
    selector = namespace_gpu_selector(scope_var)
    # Triple braces: a doubled pair emits the literal PromQL braces, the
    # inner pair interpolates the selector.
    series = f"nvidia_namespace_gpu_sm_util_percent{{{selector}}}"
    return f"sum by (namespace) ({series})"
|
||||
|
||||
|
||||
def nvidia_process_gpu_present():
    """Return PromQL counting ``nvidia_gpu_device_utilization_percent`` series.

    The ``or on() vector(0)`` fallback makes the expression evaluate to 0
    instead of an empty result when the metric has no series.

    Returns:
        The PromQL expression string.
    """
    device_count = "count(nvidia_gpu_device_utilization_percent)"
    return f"({device_count} or on() vector(0))"
|
||||
|
||||
|
||||
def gpu_capacity_percent():
    """Return PromQL for total GPU capacity, expressed as 100 per counted series.

    Three alternatives are chained with ``or``:

    1. 100 times the number of ``nvidia_gpu_device_utilization_percent`` series.
    2. 100 times the number of series from ``gpu_util_by_node()``, suppressed
       via ``unless on()`` whenever the device metric has any series.
    3. ``vector(0)`` when neither source yields data.

    Returns:
        The PromQL expression string.
    """
    device_metric = "nvidia_gpu_device_utilization_percent"
    from_devices = f"100 * count({device_metric})"
    from_legacy = f"100 * count({gpu_util_by_node()}) unless on() {device_metric}"
    return f"(({from_devices}) or ({from_legacy}) or on() vector(0))"
|
||||
|
||||
|
||||
def unattributed_gpu_usage():
    """Return PromQL for GPU utilization reported under namespace "unattributed".

    The value is the sum of ``gpu_util_by_node()`` (0 when that has no data),
    relabelled with ``namespace="unattributed"``. The whole series is dropped
    via ``unless on()`` when ``nvidia_process_gpu_present()`` is above zero.

    Returns:
        The PromQL expression string.
    """
    legacy_total = f"(sum({gpu_util_by_node()}) or on() vector(0))"
    relabelled = f'label_replace({legacy_total}, "namespace", "unattributed", "", "")'
    return f"{relabelled} unless on() ({nvidia_process_gpu_present()} > 0)"
|
||||
|
||||
|
||||
def namespace_gpu_share_expr(scope_var):
    """Return the PromQL expression for GPU utilization share per namespace.

    The result is the union (``or``) of three parts, each labelled by
    ``namespace``:

    * utilization -- process-level usage per namespace, or the
      "unattributed" fallback from ``unattributed_gpu_usage()``.
    * unused -- capacity minus total utilization, clamped at 0, emitted only
      while total utilization is above zero.
    * idle -- a constant 100, emitted only while total utilization is
      exactly zero.

    The earlier reservation-based ``avg_over_time`` utilization, the
    claim-count guard on ``idle``, and the two-part return were left mixed in
    with this logic; they left a parenthesis unclosed and a return
    unreachable, so only the process-level form is kept.

    Args:
        scope_var: Scope selector forwarded to
            ``nvidia_process_gpu_usage_by_namespace``.

    Returns:
        The PromQL expression string.
    """
    utilization = (
        f"({nvidia_process_gpu_usage_by_namespace(scope_var)}) "
        f"or ({unattributed_gpu_usage()})"
    )
    # Fall back to vector(0) so the comparisons below always have a sample.
    total = f"(sum({utilization}) or on() vector(0))"
    unused = (
        'label_replace(clamp_min('
        f"{gpu_capacity_percent()} - {total}"
        ', 0), "namespace", "unused", "", "") '
        f"and on() ({total} > 0)"
    )
    idle = (
        'label_replace(vector(100), "namespace", "idle", "", "") '
        f"and on() ({total} == 0)"
    )
    return f"({utilization}) or ({unused}) or ({idle})"
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = (
|
||||
@ -1814,7 +1844,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
|
||||
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
||||
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
||||
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
||||
"Namespace GPU Utilization": "Measured GPU utilization attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.",
|
||||
"Namespace GPU Utilization": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity.",
|
||||
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
||||
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
||||
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
||||
@ -2831,7 +2861,7 @@ def build_overview():
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 9, "w": 8, "x": 8, "y": 23},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.",
|
||||
description="Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -5352,20 +5382,20 @@ def build_gpu_dashboard():
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.",
|
||||
description="Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
2,
|
||||
"GPU Activity by Reservation",
|
||||
"GPU Process Util by Namespace",
|
||||
namespace_gpu_usage_instant(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
unit="percent",
|
||||
legend="{{namespace}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
description="Node/device GPU activity attributed by each namespace's GPU reservation on that node.",
|
||||
description="NVML process-level SM utilization by namespace. Host covers GPU work outside Kubernetes pods.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
|
||||
@ -155,13 +155,12 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
|
||||
assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr
|
||||
|
||||
gpu_expr = panels_by_title["Namespace GPU Utilization"]["targets"][0]["expr"]
|
||||
assert "DCGM_FI_DEV_GPU_UTIL" in gpu_expr
|
||||
assert "nvidia_namespace_gpu_sm_util_percent" in gpu_expr
|
||||
assert "nvidia_gpu_device_utilization_percent" in gpu_expr
|
||||
assert "sum by (namespace)" in gpu_expr
|
||||
assert 'namespace", "shared"' not in gpu_expr
|
||||
assert "kube_node_labels" not in gpu_expr
|
||||
assert "avg_over_time(" in gpu_expr
|
||||
assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr
|
||||
assert "and on(namespace)" in gpu_expr
|
||||
assert 'namespace", "unused"' in gpu_expr
|
||||
assert 'namespace", "idle"' in gpu_expr
|
||||
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))",
|
||||
"expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -71,12 +71,12 @@
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU."
|
||||
"description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity."
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "GPU Activity by Reservation",
|
||||
"title": "GPU Process Util by Namespace",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -89,7 +89,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
|
||||
"expr": "sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -109,7 +109,7 @@
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
|
||||
"description": "NVML process-level SM utilization by namespace. Host covers GPU work outside Kubernetes pods."
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
|
||||
@ -3728,7 +3728,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))",
|
||||
"expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -3779,7 +3779,7 @@
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU."
|
||||
"description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity."
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))",
|
||||
"expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -80,12 +80,12 @@ data:
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU."
|
||||
"description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity."
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "GPU Activity by Reservation",
|
||||
"title": "GPU Process Util by Namespace",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -98,7 +98,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
|
||||
"expr": "sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -118,7 +118,7 @@ data:
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
|
||||
"description": "NVML process-level SM utilization by namespace. Host covers GPU work outside Kubernetes pods."
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
|
||||
@ -3737,7 +3737,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))",
|
||||
"expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -3788,7 +3788,7 @@ data:
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU."
|
||||
"description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity."
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
|
||||
@ -19,6 +19,7 @@ resources:
|
||||
- grafana-dashboard-testing.yaml
|
||||
- vmalert-atlas-availability.yaml
|
||||
- dcgm-exporter.yaml
|
||||
- nvidia-process-exporter.yaml
|
||||
- jetson-tegrastats-exporter.yaml
|
||||
- postmark-exporter-service.yaml
|
||||
- postmark-exporter-deployment.yaml
|
||||
@ -46,6 +47,12 @@ configMapGenerator:
|
||||
- exporter.py=scripts/jetson_tegrastats_exporter.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: nvidia-process-exporter-script
|
||||
namespace: monitoring
|
||||
files:
|
||||
- exporter.py=scripts/nvidia_process_exporter.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: monitoring-vault-entrypoint
|
||||
namespace: monitoring
|
||||
files:
|
||||
|
||||
136
services/monitoring/nvidia-process-exporter.yaml
Normal file
136
services/monitoring/nvidia-process-exporter.yaml
Normal file
@ -0,0 +1,136 @@
|
||||
# services/monitoring/nvidia-process-exporter.yaml
|
||||
# Identity the exporter DaemonSet runs as; it is the subject of the
# nvidia-process-exporter ClusterRoleBinding.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
|
||||
---
|
||||
# Cluster-wide read access to pods for the exporter.
# The exporter script only issues a collection GET on /api/v1/pods with a
# spec.nodeName field selector, which the API server authorizes as "list".
# "get" and "watch" were never exercised, so they are dropped to keep the
# grant at least privilege.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-process-exporter
rules:
  - apiGroups: [""]
    resources:
      - pods
    verbs: ["list"]
|
||||
---
|
||||
# Grants the nvidia-process-exporter ClusterRole to the exporter's
# ServiceAccount in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-process-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-process-exporter
subjects:
  - kind: ServiceAccount
    name: nvidia-process-exporter
    namespace: monitoring
|
||||
---
|
||||
# Runs one exporter pod per eligible node. The pod executes the Python script
# shipped in the nvidia-process-exporter-script ConfigMap and serves metrics
# on port 9401.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
  labels:
    app: nvidia-process-exporter
spec:
  selector:
    matchLabels:
      app: nvidia-process-exporter
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: nvidia-process-exporter
      annotations:
        # Scrape hints; presumably consumed by annotation-based discovery in
        # the cluster's metrics stack -- confirm against the scrape config.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9401"
    spec:
      serviceAccountName: nvidia-process-exporter
      imagePullSecrets:
        - name: harbor-regcred
      # NOTE(review): presumably selects the NVIDIA container runtime so the
      # driver libraries are injected -- confirm the RuntimeClass exists.
      runtimeClassName: nvidia
      # Host PID namespace: the exporter looks up GPU process PIDs under
      # /host/proc/<pid> and /proc/<pid> to read comm and cgroup.
      hostPID: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions belong to one term, so they are ANDed:
              # amd64 nodes that are not labelled jetson=true.
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
                  - key: jetson
                    operator: NotIn
                    values:
                      - "true"
      tolerations:
        # A key-less Exists toleration matches every taint.
        - operator: Exists
      containers:
        - name: exporter
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          ports:
            - name: metrics
              containerPort: 9401
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            # Read by the script as its listen port (defaults to 9401).
            - name: NVIDIA_PROCESS_EXPORTER_PORT
              value: "9401"
            # Read by the script to label metrics and to filter the pod list
            # down to this node.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          command:
            - sh
            - -lc
            # NOTE(review): the NVML bindings are installed at every container
            # start, so startup depends on reaching a package index.
            - |
              pip install --no-cache-dir nvidia-ml-py==13.595.45
              exec python /etc/nvidia-process-exporter/exporter.py
          securityContext:
            # NOTE(review): full privilege on top of hostPID; confirm it is
            # actually required for the NVML queries the script performs.
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 96Mi
            limits:
              cpu: 250m
              memory: 256Mi
          volumeMounts:
            - name: script
              mountPath: /etc/nvidia-process-exporter
              readOnly: true
            # Matches the script's default HOST_PROC of /host/proc.
            - name: host-proc
              mountPath: /host/proc
              readOnly: true
      volumes:
        - name: script
          configMap:
            name: nvidia-process-exporter-script
            defaultMode: 0555
        - name: host-proc
          hostPath:
            path: /proc
            type: Directory
|
||||
---
|
||||
# Exposes the exporter pods' named "metrics" container port as port 9401.
apiVersion: v1
kind: Service
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
  labels:
    app: nvidia-process-exporter
spec:
  selector:
    app: nvidia-process-exporter
  ports:
    - name: metrics
      port: 9401
      targetPort: metrics
|
||||
248
services/monitoring/scripts/nvidia_process_exporter.py
Normal file
248
services/monitoring/scripts/nvidia_process_exporter.py
Normal file
@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import ssl
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
|
||||
from pynvml import (
|
||||
NVMLError,
|
||||
NVMLError_NotFound,
|
||||
NVMLError_NotSupported,
|
||||
nvmlDeviceGetComputeRunningProcesses_v3,
|
||||
nvmlDeviceGetCount,
|
||||
nvmlDeviceGetGraphicsRunningProcesses_v3,
|
||||
nvmlDeviceGetHandleByIndex,
|
||||
nvmlDeviceGetName,
|
||||
nvmlDeviceGetProcessUtilization,
|
||||
nvmlDeviceGetUUID,
|
||||
nvmlDeviceGetUtilizationRates,
|
||||
nvmlInit,
|
||||
)
|
||||
|
||||
# --- Runtime configuration: each value can be overridden via environment ---
NODE_NAME = os.getenv("NODE_NAME", "")
PORT = int(os.getenv("NVIDIA_PROCESS_EXPORTER_PORT", "9401"))
PROC_ROOT = os.getenv("HOST_PROC", "/host/proc")
SAMPLE_WINDOW_MS = int(os.getenv("NVML_PROCESS_SAMPLE_WINDOW_MS", "30000"))
POD_CACHE_TTL = int(os.getenv("POD_CACHE_TTL_SECONDS", "30"))
METRIC_CACHE_TTL = int(os.getenv("METRIC_CACHE_TTL_SECONDS", "5"))

# --- In-cluster service-account credentials used to call the API server ---
TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"

# Captures the pod UID that follows "pod" in a cgroup path; the UID may be
# written with dashes or underscores, hence the 32-36 character range.
POD_UID_RE = re.compile(r"pod([0-9a-fA-F_-]{32,36})")
# Matches any character outside [a-zA-Z0-9_:].
SAFE_LABEL_RE = re.compile(r"[^a-zA-Z0-9_:]")

# --- Process-wide caches, refreshed lazily when older than their TTL ---
pod_cache = dict(loaded_at=0.0, pods={})
metric_cache = dict(loaded_at=0.0, body="")
|
||||
|
||||
|
||||
def label_value(value):
    """Return ``str(value)`` escaped for use inside a quoted label value.

    Backslash, line feed and double quote are each replaced by their
    backslash-escaped form; every other character passes through unchanged.
    """
    # One pass over the text: escapes produced here are never re-escaped.
    escapes = str.maketrans({"\\": "\\\\", "\n": "\\n", '"': '\\"'})
    return str(value).translate(escapes)
|
||||
|
||||
|
||||
def metric_line(name, labels, value):
    """Format one sample line: ``name{k1="v1",k2="v2"} value``.

    Labels are emitted in sorted key order and each value is escaped with
    ``label_value``.
    """
    pairs = []
    for key in sorted(labels):
        pairs.append(f'{key}="{label_value(labels[key])}"')
    joined = ",".join(pairs)
    return f"{name}{{{joined}}} {value}"
|
||||
|
||||
|
||||
def uid_key(value):
    """Normalise a UID string to its lower-case hexadecimal digits only.

    Everything that is not ``0-9`` or ``a-f`` after lower-casing (dashes,
    underscores, ...) is dropped, so differently punctuated spellings of the
    same UID map to the same key.
    """
    hex_digits = frozenset("0123456789abcdef")
    return "".join(ch for ch in value.lower() if ch in hex_digits)
|
||||
|
||||
|
||||
def process_name(pid):
    """Return the command name of *pid*, or ``"unknown"`` if unavailable.

    The ``comm`` file is looked up under ``PROC_ROOT`` first and then under
    the container's own ``/proc``. A path that cannot be opened or that yields
    an empty name is skipped in favour of the next one.
    """
    for path in (f"{PROC_ROOT}/{pid}/comm", f"/proc/{pid}/comm"):
        try:
            # comm holds whatever bytes the process chose as its name, so it
            # is not guaranteed to be valid UTF-8. Decode leniently: a strict
            # decode raises UnicodeDecodeError, which is not an OSError and
            # would abort the whole metrics collection.
            with open(path, encoding="utf-8", errors="replace") as handle:
                name = handle.read().strip()
        except OSError:
            continue
        if name:
            return name
    return "unknown"
|
||||
|
||||
|
||||
def process_cgroup(pid):
    """Return the raw text of *pid*'s ``cgroup`` file, or ``""``.

    The file is looked up under ``PROC_ROOT`` first and then under the
    container's own ``/proc``; the first one that can be opened wins, even if
    it is empty.
    """
    for path in (f"{PROC_ROOT}/{pid}/cgroup", f"/proc/{pid}/cgroup"):
        try:
            # Decode leniently, matching process_name: a stray non-UTF-8 byte
            # in a cgroup path would otherwise raise UnicodeDecodeError, which
            # the OSError handler below does not cover.
            with open(path, encoding="utf-8", errors="replace") as handle:
                return handle.read()
        except OSError:
            continue
    return ""
|
||||
|
||||
|
||||
def load_pods():
    """Return ``{normalised pod UID: {"namespace": ..., "pod": ...}}`` for this node.

    The mapping is served from ``pod_cache`` while it is younger than
    ``POD_CACHE_TTL`` seconds. Otherwise the pods scheduled on ``NODE_NAME``
    are listed from the Kubernetes API using the mounted service-account
    token and CA bundle, and the cache is replaced. When the API host or the
    node name is not configured, an empty mapping is returned and nothing is
    cached.

    File, network and JSON errors raised while refreshing are not handled
    here and propagate to the caller.
    """
    started = time.time()
    cache_age = started - pod_cache["loaded_at"]
    if cache_age < POD_CACHE_TTL:
        return pod_cache["pods"]

    api_host = os.environ.get("KUBERNETES_SERVICE_HOST")
    api_port = os.environ.get("KUBERNETES_SERVICE_PORT", "443")
    if not (api_host and NODE_NAME):
        return {}

    with open(TOKEN_PATH, encoding="utf-8") as token_file:
        bearer = token_file.read().strip()

    # Restrict the listing to pods bound to this node.
    field_selector = urllib.parse.quote(f"spec.nodeName={NODE_NAME}", safe="")
    request = urllib.request.Request(
        f"https://{api_host}:{api_port}/api/v1/pods?fieldSelector={field_selector}",
        headers={"Authorization": f"Bearer {bearer}"},
    )
    tls = ssl.create_default_context(cafile=CA_PATH)
    with urllib.request.urlopen(request, context=tls, timeout=10) as response:
        pod_list = json.load(response)

    all_metadata = (item.get("metadata", {}) for item in pod_list.get("items", []))
    by_uid = {
        uid_key(meta["uid"]): {
            "namespace": meta.get("namespace", "unknown"),
            "pod": meta.get("name", "unknown"),
        }
        for meta in all_metadata
        if meta.get("uid", "")  # entries without a UID cannot be matched
    }

    # The timestamp taken before the request is the one recorded.
    pod_cache["loaded_at"] = started
    pod_cache["pods"] = by_uid
    return by_uid
|
||||
|
||||
|
||||
def pod_for_pid(pid, pods):
    """Map a process to the ``{"namespace", "pod"}`` entry that owns it.

    The pod UID is extracted from the process's cgroup text. A process with
    no pod UID in its cgroup is reported as ``host``/``host``; one whose UID
    is not present in *pods* is reported as ``unknown``/``unknown``.
    """
    found = POD_UID_RE.search(process_cgroup(pid))
    if found is None:
        return {"namespace": "host", "pod": "host"}
    fallback = {"namespace": "unknown", "pod": "unknown"}
    return pods.get(uid_key(found.group(1)), fallback)
|
||||
|
||||
|
||||
def running_process_memory(handle):
    """Collect GPU memory and context kinds for processes running on a device.

    Returns ``{pid: {"memory": bytes, "types": {"compute", "graphics"}}}``.
    A PID reported by both the compute and the graphics query has its memory
    figures added together and carries both kinds. A query that raises
    ``NVMLError_NotFound`` or ``NVMLError_NotSupported`` is skipped.
    """
    queries = {
        "compute": nvmlDeviceGetComputeRunningProcesses_v3,
        "graphics": nvmlDeviceGetGraphicsRunningProcesses_v3,
    }
    usage = {}
    for kind, list_processes in queries.items():
        try:
            for proc in list_processes(handle):
                pid = int(proc.pid)
                if pid not in usage:
                    usage[pid] = {"memory": 0, "types": set()}
                record = usage[pid]
                # usedGpuMemory may be falsy (e.g. None); count that as 0.
                record["memory"] += int(proc.usedGpuMemory or 0)
                record["types"].add(kind)
        except (NVMLError_NotFound, NVMLError_NotSupported):
            pass
    return usage
|
||||
|
||||
|
||||
def process_utilization_samples(handle):
    """Return the newest per-process utilization sample within the window.

    Returns a ``(by_pid, supported)`` tuple. ``by_pid`` maps each PID to a
    dict with ``timestamp``, ``sm``, ``memory``, ``enc`` and ``dec`` taken
    from that PID's most recent sample. ``supported`` is ``0`` when NVML
    raises ``NVMLError_NotSupported`` and ``1`` otherwise; an
    ``NVMLError_NotFound`` yields an empty mapping with ``supported`` = 1.
    Any other NVML error propagates to the caller.
    """
    # NVML stamps samples in microseconds since the epoch and returns only
    # samples newer than the cutoff it is given. SAMPLE_WINDOW_MS is in
    # milliseconds, so both terms are converted to microseconds; a
    # millisecond cutoff would be ~1000x too small and would never filter
    # out stale samples.
    since_us = int(time.time() * 1_000_000) - SAMPLE_WINDOW_MS * 1000
    try:
        samples = nvmlDeviceGetProcessUtilization(handle, since_us)
    except NVMLError_NotFound:
        return {}, 1
    except NVMLError_NotSupported:
        return {}, 0

    by_pid = {}
    for sample in samples:
        pid = int(sample.pid)
        current = by_pid.get(pid)
        # Keep only the latest sample per PID (ties go to the later entry).
        if current is None or sample.timeStamp >= current["timestamp"]:
            by_pid[pid] = {
                "timestamp": int(sample.timeStamp),
                "sm": int(sample.smUtil),
                "memory": int(sample.memUtil),
                "enc": int(sample.encUtil),
                "dec": int(sample.decUtil),
            }
    return by_pid, 1
|
||||
|
||||
|
||||
def collect_metrics():
    """Query NVML and return the metrics page as exposition-format text.

    For every GPU this emits the device utilization, a flag telling whether
    per-process utilization is available, per-process SM utilization and
    memory, and SM utilization summed per namespace. Device utilization that
    no process sample accounts for is added to the ``host`` namespace when it
    exceeds 0.1.

    Samples are buffered per metric family and written out one family at a
    time, each directly after its HELP/TYPE header, because the text format
    requires all lines of a metric to form a single group.

    NVML, file and network errors propagate to the caller.
    """
    nvmlInit()
    pods = load_pods()

    families = (
        ("nvidia_gpu_device_utilization_percent", "Current NVML device GPU utilization."),
        ("nvidia_process_gpu_sm_util_percent", "Recent per-process SM utilization from NVML."),
        ("nvidia_process_gpu_memory_used_bytes", "GPU memory held by a process."),
        (
            "nvidia_namespace_gpu_sm_util_percent",
            "GPU SM utilization attributed to namespace, with host/unattributed residual included.",
        ),
        (
            "nvidia_gpu_process_utilization_supported",
            "Whether NVML process utilization samples are available for the device.",
        ),
    )
    samples = {metric: [] for metric, _ in families}

    def emit(metric, labels, value):
        # Buffer the sample under its family so output stays grouped.
        samples[metric].append(metric_line(metric, labels, value))

    for gpu_index in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(gpu_index)
        uuid = nvmlDeviceGetUUID(handle)
        name = nvmlDeviceGetName(handle)
        device_util = float(nvmlDeviceGetUtilizationRates(handle).gpu)
        base = {"node": NODE_NAME, "gpu": gpu_index, "uuid": uuid, "model": name}
        emit("nvidia_gpu_device_utilization_percent", base, device_util)

        memory_by_pid = running_process_memory(handle)
        util_by_pid, supported = process_utilization_samples(handle)
        emit("nvidia_gpu_process_utilization_supported", base, supported)

        namespace_sm = {}
        # A PID may appear in only one of the two sources; report the union.
        for pid in sorted(set(memory_by_pid) | set(util_by_pid)):
            proc_info = memory_by_pid.get(pid, {"memory": 0, "types": set()})
            util_info = util_by_pid.get(pid, {"sm": 0, "memory": 0, "enc": 0, "dec": 0})
            pod = pod_for_pid(pid, pods)
            proc_name = process_name(pid)
            proc_type = "+".join(sorted(proc_info["types"])) or "unknown"
            labels = {
                **base,
                "namespace": pod["namespace"],
                "pod": pod["pod"],
                "pid": pid,
                "process": proc_name,
                "type": proc_type,
            }
            sm_util = float(util_info["sm"])
            namespace_sm[pod["namespace"]] = namespace_sm.get(pod["namespace"], 0.0) + sm_util
            emit("nvidia_process_gpu_sm_util_percent", labels, sm_util)
            emit("nvidia_process_gpu_memory_used_bytes", labels, int(proc_info["memory"]))

        # Whatever the device reports beyond the per-process total is booked
        # on "host"; differences of 0.1 or less are ignored.
        attributed = sum(namespace_sm.values())
        residual = max(device_util - attributed, 0.0)
        if residual > 0.1:
            namespace_sm["host"] = namespace_sm.get("host", 0.0) + residual

        for namespace, value in sorted(namespace_sm.items()):
            labels = {**base, "namespace": namespace, "pod": "__namespace_total__"}
            emit("nvidia_namespace_gpu_sm_util_percent", labels, round(value, 3))

    lines = []
    for metric, help_text in families:
        lines.append(f"# HELP {metric} {help_text}")
        lines.append(f"# TYPE {metric} gauge")
        lines.extend(samples[metric])
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
class MetricsHandler(BaseHTTPRequestHandler):
    """HTTP handler that serves the cached metrics page on ``/metrics`` and ``/``."""

    def do_GET(self):
        """Answer a scrape, refreshing the cached body when it has expired.

        Any other path gets an empty 404. When collection fails with one of
        the handled error types, the body is replaced by a single
        ``nvidia_process_exporter_up`` sample with value 0 whose ``error``
        label names the exception class; that body is cached like a
        successful one.
        """
        # Compare the path only: a scraper may append a query string, and
        # "/metrics?x=y" must not be answered with 404.
        route = self.path.partition("?")[0]
        if route not in ("/metrics", "/"):
            self.send_response(404)
            self.end_headers()
            return

        now = time.time()
        if now - metric_cache["loaded_at"] >= METRIC_CACHE_TTL:
            try:
                metric_cache["body"] = collect_metrics()
            except (
                NVMLError,
                OSError,
                subprocess.SubprocessError,
                urllib.error.URLError,
                # Malformed JSON from the API server (JSONDecodeError) is a
                # ValueError; report it as a failed scrape instead of letting
                # it drop the connection.
                ValueError,
            ) as exc:
                metric_cache["body"] = (
                    "# HELP nvidia_process_exporter_up Whether the NVIDIA process exporter scrape succeeded.\n"
                    "# TYPE nvidia_process_exporter_up gauge\n"
                    f'nvidia_process_exporter_up{{node="{label_value(NODE_NAME)}",error="{label_value(type(exc).__name__)}"}} 0\n'
                )
            metric_cache["loaded_at"] = now

        body = metric_cache["body"].encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):
        """Suppress the base class's per-request logging."""
        return
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Listen on every interface and serve scrapes until the process is stopped.
    server = ThreadingHTTPServer(("0.0.0.0", PORT), MetricsHandler)
    server.serve_forever()
|
||||
Loading…
x
Reference in New Issue
Block a user