diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index c61a72cf..bf02b442 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -261,7 +261,7 @@ def namespace_ram_raw(scope_var): def namespace_gpu_usage_instant(scope_var): - return gpu_usage_by_namespace(scope_var) + return nvidia_process_gpu_usage_by_namespace(scope_var) def jetson_gpu_util_by_node(): @@ -343,21 +343,51 @@ def namespace_ram_share_expr(scope_var): return namespace_share_expr(namespace_ram_raw(scope_var)) -def current_gpu_claim_count(scope_var): - requests_by_ns = gpu_requests_by_namespace(scope_var) - return f"(count(({requests_by_ns}) > 0) or on() vector(0))" +def nvidia_process_gpu_usage_by_namespace(scope_var): + return ( + "sum by (namespace) (" + f"nvidia_namespace_gpu_sm_util_percent{{{namespace_gpu_selector(scope_var)}}}" + ")" + ) + + +def nvidia_process_gpu_present(): + return "(count(nvidia_gpu_device_utilization_percent) or on() vector(0))" + + +def gpu_capacity_percent(): + process_capacity = "100 * count(nvidia_gpu_device_utilization_percent)" + legacy_capacity = ( + "100 * count(" + f"{gpu_util_by_node()}" + ") unless on() nvidia_gpu_device_utilization_percent" + ) + return f"(({process_capacity}) or ({legacy_capacity}) or on() vector(0))" + + +def unattributed_gpu_usage(): + return ( + 'label_replace((sum(' + f"{gpu_util_by_node()}" + ') or on() vector(0)), "namespace", "unattributed", "", "") ' + f"unless on() ({nvidia_process_gpu_present()} > 0)" + ) def namespace_gpu_share_expr(scope_var): - utilization = ( - f"avg_over_time(({gpu_usage_by_namespace(scope_var)})[$__range:$__interval]) " - f"and on(namespace) (({gpu_requests_by_namespace(scope_var)}) > 0)" + utilization = f"({nvidia_process_gpu_usage_by_namespace(scope_var)}) or ({unattributed_gpu_usage()})" + total = f"(sum({utilization}) or on() vector(0))" + unused = ( + 'label_replace(clamp_min(' + f"{gpu_capacity_percent()} - {total}" + ', 0), 
"namespace", "unused", "", "") ' + f"and on() ({total} > 0)" ) idle = ( 'label_replace(vector(100), "namespace", "idle", "", "") ' - f"and on() ({current_gpu_claim_count(scope_var)} == 0)" + f"and on() ({total} == 0)" ) - return f"({utilization}) or ({idle})" + return f"({utilization}) or ({unused}) or ({idle})" PROBLEM_PODS_EXPR = ( @@ -1814,7 +1844,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = { "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.", "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.", "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.", - "Namespace GPU Utilization": "Measured GPU utilization attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.", + "Namespace GPU Utilization": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity.", "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.", "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.", "Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.", @@ -2831,7 +2861,7 @@ def build_overview(): namespace_gpu_share_expr(gpu_scope), {"h": 9, "w": 8, "x": 8, "y": 23}, links=namespace_scope_links("namespace_scope_gpu"), - description="Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.", + description="Current NVIDIA process-level GPU utilization by namespace. 
Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity.", ) ) panels.append( @@ -5352,20 +5382,20 @@ def build_gpu_dashboard(): namespace_gpu_share_expr(gpu_scope), {"h": 8, "w": 12, "x": 0, "y": 0}, links=namespace_scope_links("namespace_scope_gpu"), - description="Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.", + description="Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity.", ) ) panels.append( timeseries_panel( 2, - "GPU Activity by Reservation", + "GPU Process Util by Namespace", namespace_gpu_usage_instant(gpu_scope), {"h": 8, "w": 12, "x": 12, "y": 0}, unit="percent", legend="{{namespace}}", legend_display="table", legend_placement="right", - description="Node/device GPU activity attributed by each namespace's GPU reservation on that node.", + description="NVML process-level SM utilization by namespace. 
Host covers GPU work outside Kubernetes pods.", ) ) panels.append( diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index 370fccba..9b641ab0 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -155,13 +155,12 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr gpu_expr = panels_by_title["Namespace GPU Utilization"]["targets"][0]["expr"] - assert "DCGM_FI_DEV_GPU_UTIL" in gpu_expr + assert "nvidia_namespace_gpu_sm_util_percent" in gpu_expr + assert "nvidia_gpu_device_utilization_percent" in gpu_expr assert "sum by (namespace)" in gpu_expr assert 'namespace", "shared"' not in gpu_expr assert "kube_node_labels" not in gpu_expr - assert "avg_over_time(" in gpu_expr - assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr - assert "and on(namespace)" in gpu_expr + assert 'namespace", "unused"' in gpu_expr assert 'namespace", "idle"' in gpu_expr diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index ac4eaf6c..b5cbb675 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))", + "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), 
\"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() 
vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -71,12 +71,12 @@ "targetBlank": false } ], - "description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU." + "description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity." }, { "id": 2, "type": "timeseries", - "title": "GPU Activity by Reservation", + "title": "GPU Process Util by Namespace", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", + "expr": "sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})", "refId": "A", "legendFormat": 
"{{namespace}}" } @@ -109,7 +109,7 @@ "mode": "multi" } }, - "description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node." + "description": "NVML process-level SM utilization by namespace. Host covers GPU work outside Kubernetes pods." }, { "id": 3, diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 1946e34d..500bed23 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -3728,7 +3728,7 @@ }, "targets": [ { - "expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or 
(label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))", + "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() 
((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -3779,7 +3779,7 @@ "targetBlank": false } ], - "description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU." + "description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity." 
}, { "id": 13, diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 1a8cad4f..21c8716b 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max 
by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))", + "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -80,12 +80,12 @@ data: "targetBlank": false } ], - "description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU." + "description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity." 
}, { "id": 2, "type": "timeseries", - "title": "GPU Activity by Reservation", + "title": "GPU Process Util by Namespace", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", + "expr": "sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})", "refId": "A", "legendFormat": "{{namespace}}" } @@ -118,7 +118,7 @@ data: "mode": "multi" } }, - "description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node." + "description": "NVML process-level SM utilization by namespace. Host covers GPU work outside Kubernetes pods." 
}, { "id": 3, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index c15226b5..add4b664 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -3737,7 +3737,7 @@ data: }, "targets": [ { - "expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * 
on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))", + "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -3788,7 +3788,7 @@ data: "targetBlank": false } ], - "description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU." + "description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity." 
}, { "id": 13, diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 8e1c33bd..16df6f3f 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -19,6 +19,7 @@ resources: - grafana-dashboard-testing.yaml - vmalert-atlas-availability.yaml - dcgm-exporter.yaml + - nvidia-process-exporter.yaml - jetson-tegrastats-exporter.yaml - postmark-exporter-service.yaml - postmark-exporter-deployment.yaml @@ -46,6 +47,12 @@ configMapGenerator: - exporter.py=scripts/jetson_tegrastats_exporter.py options: disableNameSuffixHash: true + - name: nvidia-process-exporter-script + namespace: monitoring + files: + - exporter.py=scripts/nvidia_process_exporter.py + options: + disableNameSuffixHash: true - name: monitoring-vault-entrypoint namespace: monitoring files: diff --git a/services/monitoring/nvidia-process-exporter.yaml b/services/monitoring/nvidia-process-exporter.yaml new file mode 100644 index 00000000..2a38b4ef --- /dev/null +++ b/services/monitoring/nvidia-process-exporter.yaml @@ -0,0 +1,136 @@ +# services/monitoring/nvidia-process-exporter.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvidia-process-exporter + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvidia-process-exporter +rules: + - apiGroups: [""] + resources: + - pods + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: nvidia-process-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: nvidia-process-exporter +subjects: + - kind: ServiceAccount + name: nvidia-process-exporter + namespace: monitoring +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-process-exporter + namespace: monitoring + labels: + app: nvidia-process-exporter +spec: + selector: + matchLabels: + app: nvidia-process-exporter + updateStrategy: + rollingUpdate: + 
maxUnavailable: 1 + template: + metadata: + labels: + app: nvidia-process-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9401" + spec: + serviceAccountName: nvidia-process-exporter + imagePullSecrets: + - name: harbor-regcred + runtimeClassName: nvidia + hostPID: true + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: jetson + operator: NotIn + values: + - "true" + tolerations: + - operator: Exists + containers: + - name: exporter + image: python:3.12-slim + imagePullPolicy: IfNotPresent + ports: + - name: metrics + containerPort: 9401 + env: + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: all + - name: NVIDIA_PROCESS_EXPORTER_PORT + value: "9401" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - sh + - -lc + - | + pip install --no-cache-dir nvidia-ml-py==13.595.45 + exec python /etc/nvidia-process-exporter/exporter.py + securityContext: + privileged: true + resources: + requests: + cpu: 50m + memory: 96Mi + limits: + cpu: 250m + memory: 256Mi + volumeMounts: + - name: script + mountPath: /etc/nvidia-process-exporter + readOnly: true + - name: host-proc + mountPath: /host/proc + readOnly: true + volumes: + - name: script + configMap: + name: nvidia-process-exporter-script + defaultMode: 0555 + - name: host-proc + hostPath: + path: /proc + type: Directory +--- +apiVersion: v1 +kind: Service +metadata: + name: nvidia-process-exporter + namespace: monitoring + labels: + app: nvidia-process-exporter +spec: + selector: + app: nvidia-process-exporter + ports: + - name: metrics + port: 9401 + targetPort: metrics diff --git a/services/monitoring/scripts/nvidia_process_exporter.py b/services/monitoring/scripts/nvidia_process_exporter.py new file mode 100644 index 00000000..a43b2b45 --- /dev/null +++ 
#!/usr/bin/env python3
# services/monitoring/scripts/nvidia_process_exporter.py
"""Prometheus exporter for per-process NVIDIA GPU usage on one node.

For every GPU visible through NVML the exporter reports device utilization,
per-process SM utilization and GPU memory, and a per-namespace SM utilization
total. Processes are mapped to Kubernetes pods by extracting the pod UID from
the process cgroup path and looking it up in the list of pods scheduled on
``NODE_NAME`` (fetched from the Kubernetes API with the in-cluster service
account). Processes without a pod cgroup are reported as ``host``.
"""
import json
import os
import re
import ssl
import subprocess
import time
import urllib.error
import urllib.parse
import urllib.request
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer

from pynvml import (
    NVMLError,
    NVMLError_NotFound,
    NVMLError_NotSupported,
    nvmlDeviceGetComputeRunningProcesses_v3,
    nvmlDeviceGetCount,
    nvmlDeviceGetGraphicsRunningProcesses_v3,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetName,
    nvmlDeviceGetProcessUtilization,
    nvmlDeviceGetUUID,
    nvmlDeviceGetUtilizationRates,
    nvmlInit,
)

NODE_NAME = os.environ.get("NODE_NAME", "")
PORT = int(os.environ.get("NVIDIA_PROCESS_EXPORTER_PORT", "9401"))
PROC_ROOT = os.environ.get("HOST_PROC", "/host/proc")
SAMPLE_WINDOW_MS = int(os.environ.get("NVML_PROCESS_SAMPLE_WINDOW_MS", "30000"))
POD_CACHE_TTL = int(os.environ.get("POD_CACHE_TTL_SECONDS", "30"))
METRIC_CACHE_TTL = int(os.environ.get("METRIC_CACHE_TTL_SECONDS", "5"))
TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"

# Matches the pod UID embedded in a cgroup path; the character class accepts
# both dash- and underscore-separated UIDs.
POD_UID_RE = re.compile(r"pod([0-9a-fA-F_-]{32,36})")
SAFE_LABEL_RE = re.compile(r"[^a-zA-Z0-9_:]")

# Module-level caches shared by all request threads.
pod_cache = {"loaded_at": 0.0, "pods": {}}
metric_cache = {"loaded_at": 0.0, "body": ""}


def label_value(value) -> str:
    """Return *value* as a string escaped for use inside a quoted label value."""
    return str(value).replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')


def metric_line(name: str, labels: dict, value) -> str:
    """Format one sample line; labels are emitted in sorted key order."""
    label_text = ",".join(f'{key}="{label_value(val)}"' for key, val in sorted(labels.items()))
    return f"{name}{{{label_text}}} {value}"


def uid_key(value: str) -> str:
    """Normalize a pod UID to lowercase hex digits only, dropping separators."""
    return re.sub(r"[^0-9a-f]", "", value.lower())


def process_name(pid) -> str:
    """Return the process name from ``comm``, or ``"unknown"`` if unreadable.

    The mounted host proc tree (``PROC_ROOT``) is tried first, then ``/proc``.
    """
    for path in (f"{PROC_ROOT}/{pid}/comm", f"/proc/{pid}/comm"):
        try:
            with open(path, encoding="utf-8") as handle:
                name = handle.read().strip()
            if name:
                return name
        except OSError:
            # Process may have exited or the path may not be mounted; try the next one.
            pass
    return "unknown"


def process_cgroup(pid) -> str:
    """Return the raw cgroup file contents for *pid*, or ``""`` if unreadable."""
    for path in (f"{PROC_ROOT}/{pid}/cgroup", f"/proc/{pid}/cgroup"):
        try:
            with open(path, encoding="utf-8") as handle:
                return handle.read()
        except OSError:
            pass
    return ""


def load_pods() -> dict:
    """Return ``{normalized pod uid: {"namespace", "pod"}}`` for pods on this node.

    Results are cached for ``POD_CACHE_TTL`` seconds. Outside a cluster (no
    ``KUBERNETES_SERVICE_HOST``) or without ``NODE_NAME`` an empty mapping is
    returned and nothing is cached.

    Raises:
        OSError: reading the service-account token or calling the API failed
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        ValueError: the API response was not valid JSON.
    """
    now = time.time()
    if now - pod_cache["loaded_at"] < POD_CACHE_TTL:
        return pod_cache["pods"]

    host = os.environ.get("KUBERNETES_SERVICE_HOST")
    port = os.environ.get("KUBERNETES_SERVICE_PORT", "443")
    if not host or not NODE_NAME:
        return {}

    # The token is re-read on every refresh rather than cached.
    with open(TOKEN_PATH, encoding="utf-8") as handle:
        token = handle.read().strip()

    selector = urllib.parse.quote(f"spec.nodeName={NODE_NAME}", safe="")
    url = f"https://{host}:{port}/api/v1/pods?fieldSelector={selector}"
    request = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
    context = ssl.create_default_context(cafile=CA_PATH)
    with urllib.request.urlopen(request, context=context, timeout=10) as response:
        payload = json.load(response)

    pods = {}
    for item in payload.get("items", []):
        metadata = item.get("metadata", {})
        uid = metadata.get("uid", "")
        if not uid:
            continue
        pods[uid_key(uid)] = {
            "namespace": metadata.get("namespace", "unknown"),
            "pod": metadata.get("name", "unknown"),
        }

    pod_cache["loaded_at"] = now
    pod_cache["pods"] = pods
    return pods


def pod_for_pid(pid, pods: dict) -> dict:
    """Map *pid* to its pod via the pod UID in the process cgroup path.

    Returns ``host``/``host`` when the cgroup carries no pod UID, and
    ``unknown``/``unknown`` when the UID is not present in *pods*.
    """
    cgroup = process_cgroup(pid)
    match = POD_UID_RE.search(cgroup)
    if not match:
        return {"namespace": "host", "pod": "host"}
    return pods.get(uid_key(match.group(1)), {"namespace": "unknown", "pod": "unknown"})


def running_process_memory(handle) -> dict:
    """Return ``{pid: {"memory": bytes, "types": {"compute", "graphics"}}}``.

    Memory from the compute and graphics process lists is summed per PID. A
    list that NVML reports as not found / not supported is skipped.
    """
    processes = {}
    getters = (
        ("compute", nvmlDeviceGetComputeRunningProcesses_v3),
        ("graphics", nvmlDeviceGetGraphicsRunningProcesses_v3),
    )
    for proc_type, getter in getters:
        try:
            for proc in getter(handle):
                entry = processes.setdefault(int(proc.pid), {"memory": 0, "types": set()})
                # usedGpuMemory may be None; count it as 0.
                entry["memory"] += int(proc.usedGpuMemory or 0)
                entry["types"].add(proc_type)
        except (NVMLError_NotFound, NVMLError_NotSupported):
            continue
    return processes


def process_utilization_samples(handle):
    """Return ``(samples_by_pid, supported)`` for the last ``SAMPLE_WINDOW_MS``.

    ``samples_by_pid`` keeps only the newest sample per PID. ``supported`` is
    0 when NVML reports process utilization as unsupported for the device and
    1 otherwise (including when no samples were found).
    """
    # The NVML API reference documents sample timestamps, and the
    # lastSeenTimeStamp filter compared against them, as CPU time in
    # microseconds. A millisecond epoch would sit far in the past and make
    # the window a no-op, so convert the window to microseconds.
    since_us = int(time.time() * 1_000_000) - SAMPLE_WINDOW_MS * 1000
    try:
        samples = nvmlDeviceGetProcessUtilization(handle, since_us)
    except NVMLError_NotFound:
        return {}, 1
    except NVMLError_NotSupported:
        return {}, 0

    by_pid = {}
    for sample in samples:
        pid = int(sample.pid)
        current = by_pid.get(pid)
        if current is None or sample.timeStamp >= current["timestamp"]:
            by_pid[pid] = {
                "timestamp": int(sample.timeStamp),
                "sm": int(sample.smUtil),
                "memory": int(sample.memUtil),
                "enc": int(sample.encUtil),
                "dec": int(sample.decUtil),
            }
    return by_pid, 1


def collect_metrics() -> str:
    """Collect all GPU metrics and return them as one exposition-format body.

    Samples are grouped per metric family (HELP, TYPE, then every sample of
    that family), as the Prometheus text format requires. NVML and pod lookup
    errors propagate to the caller.
    """
    nvmlInit()
    pods = load_pods()

    # family name -> output lines; dict insertion order is the emission order.
    families = {
        "nvidia_gpu_device_utilization_percent": [
            "# HELP nvidia_gpu_device_utilization_percent Current NVML device GPU utilization.",
            "# TYPE nvidia_gpu_device_utilization_percent gauge",
        ],
        "nvidia_process_gpu_sm_util_percent": [
            "# HELP nvidia_process_gpu_sm_util_percent Recent per-process SM utilization from NVML.",
            "# TYPE nvidia_process_gpu_sm_util_percent gauge",
        ],
        "nvidia_process_gpu_memory_used_bytes": [
            "# HELP nvidia_process_gpu_memory_used_bytes GPU memory held by a process.",
            "# TYPE nvidia_process_gpu_memory_used_bytes gauge",
        ],
        "nvidia_namespace_gpu_sm_util_percent": [
            "# HELP nvidia_namespace_gpu_sm_util_percent GPU SM utilization attributed to namespace, with host/unattributed residual included.",
            "# TYPE nvidia_namespace_gpu_sm_util_percent gauge",
        ],
        "nvidia_gpu_process_utilization_supported": [
            "# HELP nvidia_gpu_process_utilization_supported Whether NVML process utilization samples are available for the device.",
            "# TYPE nvidia_gpu_process_utilization_supported gauge",
        ],
        "nvidia_process_exporter_up": [
            "# HELP nvidia_process_exporter_up Whether the NVIDIA process exporter scrape succeeded.",
            "# TYPE nvidia_process_exporter_up gauge",
        ],
    }

    def emit(name, labels, value):
        families[name].append(metric_line(name, labels, value))

    for gpu_index in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(gpu_index)
        uuid = nvmlDeviceGetUUID(handle)
        name = nvmlDeviceGetName(handle)
        device_util = float(nvmlDeviceGetUtilizationRates(handle).gpu)
        base = {"node": NODE_NAME, "gpu": gpu_index, "uuid": uuid, "model": name}
        emit("nvidia_gpu_device_utilization_percent", base, device_util)

        memory_by_pid = running_process_memory(handle)
        util_by_pid, supported = process_utilization_samples(handle)
        emit("nvidia_gpu_process_utilization_supported", base, supported)

        namespace_sm = {}
        # A PID may appear in only one of the two sources; report it either way.
        for pid in sorted(set(memory_by_pid) | set(util_by_pid)):
            proc_info = memory_by_pid.get(pid, {"memory": 0, "types": set()})
            util_info = util_by_pid.get(pid, {"sm": 0, "memory": 0, "enc": 0, "dec": 0})
            pod = pod_for_pid(pid, pods)
            labels = {
                **base,
                "namespace": pod["namespace"],
                "pod": pod["pod"],
                "pid": pid,
                "process": process_name(pid),
                "type": "+".join(sorted(proc_info["types"])) or "unknown",
            }
            sm_util = float(util_info["sm"])
            namespace_sm[pod["namespace"]] = namespace_sm.get(pod["namespace"], 0.0) + sm_util
            emit("nvidia_process_gpu_sm_util_percent", labels, sm_util)
            emit("nvidia_process_gpu_memory_used_bytes", labels, int(proc_info["memory"]))

        # Device utilization not explained by any sampled process is charged
        # to "host"; residuals of 0.1 or less are ignored.
        attributed = sum(namespace_sm.values())
        residual = max(device_util - attributed, 0.0)
        if residual > 0.1:
            namespace_sm["host"] = namespace_sm.get("host", 0.0) + residual

        for namespace, value in sorted(namespace_sm.items()):
            labels = {**base, "namespace": namespace, "pod": "__namespace_total__"}
            emit("nvidia_namespace_gpu_sm_util_percent", labels, round(value, 3))

    # Healthy counterpart of the 0 sample the HTTP handler emits on failure.
    emit("nvidia_process_exporter_up", {"node": NODE_NAME}, 1)

    return "\n".join(line for lines in families.values() for line in lines) + "\n"


class MetricsHandler(BaseHTTPRequestHandler):
    """Serve the cached metrics body on ``/metrics`` and ``/``; 404 otherwise."""

    def do_GET(self):
        """Refresh the metrics cache if it is stale, then write it to the client."""
        if self.path not in ("/metrics", "/"):
            self.send_response(404)
            self.end_headers()
            return
        now = time.time()
        if now - metric_cache["loaded_at"] >= METRIC_CACHE_TTL:
            try:
                metric_cache["body"] = collect_metrics()
            except (
                NVMLError,
                OSError,
                ValueError,  # e.g. json.JSONDecodeError from a malformed API response
                subprocess.SubprocessError,
                urllib.error.URLError,
            ) as exc:
                # Still answer 200 so the scrape records the failure as a sample.
                metric_cache["body"] = (
                    "# HELP nvidia_process_exporter_up Whether the NVIDIA process exporter scrape succeeded.\n"
                    "# TYPE nvidia_process_exporter_up gauge\n"
                    f'nvidia_process_exporter_up{{node="{label_value(NODE_NAME)}",error="{label_value(type(exc).__name__)}"}} 0\n'
                )
            # Failures are cached too, so a broken backend is retried at most
            # once per METRIC_CACHE_TTL.
            metric_cache["loaded_at"] = now
        body = metric_cache["body"].encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):
        """Suppress the default per-request access log."""
        return


if __name__ == "__main__":
    ThreadingHTTPServer(("0.0.0.0", PORT), MetricsHandler).serve_forever()