monitoring: fix jetson gpu metrics

This commit is contained in:
Brad Stein 2026-01-27 16:19:30 -03:00
parent dedf566993
commit 62a423f32c
5 changed files with 28 additions and 12 deletions

View File

@ -221,6 +221,13 @@ def jetson_gpu_util_by_node():
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
def jetson_gpu_util_by_hostname():
    """Return the PromQL for Jetson GPU utilisation labelled by ``Hostname``.

    The query takes the per-node maximum of ``jetson_gr3d_freq_percent``
    (only series with a non-empty ``node`` label) and uses ``label_replace``
    to copy each series' ``node`` value into a ``Hostname`` label.
    """
    per_node_max = 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
    # "(.*)" captures the whole `node` value; "$1" writes it to `Hostname`.
    relabel_args = '"Hostname", "$1", "node", "(.*)"'
    return "label_replace(" + per_node_max + ", " + relabel_args + ")"
def jetson_gpu_requests(scope_var):
return (
"sum by (namespace,node) ("
@ -2688,7 +2695,7 @@ def build_gpu_dashboard():
timeseries_panel(
3,
"GPU Util by Node",
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})',
{"h": 8, "w": 12, "x": 0, "y": 8},
unit="percent",
legend="{{Hostname}}",

View File

@ -126,7 +126,7 @@
},
"targets": [
{
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
"expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{Hostname}}"
}

View File

@ -135,7 +135,7 @@ data:
},
"targets": [
{
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
"expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{Hostname}}"
}

View File

@ -17,7 +17,7 @@ spec:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
monitoring.bstein.dev/restart-rev: "1"
monitoring.bstein.dev/restart-rev: "2"
spec:
serviceAccountName: default
hostPID: true

View File

@ -4,7 +4,7 @@ import re
import socketserver
import subprocess
import threading
from time import time
from time import sleep, time
PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
@ -20,6 +20,7 @@ METRICS = {
LOCK = threading.Lock()
def parse_line(line: str):
line = line.strip()
updates = {}
m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
if m:
@ -34,7 +35,7 @@ def parse_line(line: str):
if m:
updates["ram_used_mb"] = float(m.group(1))
updates["ram_total_mb"] = float(m.group(2))
m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line)
m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line)
if m:
updates["power_5v_in_mw"] = float(m.group(1))
with LOCK:
@ -42,15 +43,23 @@ def parse_line(line: str):
METRICS["last_scrape_ts"] = time()
def run_tegrastats():
    """Run tegrastats and pass every line it logs to ``parse_line``.

    tegrastats is started with ``--logfile`` and a 1000 ms interval, and the
    log file is then followed ``tail -f`` style: each complete line appended
    to it is handed to ``parse_line``.

    Returns when the tegrastats process has exited and no unread log data is
    left; while the process is alive this function does not return.

    Raises:
        OSError: if the tegrastats binary cannot be started or the log file
            cannot be opened.
    """
    logfile = "/tmp/tegrastats.log"

    # Start from a clean log so a file left behind by an earlier run is not
    # mistaken for the new process's output.
    try:
        os.remove(logfile)
    except FileNotFoundError:
        pass  # nothing to clean up

    # Output goes to the log file, so stdout/stderr are discarded. Keep the
    # handle so the loops below can notice when the process is gone.
    proc = subprocess.Popen(
        ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )

    # Wait for tegrastats to create the log; give up if it exited first
    # instead of polling for a file that will never appear.
    while not os.path.exists(logfile):
        if proc.poll() is not None:
            return
        sleep(0.1)

    # NOTE(review): nothing here trims the log, so it grows for as long as
    # tegrastats keeps writing to it.
    pending = ""
    with open(logfile, "r", encoding="utf-8", errors="ignore") as handle:
        while True:
            chunk = handle.readline()
            if chunk:
                pending += chunk
                # readline() can return a line the writer has not finished
                # yet; only parse once the terminating newline has arrived.
                if pending.endswith("\n"):
                    parse_line(pending)
                    pending = ""
                continue
            # No new data: stop once the producer has exited, otherwise wait
            # for the next sample.
            if proc.poll() is not None:
                return
            sleep(0.2)
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):