diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 675fec5..6ad4321 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -221,6 +221,13 @@ def jetson_gpu_util_by_node(): return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' +def jetson_gpu_util_by_hostname(): + return ( + 'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), ' + '"Hostname", "$1", "node", "(.*)")' + ) + + def jetson_gpu_requests(scope_var): return ( "sum by (namespace,node) (" @@ -2688,7 +2695,7 @@ def build_gpu_dashboard(): timeseries_panel( 3, "GPU Util by Node", - 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', + f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})', {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", legend="{{Hostname}}", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 6b76a5c..36ab9e5 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -126,7 +126,7 @@ }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 46b25cd..bb395db 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -135,7 +135,7 @@ data: }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 8584eba..0074394 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "1" + monitoring.bstein.dev/restart-rev: "2" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index c237ec5..3858d96 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -4,7 +4,7 @@ import re import socketserver import subprocess import threading -from time import time +from time import sleep, time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename @@ -20,6 +20,7 @@ METRICS = { LOCK = threading.Lock() def parse_line(line: str): + line = line.strip() updates = {} m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) if m: @@ -34,7 +35,7 @@ def parse_line(line: str): if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) with LOCK: @@ -42,15 +43,23 @@ def parse_line(line: str): METRICS["last_scrape_ts"] = time() def run_tegrastats(): - proc = subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000"], - stdout=subprocess.PIPE, + logfile = "/tmp/tegrastats.log" + subprocess.Popen( + ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile], + stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, text=True, - bufsize=1, ) - for line in proc.stdout: - parse_line(line) + while not os.path.exists(logfile): + sleep(0.1) + with open(logfile, "r", encoding="utf-8", errors="ignore") as handle: + handle.seek(0, os.SEEK_END) + while True: + line = handle.readline() + if not line: + sleep(0.2) + continue + parse_line(line) class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self):