monitoring: fix jetson gpu metrics
This commit is contained in:
parent
dedf566993
commit
62a423f32c
@ -221,6 +221,13 @@ def jetson_gpu_util_by_node():
|
||||
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
|
||||
|
||||
|
||||
def jetson_gpu_util_by_hostname():
    """Return the PromQL for Jetson GPU utilisation labelled by ``Hostname``.

    The query takes the per-node maximum of ``jetson_gr3d_freq_percent`` and
    uses ``label_replace`` to copy each series' ``node`` label into a
    ``Hostname`` label, so the result can be combined with series that are
    grouped by ``Hostname``.
    """
    pieces = (
        'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), ',
        '"Hostname", "$1", "node", "(.*)")',
    )
    return "".join(pieces)
|
||||
|
||||
|
||||
def jetson_gpu_requests(scope_var):
|
||||
return (
|
||||
"sum by (namespace,node) ("
|
||||
@ -2688,7 +2695,7 @@ def build_gpu_dashboard():
|
||||
timeseries_panel(
|
||||
3,
|
||||
"GPU Util by Node",
|
||||
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
|
||||
f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})',
|
||||
{"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
unit="percent",
|
||||
legend="{{Hostname}}",
|
||||
|
||||
@ -126,7 +126,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
|
||||
"expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{Hostname}}"
|
||||
}
|
||||
|
||||
@ -135,7 +135,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
|
||||
"expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{Hostname}}"
|
||||
}
|
||||
|
||||
@ -17,7 +17,7 @@ spec:
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9100"
|
||||
monitoring.bstein.dev/restart-rev: "1"
|
||||
monitoring.bstein.dev/restart-rev: "2"
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
hostPID: true
|
||||
|
||||
@ -4,7 +4,7 @@ import re
|
||||
import socketserver
|
||||
import subprocess
|
||||
import threading
|
||||
from time import time
|
||||
from time import sleep, time
|
||||
|
||||
PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
|
||||
NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
|
||||
@ -20,6 +20,7 @@ METRICS = {
|
||||
LOCK = threading.Lock()
|
||||
|
||||
def parse_line(line: str):
|
||||
line = line.strip()
|
||||
updates = {}
|
||||
m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
|
||||
if m:
|
||||
@ -34,7 +35,7 @@ def parse_line(line: str):
|
||||
if m:
|
||||
updates["ram_used_mb"] = float(m.group(1))
|
||||
updates["ram_total_mb"] = float(m.group(2))
|
||||
m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line)
|
||||
m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line)
|
||||
if m:
|
||||
updates["power_5v_in_mw"] = float(m.group(1))
|
||||
with LOCK:
|
||||
@ -42,15 +43,23 @@ def parse_line(line: str):
|
||||
METRICS["last_scrape_ts"] = time()
|
||||
|
||||
def run_tegrastats():
    """Start tegrastats and feed every newly logged line to ``parse_line``.

    tegrastats is launched with ``--logfile`` so its samples are written to a
    file instead of a pipe; this function then follows that file, starting
    from its current end so lines left over from an earlier run are skipped.

    Returns when the tegrastats process has exited and no unread output
    remains (or when it exited before ever creating the log file). Without
    that check both polling loops would spin forever on a dead process.
    """
    logfile = "/tmp/tegrastats.log"
    # Keep the handle: poll() is the only way to notice that tegrastats died,
    # and it also reaps the child so it does not linger as a zombie.
    proc = subprocess.Popen(
        ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )

    # Wait for tegrastats to create the log file, but give up if it exited
    # first (e.g. the binary is missing on this host).
    while not os.path.exists(logfile):
        if proc.poll() is not None:
            return
        sleep(0.1)

    with open(logfile, "r", encoding="utf-8", errors="ignore") as handle:
        # Only samples produced from now on are of interest.
        handle.seek(0, os.SEEK_END)
        while True:
            line = handle.readline()
            if line:
                parse_line(line)
                continue
            # Nothing new to read: stop if the producer is gone, otherwise
            # back off briefly before polling the file again.
            if proc.poll() is not None:
                return
            sleep(0.2)
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user