monitoring: fix jetson gpu metrics
This commit is contained in:
parent
dedf566993
commit
62a423f32c
@ -221,6 +221,13 @@ def jetson_gpu_util_by_node():
|
|||||||
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
|
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
|
||||||
|
|
||||||
|
|
||||||
|
def jetson_gpu_util_by_hostname():
|
||||||
|
return (
|
||||||
|
'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), '
|
||||||
|
'"Hostname", "$1", "node", "(.*)")'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def jetson_gpu_requests(scope_var):
|
def jetson_gpu_requests(scope_var):
|
||||||
return (
|
return (
|
||||||
"sum by (namespace,node) ("
|
"sum by (namespace,node) ("
|
||||||
@ -2688,7 +2695,7 @@ def build_gpu_dashboard():
|
|||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
3,
|
3,
|
||||||
"GPU Util by Node",
|
"GPU Util by Node",
|
||||||
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
|
f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})',
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 8},
|
{"h": 8, "w": 12, "x": 0, "y": 8},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{Hostname}}",
|
legend="{{Hostname}}",
|
||||||
|
|||||||
@ -126,7 +126,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
|
"expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{Hostname}}"
|
"legendFormat": "{{Hostname}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -135,7 +135,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
|
"expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{Hostname}}"
|
"legendFormat": "{{Hostname}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -17,7 +17,7 @@ spec:
|
|||||||
annotations:
|
annotations:
|
||||||
prometheus.io/scrape: "true"
|
prometheus.io/scrape: "true"
|
||||||
prometheus.io/port: "9100"
|
prometheus.io/port: "9100"
|
||||||
monitoring.bstein.dev/restart-rev: "1"
|
monitoring.bstein.dev/restart-rev: "2"
|
||||||
spec:
|
spec:
|
||||||
serviceAccountName: default
|
serviceAccountName: default
|
||||||
hostPID: true
|
hostPID: true
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import re
|
|||||||
import socketserver
|
import socketserver
|
||||||
import subprocess
|
import subprocess
|
||||||
import threading
|
import threading
|
||||||
from time import time
|
from time import sleep, time
|
||||||
|
|
||||||
PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
|
PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
|
||||||
NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
|
NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
|
||||||
@ -20,6 +20,7 @@ METRICS = {
|
|||||||
LOCK = threading.Lock()
|
LOCK = threading.Lock()
|
||||||
|
|
||||||
def parse_line(line: str):
|
def parse_line(line: str):
|
||||||
|
line = line.strip()
|
||||||
updates = {}
|
updates = {}
|
||||||
m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
|
m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
|
||||||
if m:
|
if m:
|
||||||
@ -34,7 +35,7 @@ def parse_line(line: str):
|
|||||||
if m:
|
if m:
|
||||||
updates["ram_used_mb"] = float(m.group(1))
|
updates["ram_used_mb"] = float(m.group(1))
|
||||||
updates["ram_total_mb"] = float(m.group(2))
|
updates["ram_total_mb"] = float(m.group(2))
|
||||||
m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line)
|
m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line)
|
||||||
if m:
|
if m:
|
||||||
updates["power_5v_in_mw"] = float(m.group(1))
|
updates["power_5v_in_mw"] = float(m.group(1))
|
||||||
with LOCK:
|
with LOCK:
|
||||||
@ -42,15 +43,23 @@ def parse_line(line: str):
|
|||||||
METRICS["last_scrape_ts"] = time()
|
METRICS["last_scrape_ts"] = time()
|
||||||
|
|
||||||
def run_tegrastats():
|
def run_tegrastats():
|
||||||
proc = subprocess.Popen(
|
logfile = "/tmp/tegrastats.log"
|
||||||
["/host/usr/bin/tegrastats", "--interval", "1000"],
|
subprocess.Popen(
|
||||||
stdout=subprocess.PIPE,
|
["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile],
|
||||||
|
stdout=subprocess.DEVNULL,
|
||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
text=True,
|
text=True,
|
||||||
bufsize=1,
|
|
||||||
)
|
)
|
||||||
for line in proc.stdout:
|
while not os.path.exists(logfile):
|
||||||
parse_line(line)
|
sleep(0.1)
|
||||||
|
with open(logfile, "r", encoding="utf-8", errors="ignore") as handle:
|
||||||
|
handle.seek(0, os.SEEK_END)
|
||||||
|
while True:
|
||||||
|
line = handle.readline()
|
||||||
|
if not line:
|
||||||
|
sleep(0.2)
|
||||||
|
continue
|
||||||
|
parse_line(line)
|
||||||
|
|
||||||
class Handler(http.server.BaseHTTPRequestHandler):
|
class Handler(http.server.BaseHTTPRequestHandler):
|
||||||
def do_GET(self):
|
def do_GET(self):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user