monitoring: read jetson stats on demand

Brad Stein 2026-01-27 16:27:45 -03:00
parent 1951291090
commit 246ed6617e
2 changed files with 13 additions and 16 deletions

View File

@@ -17,7 +17,7 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "9100"
-        monitoring.bstein.dev/restart-rev: "3"
+        monitoring.bstein.dev/restart-rev: "4"
     spec:
       serviceAccountName: default
       hostPID: true
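Bumping the restart-rev annotation is the usual trick for forcing a rollout: any pod-template change makes Kubernetes recreate the DaemonSet's pods, which is what picks up the rewritten exporter script below.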

View File

@@ -3,13 +3,12 @@ import os
 import re
 import socketserver
 import subprocess
-import threading
 from time import time

 PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
 NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
 LOGFILE = "/tmp/tegrastats.log"
-METRICS = {
+BASE_METRICS = {
     "gr3d_freq_percent": 0.0,
     "gpu_temp_c": 0.0,
     "cpu_temp_c": 0.0,
@@ -18,9 +17,8 @@ METRICS = {
     "power_5v_in_mw": 0.0,
     "last_scrape_ts": 0.0,
 }

-LOCK = threading.Lock()
-def parse_line(line: str):
+def parse_line(line: str) -> dict:
     line = line.strip()
     updates = {}
     m = re.search(r"GR3D_FREQ\s+(\d+)%", line)
@@ -39,9 +37,7 @@ def parse_line(line: str):
     m = re.search(r"(?:POM_5V_IN|VDD_IN)\s+(\d+)/(\d+)", line)
     if m:
         updates["power_5v_in_mw"] = float(m.group(1))
-    with LOCK:
-        METRICS.update(updates)
-        METRICS["last_scrape_ts"] = time()
+    return updates

 def start_tegrastats():
     subprocess.Popen(
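With the shared METRICS dict and its lock gone, parse_line is now a pure function: it just returns whatever fields it can extract from one tegrastats line, and the caller decides what to merge. A quick sanity check against a made-up Jetson-style sample line (hypothetical values; only the GR3D_FREQ and POM_5V_IN/VDD_IN patterns are visible in this diff, the temperature patterns sit in the unshown context):

    sample = "RAM 2742/3956MB CPU [15%@1479,10%@1479] GR3D_FREQ 42% CPU@34C GPU@33.5C POM_5V_IN 2345/2340"
    print(parse_line(sample))
    # expected to include at least:
    # {"gr3d_freq_percent": 42.0, "power_5v_in_mw": 2345.0}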
@@ -52,19 +48,18 @@ def start_tegrastats():
     )

-def refresh_from_log():
+def read_latest_line() -> str:
     if not os.path.exists(LOGFILE):
-        return
+        return ""
     try:
         with open(LOGFILE, "rb") as handle:
             handle.seek(0, os.SEEK_END)
             size = handle.tell()
             handle.seek(max(size - 4096, 0), os.SEEK_SET)
             tail = handle.read().decode("utf-8", errors="ignore").splitlines()
-        if tail:
-            parse_line(tail[-1])
+        return tail[-1] if tail else ""
     except OSError:
-        return
+        return ""


 class Handler(http.server.BaseHTTPRequestHandler):
     def do_GET(self):
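Tailing only the final 4 KiB keeps the per-scrape read bounded no matter how large /tmp/tegrastats.log grows, and since tegrastats appends one sample per interval, the last complete line is never more than one interval stale. Returning "" on a missing or unreadable log lets the handler fall back to the zeroed BASE_METRICS defaults instead of failing the scrape.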
@@ -72,9 +67,11 @@ class Handler(http.server.BaseHTTPRequestHandler):
             self.send_response(404)
             self.end_headers()
             return
-        refresh_from_log()
-        with LOCK:
-            metrics = METRICS.copy()
+        metrics = BASE_METRICS.copy()
+        line = read_latest_line()
+        if line:
+            metrics.update(parse_line(line))
+        metrics["last_scrape_ts"] = time()
         out = []
         label = f'{{node="{NODE_NAME}"}}'
         for k, v in metrics.items():
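Assuming the loop body (below the shown context) emits one name{label} value line per key, a scrape would return something like this, where jetson-nano-01 stands in for a hypothetical NODE_NAME and the values are illustrative:

    gr3d_freq_percent{node="jetson-nano-01"} 42.0
    gpu_temp_c{node="jetson-nano-01"} 33.5
    cpu_temp_c{node="jetson-nano-01"} 34.0
    power_5v_in_mw{node="jetson-nano-01"} 2345.0
    last_scrape_ts{node="jetson-nano-01"} 1769541465.0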