# titan-iac/services/monitoring/jetson-tegrastats-exporter.yaml
#
# 169 lines
# 4.7 KiB
# YAML

# services/monitoring/jetson-tegrastats-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: jetson-tegrastats-exporter
  namespace: monitoring
  labels:
    app: jetson-tegrastats-exporter
spec:
  selector:
    matchLabels:
      app: jetson-tegrastats-exporter
  template:
    metadata:
      labels:
        app: jetson-tegrastats-exporter
      annotations:
        # Annotation values must be strings, hence the quotes.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
    spec:
      serviceAccountName: default
      # The pod shares the host PID namespace and runs privileged (below).
      hostPID: true
      # Schedule only onto nodes labelled jetson=true ...
      nodeSelector:
        jetson: "true"
      # ... and tolerate every taint on them (Exists with no key).
      tolerations:
        - operator: Exists
      containers:
        - name: exporter
          # Exposes tegrastats output as Prometheus metrics for Jetson
          # devices. The exporter itself is the Python script mounted from
          # the jetson-tegrastats-exporter-script ConfigMap.
          image: python:3.10-slim
          imagePullPolicy: IfNotPresent
          command:
            - python
            - /etc/tegrastats-exporter/exporter.py
          env:
            # Read by exporter.py; keep in sync with containerPort and the
            # prometheus.io/port annotation above.
            - name: JETSON_EXPORTER_PORT
              value: "9100"
          ports:
            - name: metrics
              containerPort: 9100
          securityContext:
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          volumeMounts:
            - name: script
              mountPath: /etc/tegrastats-exporter
              readOnly: true
            # The host's tegrastats binary, exposed at the path the script
            # executes.
            - name: tegrastats-bin
              mountPath: /host/usr/bin/tegrastats
              readOnly: true
      volumes:
        - name: script
          configMap:
            name: jetson-tegrastats-exporter-script
            # Octal file mode (r-xr-xr-x) as read by Kubernetes YAML tooling;
            # must stay an integer, so it is deliberately not quoted.
            defaultMode: 0555
        - name: tegrastats-bin
          hostPath:
            path: /usr/bin/tegrastats
            # Require that the path already exists as a regular file.
            type: File
---
apiVersion: v1
kind: Service
metadata:
  name: jetson-tegrastats-exporter
  namespace: monitoring
  labels:
    app: jetson-tegrastats-exporter
spec:
  # Selects the exporter pods by their app label.
  selector:
    app: jetson-tegrastats-exporter
  ports:
    - name: metrics
      port: 9100
      # Named target: resolves to the container port called "metrics".
      targetPort: metrics
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: jetson-tegrastats-exporter-script
  namespace: monitoring
data:
  # Exporter source, mounted by the DaemonSet under /etc/tegrastats-exporter.
  # This is a literal block scalar: backslashes reach Python verbatim, so
  # regexes and "\n" must be written with single backslashes.
  exporter.py: |
    """Minimal Prometheus exporter that republishes tegrastats output.

    A background thread runs tegrastats and parses each line it prints into
    METRICS; an HTTP server renders METRICS as gauges on GET /metrics.
    """
    import http.server
    import os
    import re
    import socketserver
    import subprocess
    import threading
    from time import time

    PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))

    # Latest parsed values, exported as jetson_<key> gauges.
    METRICS = {
        "gr3d_freq_percent": 0.0,
        "gpu_temp_c": 0.0,
        "cpu_temp_c": 0.0,
        "ram_used_mb": 0.0,
        "ram_total_mb": 0.0,
        "power_5v_in_mw": 0.0,
        "last_scrape_ts": 0.0,
    }
    # Guards METRICS: written by the reader thread, read by HTTP handlers.
    LOCK = threading.Lock()


    def parse_line(line: str):
        """Parse one tegrastats line and merge recognised fields into METRICS.

        Fields absent from the line keep their previous value.
        last_scrape_ts is set to the current time on every call, whether or
        not anything matched.
        """
        updates = {}
        m = re.search(r"GR3D_FREQ\s+(\d+)%", line)
        if m:
            updates["gr3d_freq_percent"] = float(m.group(1))
        m = re.search(r"GPU@(\d+(?:\.\d+)?)C", line)
        if m:
            updates["gpu_temp_c"] = float(m.group(1))
        m = re.search(r"CPU@(\d+(?:\.\d+)?)C", line)
        if m:
            updates["cpu_temp_c"] = float(m.group(1))
        m = re.search(r"RAM\s+(\d+)/(\d+)MB", line)
        if m:
            updates["ram_used_mb"] = float(m.group(1))
            updates["ram_total_mb"] = float(m.group(2))
        # Only the first number of the POM_5V_IN pair is exported.
        m = re.search(r"POM_5V_IN\s+(\d+)/(\d+)", line)
        if m:
            updates["power_5v_in_mw"] = float(m.group(1))
        with LOCK:
            METRICS.update(updates)
            METRICS["last_scrape_ts"] = time()


    def run_tegrastats():
        """Run tegrastats at a 1000 ms interval and parse each output line.

        Returns when the tegrastats process closes its stdout.
        """
        proc = subprocess.Popen(
            ["/host/usr/bin/tegrastats", "--interval", "1000"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # line-buffered, so each sample is parsed as it arrives
        )
        for line in proc.stdout:
            parse_line(line)


    class Handler(http.server.BaseHTTPRequestHandler):
        """Serves the current METRICS snapshot on /metrics; 404 elsewhere."""

        def do_GET(self):
            if self.path != "/metrics":
                self.send_response(404)
                self.end_headers()
                return
            # Copy under the lock so rendering never sees a partial update.
            with LOCK:
                metrics = METRICS.copy()
            out = []
            for k, v in metrics.items():
                out.append(f"# TYPE jetson_{k} gauge")
                out.append(f"jetson_{k} {v}")
            # Encode first so Content-Length counts bytes, not characters.
            body = ("\n".join(out) + "\n").encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/plain; version=0.0.4")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def log_message(self, fmt, *args):
            # Suppress the default per-request logging.
            return


    if __name__ == "__main__":
        # Daemon thread: it must not keep the process alive if the HTTP
        # server stops.
        t = threading.Thread(target=run_tegrastats, daemon=True)
        t.start()
        with socketserver.TCPServer(("", PORT), Handler) as httpd:
            httpd.serve_forever()