atlas pods: add namespace plurality by node table

This commit is contained in:
Brad Stein 2025-12-13 03:57:20 -03:00
parent 8a755e0c42
commit f0265d6b94
5 changed files with 104 additions and 55 deletions

View File

@ -1171,6 +1171,20 @@ def build_pods_dashboard():
instant=True,
)
)
panels.append(
table_panel(
10,
"Namespace Plurality by Node",
'topk(1, sum by (namespace,node) (kube_pod_info{pod!=""}) '
'/ ignoring(node) sum by (namespace) (kube_pod_info{pod!=""}))',
{"h": 8, "w": 24, "x": 0, "y": 42},
unit="percent",
transformations=[
{"id": "labelsToFields", "options": {}},
{"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}},
],
)
)
return {
"uid": "atlas-pods",
"title": "Atlas Pods",

View File

@ -1,27 +0,0 @@
# services/keycloak
Keycloak is deployed via raw manifests and backed by the shared Postgres (`postgres-service.postgres.svc.cluster.local:5432`). Create these secrets before applying:
```bash
# DB creds (per-service DB/user in shared Postgres)
kubectl -n sso create secret generic keycloak-db \
--from-literal=username=keycloak \
--from-literal=password='<DB_PASSWORD>' \
--from-literal=database=keycloak
# Admin console creds (maps to KC admin user)
kubectl -n sso create secret generic keycloak-admin \
--from-literal=username=brad@bstein.dev \
--from-literal=password='<ADMIN_PASSWORD>'
```
Apply:
```bash
kubectl apply -k services/keycloak
```
Notes
- Service: `keycloak.sso.svc:80` (Ingress `sso.bstein.dev`, TLS via cert-manager).
- Uses Postgres schema `public`; DB/user should be provisioned in the shared Postgres instance.
- Health endpoints on :9000 are wired for probes.

View File

@ -1,28 +0,0 @@
# services/monitoring
## Grafana admin secret
The Grafana Helm release expects a pre-existing secret named `grafana-admin`
in the `monitoring` namespace. Create or rotate it with:
```bash
kubectl create secret generic grafana-admin \
--namespace monitoring \
--from-literal=admin-user=admin \
--from-literal=admin-password='REPLACE_ME'
```
Update the password whenever you rotate credentials.
## DCGM exporter image
The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions:
```bash
skopeo copy \
--all \
docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \
docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
```
When finished mirroring from the control-plane, you can remove temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear `~/.config/containers/auth.json`.

View File

@ -491,6 +491,51 @@
}
}
]
},
{
"id": 10,
"type": "table",
"title": "Namespace Plurality by Node",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 42
},
"targets": [
{
"expr": "topk(1, sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / ignoring(node) sum by (namespace) (kube_pod_info{pod!=\"\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
}
],
"time": {

View File

@ -500,6 +500,51 @@ data:
}
}
]
},
{
"id": 10,
"type": "table",
"title": "Namespace Plurality by Node",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 42
},
"targets": [
{
"expr": "topk(1, sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / ignoring(node) sum by (namespace) (kube_pod_info{pod!=\"\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
}
],
"time": {