atlas pods: add namespace plurality by node table
This commit is contained in:
parent
8a755e0c42
commit
f0265d6b94
@ -1171,6 +1171,20 @@ def build_pods_dashboard():
|
|||||||
instant=True,
|
instant=True,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
panels.append(
|
||||||
|
table_panel(
|
||||||
|
10,
|
||||||
|
"Namespace Plurality by Node",
|
||||||
|
'topk(1, sum by (namespace,node) (kube_pod_info{pod!=""}) '
|
||||||
|
'/ ignoring(node) sum by (namespace) (kube_pod_info{pod!=""}))',
|
||||||
|
{"h": 8, "w": 24, "x": 0, "y": 42},
|
||||||
|
unit="percent",
|
||||||
|
transformations=[
|
||||||
|
{"id": "labelsToFields", "options": {}},
|
||||||
|
{"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"uid": "atlas-pods",
|
"uid": "atlas-pods",
|
||||||
"title": "Atlas Pods",
|
"title": "Atlas Pods",
|
||||||
|
|||||||
@ -1,27 +0,0 @@
|
|||||||
# services/keycloak
|
|
||||||
|
|
||||||
Keycloak is deployed via raw manifests and backed by the shared Postgres (`postgres-service.postgres.svc.cluster.local:5432`). Create these secrets before applying:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# DB creds (per-service DB/user in shared Postgres)
|
|
||||||
kubectl -n sso create secret generic keycloak-db \
|
|
||||||
--from-literal=username=keycloak \
|
|
||||||
--from-literal=password='<DB_PASSWORD>' \
|
|
||||||
--from-literal=database=keycloak
|
|
||||||
|
|
||||||
# Admin console creds (maps to KC admin user)
|
|
||||||
kubectl -n sso create secret generic keycloak-admin \
|
|
||||||
--from-literal=username=brad@bstein.dev \
|
|
||||||
--from-literal=password='<ADMIN_PASSWORD>'
|
|
||||||
```
|
|
||||||
|
|
||||||
Apply:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
kubectl apply -k services/keycloak
|
|
||||||
```
|
|
||||||
|
|
||||||
Notes:
|
|
||||||
- Service: `keycloak.sso.svc:80` (Ingress `sso.bstein.dev`, TLS via cert-manager).
|
|
||||||
- Uses Postgres schema `public`; DB/user should be provisioned in the shared Postgres instance.
|
|
||||||
- Health endpoints on :9000 are wired for probes.
|
|
||||||
@ -1,28 +0,0 @@
|
|||||||
# services/monitoring
|
|
||||||
|
|
||||||
## Grafana admin secret
|
|
||||||
|
|
||||||
The Grafana Helm release expects a pre-existing secret named `grafana-admin`
|
|
||||||
in the `monitoring` namespace. Create or rotate it with:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
kubectl create secret generic grafana-admin \
|
|
||||||
--namespace monitoring \
|
|
||||||
--from-literal=admin-user=admin \
|
|
||||||
--from-literal=admin-password='REPLACE_ME'
|
|
||||||
```
|
|
||||||
|
|
||||||
Update the secret with the new password whenever you rotate the admin credentials.
|
|
||||||
|
|
||||||
## DCGM exporter image
|
|
||||||
|
|
||||||
The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
skopeo copy \
|
|
||||||
--all \
|
|
||||||
docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \
|
|
||||||
docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
|
|
||||||
```
|
|
||||||
|
|
||||||
Once you have finished mirroring from the control-plane, you can remove the temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear the cached registry credentials in `~/.config/containers/auth.json`.
|
|
||||||
@ -491,6 +491,51 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Namespace Plurality by Node",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 42
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "topk(1, sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / ignoring(node) sum by (namespace) (kube_pod_info{pod!=\"\"}))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "labelsToFields",
|
||||||
|
"options": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": {
|
||||||
|
"fields": [
|
||||||
|
"Value"
|
||||||
|
],
|
||||||
|
"order": "desc"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -500,6 +500,51 @@ data:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Namespace Plurality by Node",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 42
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "topk(1, sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / ignoring(node) sum by (namespace) (kube_pod_info{pod!=\"\"}))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "labelsToFields",
|
||||||
|
"options": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": {
|
||||||
|
"fields": [
|
||||||
|
"Value"
|
||||||
|
],
|
||||||
|
"order": "desc"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user