Compare commits

..

No commits in common. "feature/atlasbot" and "main" have entirely different histories.

76 changed files with 199 additions and 6990 deletions

1
.gitignore vendored
View File

@ -2,7 +2,6 @@
!README.md
!knowledge/**/*.md
!services/comms/knowledge/**/*.md
!services/atlasbot/knowledge/**/*.md
__pycache__/
*.py[cod]
.pytest_cache

View File

@ -1,26 +0,0 @@
# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/atlasbot

View File

@ -1,17 +0,0 @@
# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: atlasbot
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/atlasbot
targetNamespace: ai
timeout: 2m
dependsOn:
- name: ai-llm

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/atlasbot
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update"
push:
branch: feature/atlasbot
branch: feature/ariadne
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -1,26 +0,0 @@
# clusters/atlas/flux-system/applications/comms/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: comms
namespace: comms
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(comms): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/comms

View File

@ -6,9 +6,6 @@ resources:
- vault/kustomization.yaml
- vaultwarden/kustomization.yaml
- comms/kustomization.yaml
- comms/image-automation.yaml
- atlasbot/kustomization.yaml
- atlasbot/image-automation.yaml
- crypto/kustomization.yaml
- monerod/kustomization.yaml
- pegasus/kustomization.yaml

View File

@ -9,7 +9,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: feature/atlasbot
branch: feature/ariadne
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -16,6 +16,5 @@ resources:
- longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml
- postgres/kustomization.yaml
- nats/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml
- ../platform/vault-injector/kustomization.yaml

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/atlasbot
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: feature/atlasbot
branch: feature/ariadne
update:
strategy: Setters
path: services/maintenance

View File

@ -1,21 +0,0 @@
# clusters/atlas/flux-system/platform/nats/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: nats
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/nats
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: nats
healthChecks:
- apiVersion: apps/v1
kind: StatefulSet
name: nats
namespace: nats
wait: true

View File

@ -1,3 +0,0 @@
FROM python:3.11-slim
RUN pip install --no-cache-dir psycopg2-binary bcrypt

View File

@ -6,7 +6,6 @@ resources:
- ../modules/profiles/atlas-ha
- coredns-custom.yaml
- coredns-deployment.yaml
- longhorn-node-taints.yaml
- ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -1,40 +0,0 @@
# infrastructure/core/longhorn-node-taints.yaml
apiVersion: v1
kind: Node
metadata:
name: titan-13
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-15
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-17
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-19
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule

View File

@ -1,10 +0,0 @@
# infrastructure/longhorn/core/backup-target.yaml
apiVersion: longhorn.io/v1beta2
kind: BackupTarget
metadata:
name: default
namespace: longhorn-system
spec:
backupTargetURL: "s3://atlas-soteria@us-west-004/"
credentialSecret: longhorn-backup-b2
pollInterval: 5m0s

View File

@ -6,7 +6,6 @@ resources:
- vault-serviceaccount.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- backup-target.yaml
- helmrelease.yaml
- longhorn-settings-ensure-job.yaml

View File

@ -13,27 +13,9 @@ spec:
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
- objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ACCESS_KEY_ID"
- objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_SECRET_ACCESS_KEY"
- objectName: "longhorn_backup__AWS_ENDPOINTS"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ENDPOINTS"
secretObjects:
- secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson
- secretName: longhorn-backup-b2
type: Opaque
data:
- objectName: longhorn_backup__AWS_ACCESS_KEY_ID
key: AWS_ACCESS_KEY_ID
- objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
- objectName: longhorn_backup__AWS_ENDPOINTS
key: AWS_ENDPOINTS

View File

@ -1,17 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: nats-config
namespace: nats
labels:
app: nats
component: config
annotations:
description: "NATS JetStream configuration"
data:
nats.conf: |
jetstream {
store_dir: /data
max_mem_store: 128MB
max_file_store: 1GB
}

View File

@ -1,7 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- configmap.yaml
- service.yaml
- statefulset.yaml

View File

@ -1,4 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
name: nats

View File

@ -1,17 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: nats
namespace: nats
labels:
app: nats
spec:
selector:
app: nats
ports:
- name: client
port: 4222
targetPort: 4222
- name: monitoring
port: 8222
targetPort: 8222

View File

@ -1,54 +0,0 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: nats
namespace: nats
labels:
app: nats
spec:
serviceName: nats
replicas: 1
selector:
matchLabels:
app: nats
template:
metadata:
labels:
app: nats
spec:
containers:
- name: nats
image: nats:2.10.18
args:
- "-c"
- "/etc/nats/nats.conf"
ports:
- name: client
containerPort: 4222
- name: monitoring
containerPort: 8222
volumeMounts:
- name: config
mountPath: /etc/nats
- name: data
mountPath: /data
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: config
configMap:
name: nats-config
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 2Gi

View File

@ -47,7 +47,6 @@ PERCENT_THRESHOLDS = {
}
NAMESPACE_CPU_WINDOW = "1m"
GPU_RESOURCE_REGEX = r"nvidia[.]com/gpu.*|nvidia_com_gpu.*"
# ---------------------------------------------------------------------------
# Cluster metadata
@ -236,16 +235,13 @@ def gpu_util_by_hostname():
def gpu_node_labels():
return (
f'(max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0))'
' or kube_node_labels{label_jetson="true"}'
)
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
def gpu_requests_by_namespace_node(scope_var):
return (
"sum by (namespace,node) ("
f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
"* on(namespace,pod) group_left(node) kube_pod_info "
f"* on(node) group_left() ({gpu_node_labels()})"
")"
@ -257,7 +253,7 @@ def gpu_usage_by_namespace(scope_var):
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() ({gpu_util_by_node()})"
")"
)

View File

@ -539,9 +539,9 @@ def main() -> int:
help="Write generated files (otherwise just print a summary).",
)
ap.add_argument(
"--sync-atlasbot",
"--sync-comms",
action="store_true",
help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
)
args = ap.parse_args()
@ -632,10 +632,10 @@ def main() -> int:
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
if args.sync_atlasbot:
atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
_sync_tree(out_dir, atlasbot_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
if args.sync_comms:
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
_sync_tree(out_dir, comms_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
return 0

View File

@ -1,26 +0,0 @@
# services/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
name: flux-bot
email: ops@bstein.dev
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
path: services/atlasbot
strategy: Setters

View File

@ -1,23 +0,0 @@
# services/comms/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: atlasbot
namespace: ai
spec:
image: registry.bstein.dev/bstein/atlasbot
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: atlasbot
namespace: ai
spec:
imageRepositoryRef:
name: atlasbot
policy:
semver:
range: ">=0.1.0-0"

View File

@ -1,22 +0,0 @@
Atlas Knowledge Base (KB)
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
- Accurate (grounded in GitOps + read-only cluster tools)
- Maintainable (small docs + deterministic generators)
- Safe (no secrets; refer to Secret/Vault paths by name only)
Layout
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
Regeneration
- Update manifests/docs, then regenerate generated artifacts:
- `python scripts/knowledge_render_atlas.py --write`
Authoring rules
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
- Keep each runbook small; one topic per file; use headings.
- When in doubt, link to the exact file path in this repo that configures the behavior.

View File

@ -1,8 +0,0 @@
{
"counts": {
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,234 +0,0 @@
flowchart LR
host_auth_bstein_dev["auth.bstein.dev"]
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
host_auth_bstein_dev --> svc_sso_oauth2_proxy
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
host_bstein_dev["bstein.dev"]
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
host_bstein_dev --> svc_comms_matrix_wellknown
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
wl_comms_element_call["comms/element-call (Deployment)"]
svc_comms_element_call --> wl_comms_element_call
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
host_ci_bstein_dev["ci.bstein.dev"]
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
host_ci_bstein_dev --> svc_jenkins_jenkins
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
svc_jenkins_jenkins --> wl_jenkins_jenkins
host_cloud_bstein_dev["cloud.bstein.dev"]
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
svc_comms_livekit["comms/livekit (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
host_mail_bstein_dev["mail.bstein.dev"]
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
svc_nextcloud_collabora --> wl_nextcloud_collabora
host_pegasus_bstein_dev["pegasus.bstein.dev"]
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
host_scm_bstein_dev["scm.bstein.dev"]
svc_gitea_gitea["gitea/gitea (Service)"]
host_scm_bstein_dev --> svc_gitea_gitea
wl_gitea_gitea["gitea/gitea (Deployment)"]
svc_gitea_gitea --> wl_gitea_gitea
host_secret_bstein_dev["secret.bstein.dev"]
svc_vault_vault["vault/vault (Service)"]
host_secret_bstein_dev --> svc_vault_vault
wl_vault_vault["vault/vault (StatefulSet)"]
svc_vault_vault --> wl_vault_vault
host_sso_bstein_dev["sso.bstein.dev"]
svc_sso_keycloak["sso/keycloak (Service)"]
host_sso_bstein_dev --> svc_sso_keycloak
wl_sso_keycloak["sso/keycloak (Deployment)"]
svc_sso_keycloak --> wl_sso_keycloak
host_stream_bstein_dev["stream.bstein.dev"]
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
subgraph bstein_dev_home[bstein-dev-home]
svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend
svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend
svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway
end
subgraph comms[comms]
svc_comms_matrix_wellknown
wl_comms_matrix_wellknown
svc_comms_element_call
wl_comms_element_call
svc_comms_livekit_token_service
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
svc_jellyfin_jellyfin
wl_jellyfin_jellyfin
end
subgraph jenkins[jenkins]
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
end
subgraph mailu_mailserver[mailu-mailserver]
svc_mailu_mailserver_mailu_front
end
subgraph nextcloud[nextcloud]
svc_nextcloud_nextcloud
wl_nextcloud_nextcloud
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy
svc_sso_keycloak
wl_sso_keycloak
end
subgraph vault[vault]
svc_vault_vault
wl_vault_vault
end
subgraph vaultwarden[vaultwarden]
svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden
end

View File

@ -1,29 +0,0 @@
# services/atlasbot/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
- atlasbot-deployment.yaml
- atlasbot-service.yaml
- atlasbot-rbac.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- image.yaml
- image-automation.yaml
images:
- name: registry.bstein.dev/bstein/atlasbot
newTag: 0.1.2-97 # {"$imagepolicy": "ai:atlasbot:tag"}
configMapGenerator:
- name: atlasbot-vault-env
files:
- atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
options:
disableNameSuffixHash: true
- name: atlas-kb
files:
- INDEX.md=knowledge/INDEX.md
- atlas.json=knowledge/catalog/atlas.json
- atlas-summary.json=knowledge/catalog/atlas-summary.json
- metrics.json=knowledge/catalog/metrics.json
- runbooks.json=knowledge/catalog/runbooks.json
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -1,44 +0,0 @@
#!/usr/bin/env sh
set -eu
vault_dir="/vault/secrets"
read_secret() {
tr -d '\r\n' < "${vault_dir}/$1"
}
read_optional() {
if [ -f "${vault_dir}/$1" ]; then
tr -d '\r\n' < "${vault_dir}/$1"
else
printf ''
fi
}
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)"
export CHAT_API_KEY="$(read_secret chat-matrix)"
export CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"
export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"
export PGPASSWORD="$(read_secret synapse-db-pass)"
export MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
export MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
export KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"

View File

@ -1,21 +0,0 @@
# services/atlasbot/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
name: atlasbot-vault
namespace: ai
spec:
provider: vault
parameters:
vaultAddress: "http://vault.vault.svc.cluster.local:8200"
roleName: "ai"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: harbor-regcred
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson

View File

@ -1,34 +0,0 @@
# services/atlasbot/vault-sync-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: atlasbot-vault-sync
namespace: ai
spec:
replicas: 1
selector:
matchLabels:
app: atlasbot-vault-sync
template:
metadata:
labels:
app: atlasbot-vault-sync
spec:
serviceAccountName: atlasbot
containers:
- name: sync
image: alpine:3.20
command: ["/bin/sh", "-c"]
args:
- "sleep infinity"
volumeMounts:
- name: vault-secrets
mountPath: /vault/secrets
readOnly: true
volumes:
- name: vault-secrets
csi:
driver: secrets-store.csi.k8s.io
readOnly: true
volumeAttributes:
secretProviderClass: atlasbot-vault

View File

@ -68,11 +68,7 @@ spec:
- name: AI_CHAT_TIMEOUT_SEC
value: "480"
- name: AI_ATLASBOT_ENDPOINT
value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer
- name: AI_ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct-q4_0
- name: AI_ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct
value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
- name: AI_ATLASBOT_TIMEOUT_SEC
value: "30"
- name: AI_NODE_NAME

View File

@ -20,9 +20,9 @@ resources:
- ingress.yaml
images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator:
- name: chat-ai-gateway
namespace: bstein-dev-home

View File

@ -3,7 +3,7 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: atlasbot
namespace: ai
namespace: comms
labels:
app: atlasbot
spec:
@ -18,7 +18,7 @@ spec:
annotations:
checksum/atlasbot-configmap: manual-atlasbot-101
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "ai"
vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
vault.hashicorp.com/agent-inject-template-turn-secret: |
{{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
@ -28,15 +28,6 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -67,17 +58,17 @@ spec:
hardware: rpi5
containers:
- name: atlasbot
image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
image: python:3.11-slim
command: ["/bin/sh","-c"]
args:
- |
. /vault/scripts/atlasbot_vault_env.sh
exec python -m atlasbot.main
. /vault/scripts/comms_vault_env.sh
exec python /app/bot.py
env:
- name: MATRIX_BASE
value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
value: http://othrys-synapse-matrix-synapse:8008
- name: AUTH_BASE
value: http://matrix-authentication-service.comms.svc.cluster.local:8080
value: http://matrix-authentication-service:8080
- name: KB_DIR
value: /kb
- name: VM_URL
@ -85,61 +76,27 @@ spec:
- name: ARIADNE_STATE_URL
value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
- name: BOT_USER
value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
value: atlasbot
- name: BOT_MENTIONS
value: atlas-quick,atlas-smart,atlas-genius
value: atlasbot,aatlasbot,atlas_quick,atlas_smart
- name: OLLAMA_URL
value: http://ollama.ai.svc.cluster.local:11434
- name: OLLAMA_MODEL
value: qwen2.5:14b-instruct-q4_0
value: qwen2.5:14b-instruct
- name: ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_GENIUS
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_DEEP
value: qwen2.5:14b-instruct
- name: OLLAMA_FALLBACK_MODEL
value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_TIMEOUT_SEC
value: "600"
- name: ATLASBOT_THINKING_INTERVAL_SEC
value: "30"
value: "120"
- name: ATLASBOT_SNAPSHOT_TTL_SEC
value: "30"
- name: ATLASBOT_HTTP_PORT
value: "8090"
- name: ATLASBOT_STATE_DB
value: /data/atlasbot_state.db
- name: ATLASBOT_QUEUE_ENABLED
value: "false"
- name: ATLASBOT_DEBUG_PIPELINE
value: "true"
- name: ATLASBOT_NATS_URL
value: nats://nats.nats.svc.cluster.local:4222
- name: ATLASBOT_NATS_STREAM
value: atlasbot
- name: ATLASBOT_NATS_SUBJECT
value: atlasbot.requests
- name: ATLASBOT_FAST_MAX_ANGLES
value: "2"
- name: ATLASBOT_SMART_MAX_ANGLES
value: "5"
- name: ATLASBOT_FAST_MAX_CANDIDATES
value: "2"
- name: ATLASBOT_SMART_MAX_CANDIDATES
value: "6"
- name: ATLASBOT_FAST_LLM_CALLS_MAX
value: "24"
- name: ATLASBOT_SMART_LLM_CALLS_MAX
value: "48"
- name: ATLASBOT_GENIUS_LLM_CALLS_MAX
value: "96"
ports:
- name: http
containerPort: 8090
@ -151,15 +108,19 @@ spec:
cpu: 500m
memory: 512Mi
volumeMounts:
- name: code
mountPath: /app/bot.py
subPath: bot.py
- name: kb
mountPath: /kb
readOnly: true
- name: vault-scripts
mountPath: /vault/scripts
readOnly: true
- name: atlasbot-state
mountPath: /data
volumes:
- name: code
configMap:
name: atlasbot
- name: kb
configMap:
name: atlas-kb
@ -178,7 +139,5 @@ spec:
path: diagrams/atlas-http.mmd
- name: vault-scripts
configMap:
name: atlasbot-vault-env
name: comms-vault-env
defaultMode: 0555
- name: atlasbot-state
emptyDir: {}

View File

@ -3,9 +3,7 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: atlasbot
namespace: ai
imagePullSecrets:
- name: harbor-regcred
namespace: comms
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
@ -45,4 +43,5 @@ roleRef:
subjects:
- kind: ServiceAccount
name: atlasbot
namespace: ai
namespace: comms

View File

@ -2,7 +2,7 @@ apiVersion: v1
kind: Service
metadata:
name: atlasbot
namespace: ai
namespace: comms
labels:
app: atlasbot
spec:

View File

@ -13,7 +13,10 @@ resources:
- element-call-deployment.yaml
- guest-register-deployment.yaml
- guest-register-service.yaml
- atlasbot-deployment.yaml
- atlasbot-service.yaml
- wellknown.yaml
- atlasbot-rbac.yaml
- mas-secrets-ensure-rbac.yaml
- comms-secrets-ensure-rbac.yaml
- mas-db-ensure-rbac.yaml
@ -40,6 +43,7 @@ resources:
- livekit-ingress.yaml
- livekit-middlewares.yaml
- matrix-ingress.yaml
configMapGenerator:
- name: comms-vault-env
files:
@ -56,8 +60,21 @@ configMapGenerator:
- server.py=scripts/guest-register/server.py
options:
disableNameSuffixHash: true
- name: atlasbot
files:
- bot.py=scripts/atlasbot/bot.py
options:
disableNameSuffixHash: true
- name: othrys-element-host-config
files:
- 20-host-config.sh=scripts/element-host-config.sh
options:
disableNameSuffixHash: true
- name: atlas-kb
files:
- INDEX.md=knowledge/INDEX.md
- atlas.json=knowledge/catalog/atlas.json
- atlas-summary.json=knowledge/catalog/atlas-summary.json
- metrics.json=knowledge/catalog/metrics.json
- runbooks.json=knowledge/catalog/runbooks.json
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/comms-secrets-ensure-job.yaml
# One-off job for comms/comms-secrets-ensure-8.
# Purpose: comms secrets ensure 8 (see container args/env in this file).
# One-off job for comms/comms-secrets-ensure-7.
# Purpose: comms secrets ensure 7 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: comms-secrets-ensure-8
name: comms-secrets-ensure-7
namespace: comms
spec:
suspend: true
@ -87,9 +87,6 @@ spec:
ensure_key "comms/synapse-redis" "redis-password" >/dev/null
ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-quick-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-smart-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-genius-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null
SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")"

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/mas-local-users-ensure-job.yaml
# One-off job for comms/mas-local-users-ensure-19.
# One-off job for comms/mas-local-users-ensure-18.
# Purpose: mas local users ensure 18 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: mas-local-users-ensure-19
name: mas-local-users-ensure-18
namespace: comms
spec:
suspend: true
@ -27,12 +27,6 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -98,13 +92,7 @@ spec:
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
value: atlasbot
command:
- /bin/sh
- -c
@ -237,27 +225,11 @@ spec:
},
timeout=30,
)
if r.status_code == 429:
return False
if r.status_code != 200:
raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
return True
wait_for_service(MAS_ADMIN_API_BASE)
token = admin_token()
bot_quick = os.environ.get("BOT_USER_QUICK", "")
bot_smart = os.environ.get("BOT_USER_SMART", "")
bot_genius = os.environ.get("BOT_USER_GENIUS", "")
bot_quick_pass = os.environ.get("BOT_PASS_QUICK", "")
bot_smart_pass = os.environ.get("BOT_PASS_SMART", "")
bot_genius_pass = os.environ.get("BOT_PASS_GENIUS", "") or bot_smart_pass
ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
if bot_quick and bot_quick_pass:
ensure_user(token, bot_quick, bot_quick_pass)
if bot_smart and bot_smart_pass:
ensure_user(token, bot_smart, bot_smart_pass)
if bot_genius and bot_genius_pass:
ensure_user(token, bot_genius, bot_genius_pass)
PY

View File

@ -1,15 +1,15 @@
# services/comms/oneoffs/synapse-admin-ensure-job.yaml
# One-off job for comms/synapse-admin-ensure-15.
# Purpose: synapse admin ensure 15 (see container args/env in this file).
# One-off job for comms/synapse-admin-ensure-3.
# Purpose: synapse admin ensure 3 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: synapse-admin-ensure-15
name: synapse-admin-ensure-3
namespace: comms
spec:
suspend: false
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
@ -32,8 +32,7 @@ spec:
values: ["arm64"]
containers:
- name: ensure
image: python:3.12-slim
imagePullPolicy: Always
image: python:3.11-slim
env:
- name: VAULT_ADDR
value: http://vault.vault.svc.cluster.local:8200
@ -46,20 +45,22 @@ spec:
- -c
- |
set -euo pipefail
python -m pip install --no-cache-dir psycopg2-binary
pip install --no-cache-dir psycopg2-binary bcrypt
python - <<'PY'
import json
import os
import secrets
import string
import time
import urllib.error
import urllib.parse
import urllib.request
import bcrypt
import psycopg2
VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
SYNAPSE_ADMIN_URL = os.environ.get("SYNAPSE_ADMIN_URL", "").rstrip("/")
PGHOST = "postgres-service.postgres.svc.cluster.local"
PGPORT = 5432
PGDATABASE = "synapse"
@ -112,15 +113,48 @@ spec:
with urllib.request.urlopen(req, timeout=30) as resp:
resp.read()
def random_password(length: int = 32) -> str:
alphabet = string.ascii_letters + string.digits
return "".join(secrets.choice(alphabet) for _ in range(length))
def ensure_admin_creds(token: str) -> dict:
data = vault_get(token, "comms/synapse-admin")
username = "othrys-seeder"
if data.get("username") != username:
data["username"] = username
data.pop("access_token", None)
username = (data.get("username") or "").strip() or "synapse-admin"
password = (data.get("password") or "").strip()
if not password:
password = random_password()
data["username"] = username
data["password"] = password
vault_put(token, "comms/synapse-admin", data)
return data
def ensure_user(cur, cols, user_id, password, admin):
now_ms = int(time.time() * 1000)
values = {
"name": user_id,
"password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
"creation_ts": now_ms,
}
def add_flag(name, flag):
if name not in cols:
return
if cols[name]["type"] in ("smallint", "integer"):
values[name] = int(flag)
else:
values[name] = bool(flag)
add_flag("admin", admin)
add_flag("deactivated", False)
add_flag("shadow_banned", False)
add_flag("is_guest", False)
columns = list(values.keys())
placeholders = ", ".join(["%s"] * len(columns))
updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
cur.execute(query, [values[c] for c in columns])
def get_cols(cur):
cur.execute(
"""
@ -138,40 +172,30 @@ spec:
}
return cols
def admin_token_valid(token: str, user_id: str) -> bool:
if not token or not SYNAPSE_ADMIN_URL:
return False
encoded = urllib.parse.quote(user_id, safe="")
url = f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v2/users/{encoded}"
req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
try:
with urllib.request.urlopen(req, timeout=30) as resp:
resp.read()
return True
except urllib.error.HTTPError as exc:
if exc.code == 404:
return True
if exc.code in (401, 403):
return False
raise
def ensure_access_token(cur, user_id, token_value):
cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens")
token_id = cur.fetchone()[0]
cur.execute(
"""
INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms)
VALUES (%s, %s, %s, %s, NULL)
ON CONFLICT (token) DO NOTHING
""",
(token_id, user_id, token_value, "ariadne-admin"),
)
vault_token = vault_login()
admin_data = ensure_admin_creds(vault_token)
user_id = f"@{admin_data['username']}:live.bstein.dev"
existing_token = admin_data.get("access_token")
if existing_token and admin_token_valid(existing_token, user_id):
log("synapse admin token already present and valid")
if admin_data.get("access_token"):
log("synapse admin token already present")
raise SystemExit(0)
if existing_token:
log("synapse admin token invalid; rotating")
admin_data.pop("access_token", None)
vault_put(vault_token, "comms/synapse-admin", admin_data)
synapse_db = vault_get(vault_token, "comms/synapse-db")
pg_password = synapse_db.get("POSTGRES_PASSWORD")
if not pg_password:
raise RuntimeError("synapse db password missing")
user_id = f"@{admin_data['username']}:live.bstein.dev"
conn = psycopg2.connect(
host=PGHOST,
port=PGPORT,
@ -179,34 +203,17 @@ spec:
user=PGUSER,
password=pg_password,
)
token_value = secrets.token_urlsafe(32)
try:
with conn:
with conn.cursor() as cur:
cols = get_cols(cur)
if "admin" not in cols:
raise RuntimeError("users.admin column missing")
cur.execute(
"UPDATE users SET admin = TRUE WHERE name = %s",
(user_id,),
)
cur.execute(
"""
SELECT token FROM access_tokens
WHERE user_id = %s AND valid_until_ms IS NULL
ORDER BY id DESC LIMIT 1
""",
(user_id,),
)
row = cur.fetchone()
if not row:
raise RuntimeError(f"no access token found for {user_id}")
token_value = row[0]
ensure_user(cur, cols, user_id, admin_data["password"], True)
ensure_access_token(cur, user_id, token_value)
finally:
conn.close()
admin_data["access_token"] = token_value
vault_put(vault_token, "comms/synapse-admin", admin_data)
if not admin_token_valid(token_value, user_id):
raise RuntimeError("synapse admin token validation failed")
log("synapse admin token stored")
PY

View File

@ -82,6 +82,8 @@ spec:
value: synapse
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlasbot
command:
- /bin/sh
- -c
@ -139,8 +141,10 @@ spec:
cur.execute(query, [values[c] for c in columns])
seeder_user = os.environ["SEEDER_USER"]
bot_user = os.environ["BOT_USER"]
server = "live.bstein.dev"
seeder_id = f"@{seeder_user}:{server}"
bot_id = f"@{bot_user}:{server}"
conn = psycopg2.connect(
host=os.environ["PGHOST"],
@ -154,6 +158,7 @@ spec:
with conn.cursor() as cur:
cols = get_cols(cur)
upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True)
upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False)
finally:
conn.close()
PY

View File

@ -76,7 +76,7 @@ spec:
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlas-smart
value: atlasbot
command:
- /bin/sh
- -c

View File

@ -11,12 +11,8 @@ from urllib import error, parse, request
BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
BOT_USER = os.environ["BOT_USER"]
BOT_PASS = os.environ["BOT_PASS"]
BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
USER = os.environ["BOT_USER"]
PASSWORD = os.environ["BOT_PASS"]
ROOM_ALIAS = "#othrys:live.bstein.dev"
OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
@ -35,7 +31,7 @@ VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitor
ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
@ -397,31 +393,6 @@ def _detect_mode_from_body(body: str, *, default: str = "deep") -> str:
return default
def _detect_mode(
content: dict[str, Any],
body: str,
*,
default: str = "deep",
account_user: str = "",
) -> str:
mode = _detect_mode_from_body(body, default=default)
mentions = content.get("m.mentions", {})
user_ids = mentions.get("user_ids", [])
if isinstance(user_ids, list):
normalized = {normalize_user_id(uid).lower() for uid in user_ids if isinstance(uid, str)}
if BOT_USER_QUICK and normalize_user_id(BOT_USER_QUICK).lower() in normalized:
return "fast"
if BOT_USER_SMART and normalize_user_id(BOT_USER_SMART).lower() in normalized:
return "deep"
if BOT_USER and normalize_user_id(BOT_USER).lower() in normalized:
return "deep"
if account_user and BOT_USER_QUICK and normalize_user_id(account_user) == normalize_user_id(BOT_USER_QUICK):
return "fast"
if account_user and BOT_USER_SMART and normalize_user_id(account_user) == normalize_user_id(BOT_USER_SMART):
return "deep"
return mode
def _model_for_mode(mode: str) -> str:
if mode == "fast" and MODEL_FAST:
return MODEL_FAST
@ -445,12 +416,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
raw = resp.read()
return json.loads(raw.decode()) if raw else {}
def login(user: str, password: str) -> str:
login_user = normalize_user_id(user)
def login() -> str:
login_user = normalize_user_id(USER)
payload = {
"type": "m.login.password",
"identifier": {"type": "m.id.user", "user": login_user},
"password": password,
"password": PASSWORD,
}
res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
return res["access_token"]
@ -4849,7 +4820,7 @@ def open_ended_with_thinking(
thread.join(timeout=1)
return result["reply"] or "Model backend is busy. Try again in a moment."
def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
def sync_loop(token: str, room_id: str):
since = None
try:
res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
@ -4890,7 +4861,7 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
if not body:
continue
sender = ev.get("sender", "")
if account_user and sender == normalize_user_id(account_user):
if sender == f"@{USER}:live.bstein.dev":
continue
mentioned = is_mentioned(content, body)
@ -4903,12 +4874,7 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
cleaned_body = _strip_bot_mention(body)
lower_body = cleaned_body.lower()
mode = _detect_mode(
content,
body,
default=default_mode if default_mode in ("fast", "deep") else "deep",
account_user=account_user,
)
mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep")
# Only do live cluster introspection in DMs.
allow_tools = is_dm
@ -4985,65 +4951,26 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
history[hist_key].append(f"Atlas: {reply}")
history[hist_key] = history[hist_key][-80:]
def login_with_retry(user: str, password: str):
def login_with_retry():
last_err = None
for attempt in range(10):
try:
return login(user, password)
return login()
except Exception as exc: # noqa: BLE001
last_err = exc
time.sleep(min(30, 2 ** attempt))
raise last_err
def _bot_accounts() -> list[dict[str, str]]:
accounts: list[dict[str, str]] = []
def add(user: str, password: str, mode: str):
if not user or not password:
return
accounts.append({"user": user, "password": password, "mode": mode})
add(BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "deep")
if BOT_USER_QUICK and BOT_PASS_QUICK:
add(BOT_USER_QUICK, BOT_PASS_QUICK, "fast")
if BOT_USER and BOT_PASS and all(acc["user"] != BOT_USER for acc in accounts):
add(BOT_USER, BOT_PASS, "deep")
seen: set[str] = set()
unique: list[dict[str, str]] = []
for acc in accounts:
uid = normalize_user_id(acc["user"]).lower()
if uid in seen:
continue
seen.add(uid)
unique.append(acc)
return unique
def main():
load_kb()
_start_http_server()
accounts = _bot_accounts()
threads: list[threading.Thread] = []
for acc in accounts:
token = login_with_retry(acc["user"], acc["password"])
try:
room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id)
except Exception:
room_id = None
thread = threading.Thread(
target=sync_loop,
args=(token, room_id),
kwargs={
"account_user": acc["user"],
"default_mode": acc["mode"],
},
daemon=True,
)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
token = login_with_retry()
try:
room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id)
except Exception:
room_id = None
sync_loop(token, room_id)
if __name__ == "__main__":
main()

View File

@ -7,14 +7,6 @@ read_secret() {
tr -d '\r\n' < "${vault_dir}/$1"
}
read_optional() {
if [ -f "${vault_dir}/$1" ]; then
tr -d '\r\n' < "${vault_dir}/$1"
else
printf ''
fi
}
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
@ -22,15 +14,6 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)"
export CHAT_API_KEY="$(read_secret chat-matrix)"

View File

@ -66,7 +66,7 @@ spec:
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlas-smart
value: atlasbot
command:
- /bin/sh
- -c

View File

@ -29,18 +29,12 @@ spec:
operator: In
values: ["rpi4","rpi5"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 80
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 60
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
values: ["rpi4"]
containers:
- name: monerod
image: registry.bstein.dev/crypto/monerod:0.18.4.1

View File

@ -23,7 +23,7 @@ spec:
- matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
values: ["rpi4","rpi5"]
containers:
- name: xmrig
image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9

View File

@ -123,22 +123,13 @@ spec:
- key: hardware
operator: In
values: ["rpi4","rpi5"]
- key: longhorn
operator: NotIn
values: ["true"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13","titan-15","titan-17","titan-19"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
values: ["rpi4"]
containers:
- name: gitea
image: gitea/gitea:1.23

View File

@ -245,17 +245,6 @@ spec:
image:
repository: registry.bstein.dev/infra/harbor-registry
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
extraEnvVars:
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
controller:
image:
repository: registry.bstein.dev/infra/harbor-registryctl
@ -274,10 +263,6 @@ spec:
export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
{{ end }}
{{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
{{ end }}
vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -412,10 +397,10 @@ spec:
patch: |-
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/name
value: harbor-core
value: harbor-registry
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/port/number
value: 80
value: 5000
- target:
kind: Deployment
name: harbor-jobservice
@ -479,16 +464,6 @@ spec:
value: /vault/secrets/harbor-registry-env.sh
- name: VAULT_COPY_FILES
value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
envFrom:
- $patch: replace
volumeMounts:

View File

@ -67,7 +67,7 @@ data:
url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
credentials('gitea-pat')
}
branches('*/main')
branches('*/master')
}
}
}
@ -108,7 +108,7 @@ data:
url('https://scm.bstein.dev/bstein/ci-demo.git')
credentials('gitea-pat')
}
branches('*/main')
branches('*/master')
}
}
scriptPath('Jenkinsfile')
@ -167,58 +167,6 @@ data:
}
}
}
pipelineJob('atlasbot') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/2 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/atlasbot.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('Soteria') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/soteria.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('data-prepper') {
properties {
pipelineTriggers {

View File

@ -48,7 +48,7 @@ spec:
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
{{ end }}
bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
spec:
serviceAccountName: jenkins
nodeSelector:

View File

@ -1,13 +0,0 @@
# services/jenkins/dind-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jenkins-dind-cache
namespace: jenkins
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 30Gi
storageClassName: astreae

View File

@ -8,7 +8,6 @@ resources:
- vault-serviceaccount.yaml
- pvc.yaml
- cache-pvc.yaml
- dind-pvc.yaml
- plugins-pvc.yaml
- configmap-jcasc.yaml
- configmap-plugins.yaml

View File

@ -1,12 +1,12 @@
# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-18.
# Purpose: keycloak portal e2e execute actions email 18 (see container args/env in this file).
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14.
# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-portal-e2e-execute-actions-email-18
name: keycloak-portal-e2e-execute-actions-email-14
namespace: sso
spec:
suspend: true
@ -70,7 +70,7 @@ spec:
- name: E2E_PROBE_USERNAME
value: robotuser
- name: E2E_PROBE_EMAIL
value: brad.stein+robot@gmail.com
value: robotuser@bstein.dev
- name: EXECUTE_ACTIONS_CLIENT_ID
value: bstein-dev-home
- name: EXECUTE_ACTIONS_REDIRECT_URI

View File

@ -1,12 +1,12 @@
# services/keycloak/oneoffs/realm-settings-job.yaml
# One-off job for sso/keycloak-realm-settings-38.
# Purpose: keycloak realm settings 38 (see container args/env in this file).
# One-off job for sso/keycloak-realm-settings-36.
# Purpose: keycloak realm settings 36 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-realm-settings-38
name: keycloak-realm-settings-36
namespace: sso
spec:
suspend: true
@ -64,7 +64,7 @@ spec:
- name: KEYCLOAK_REALM
value: atlas
- name: KEYCLOAK_SMTP_HOST
value: smtp.postmarkapp.com
value: mail.bstein.dev
- name: KEYCLOAK_SMTP_PORT
value: "587"
- name: KEYCLOAK_SMTP_FROM

View File

@ -18,7 +18,6 @@ spec:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
maintenance.bstein.dev/restart-rev: "20260207-2"
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "maintenance"
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
@ -106,7 +105,7 @@ spec:
node-role.kubernetes.io/worker: "true"
containers:
- name: ariadne
image: registry.bstein.dev/bstein/ariadne:latest
image: registry.bstein.dev/bstein/ariadne:0.1.0-0
imagePullPolicy: Always
command: ["/bin/sh", "-c"]
args:
@ -286,7 +285,7 @@ spec:
- name: ARIADNE_SCHEDULE_MAILU_SYNC
value: "30 4 * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
value: "*/15 * * * *"
value: "0 5 * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@ -294,11 +293,11 @@ spec:
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
value: "0 * * * *"
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC
value: "*/15 * * * *"
value: "0 5 * * *"
- name: ARIADNE_SCHEDULE_WGER_ADMIN
value: "15 3 * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
value: "*/15 * * * *"
value: "0 6 * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_CRON
value: "0 3 * * *"
- name: ARIADNE_SCHEDULE_POD_CLEANER
@ -306,11 +305,11 @@ spec:
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
value: "23 3 * * *"
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
value: "30 4 * * *"
value: "30 4 * * 0"
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
value: "*/15 * * * *"
value: "0 * * * *"
- name: ARIADNE_SCHEDULE_VAULT_OIDC
value: "*/15 * * * *"
value: "0 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@ -331,8 +330,6 @@ spec:
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
value: "5"
- name: ARIADNE_ALERTMANAGER_URL
value: http://alertmanager.monitoring.svc.cluster.local
- name: OPENSEARCH_URL
value: http://opensearch-master.logging.svc.cluster.local:9200
- name: OPENSEARCH_LIMIT_BYTES

View File

@ -29,29 +29,6 @@ rules:
- get
- list
- watch
- apiGroups: ["apps"]
resources:
- deployments
- statefulsets
- daemonsets
verbs:
- get
- list
- watch
- apiGroups: ["longhorn.io"]
resources:
- volumes
verbs:
- get
- list
- watch
- apiGroups: [""]
resources:
- events
verbs:
- get
- list
- watch
- apiGroups: [""]
resources:
- pods/exec
@ -79,17 +56,3 @@ roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ariadne-job-spawner
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: ariadne-auth-delegator
subjects:
- kind: ServiceAccount
name: ariadne
namespace: maintenance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator

View File

@ -21,26 +21,3 @@ spec:
policy:
semver:
range: ">=0.1.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: soteria
namespace: maintenance
spec:
image: registry.bstein.dev/bstein/soteria
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: soteria
namespace: maintenance
spec:
imageRepositoryRef:
name: soteria
policy:
semver:
range: ">=0.1.0-0"

View File

@ -5,7 +5,6 @@ resources:
- namespace.yaml
- image.yaml
- secretproviderclass.yaml
- soteria-configmap.yaml
- vault-serviceaccount.yaml
- vault-sync-deployment.yaml
- ariadne-serviceaccount.yaml
@ -14,12 +13,9 @@ resources:
- k3s-traefik-cleanup-rbac.yaml
- node-nofile-serviceaccount.yaml
- pod-cleaner-rbac.yaml
- soteria-serviceaccount.yaml
- soteria-rbac.yaml
- ariadne-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml
- ariadne-service.yaml
- soteria-deployment.yaml
- disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml
@ -28,12 +24,9 @@ resources:
- node-image-sweeper-serviceaccount.yaml
- node-image-sweeper-daemonset.yaml
- image-sweeper-cronjob.yaml
- soteria-service.yaml
images:
- name: registry.bstein.dev/bstein/ariadne
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"}
configMapGenerator:
- name: disable-k3s-traefik-script
namespace: maintenance

View File

@ -1,10 +0,0 @@
# services/maintenance/soteria-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: soteria
namespace: maintenance
data:
SOTERIA_BACKUP_DRIVER: "longhorn"
SOTERIA_LONGHORN_URL: "http://longhorn-backend.longhorn-system.svc:9500"
SOTERIA_LONGHORN_BACKUP_MODE: "incremental"

View File

@ -1,73 +0,0 @@
# services/maintenance/soteria-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: soteria
namespace: maintenance
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: soteria
template:
metadata:
labels:
app: soteria
spec:
serviceAccountName: soteria
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
containers:
- name: soteria
image: registry.bstein.dev/bstein/soteria:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 8080
envFrom:
- configMapRef:
name: soteria
livenessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 2
readinessProbe:
httpGet:
path: /readyz
port: http
initialDelaySeconds: 2
periodSeconds: 5
timeoutSeconds: 2
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
runAsUser: 65532
capabilities:
drop: ["ALL"]

View File

@ -1,22 +0,0 @@
# services/maintenance/soteria-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: soteria
rules:
- apiGroups: [""]
resources: ["persistentvolumeclaims", "persistentvolumes"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: soteria
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: soteria
subjects:
- kind: ServiceAccount
name: soteria
namespace: maintenance

View File

@ -1,14 +0,0 @@
# services/maintenance/soteria-service.yaml
# In-cluster Service for soteria: port 80 forwarded to the container's
# named "http" port (8080 in the soteria Deployment).
apiVersion: v1
kind: Service
metadata:
  name: soteria
  namespace: maintenance
spec:
  type: ClusterIP
  selector:
    app: soteria
  ports:
    - name: http
      port: 80
      targetPort: http

View File

@ -1,8 +0,0 @@
# services/maintenance/soteria-serviceaccount.yaml
# ServiceAccount used by the soteria Deployment; carries the registry
# credential so pods can pull from the private Harbor registry.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: soteria
  namespace: maintenance
imagePullSecrets:
  - name: harbor-regcred

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -89,7 +89,7 @@
},
"targets": [
{
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

View File

@ -1901,7 +1901,7 @@
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

View File

@ -145,7 +145,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
          expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{instance}}'
datasource:
type: prometheus
@ -286,56 +286,8 @@ data:
summary: "node-image-sweeper not fully ready"
labels:
severity: warning
- uid: maint-ariadne-image-sweeper-stale
title: "Ariadne image sweeper stale (schedule >8d)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [691200]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne image sweeper stale >8d since last success"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (legacy disabled)"
title: "Maintenance CronJobs stale (>3h since success)"
condition: C
for: "5m"
data:
@ -345,10 +297,10 @@ data:
to: 0
datasourceUid: atlas-vm
model:
expr: vector(0)
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: legacy
legendFormat: '{{cronjob}}'
datasource:
type: prometheus
uid: atlas-vm
@ -369,118 +321,17 @@ data:
type: threshold
conditions:
- evaluator:
params: [1]
params: [10800]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: OK
annotations:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
noDataState: NoData
execErrState: Error
annotations:
summary: "Ariadne schedule failed ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne scheduler behind for {{ $labels.task }}"
summary: "Maintenance cronjob stale >3h since last success"
labels:
severity: warning
- orgId: 1
@ -501,7 +352,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: postmark_outbound_bounce_rate{window="1d"}
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
legendFormat: bounce 1d
datasource:
type: prometheus
@ -549,7 +400,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: min_over_time(max by (instance) (postmark_api_up)[5m])
expr: POSTMARK_API_UP
legendFormat: api up
datasource:
type: prometheus

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -98,7 +98,7 @@ data:
},
"targets": [
{
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

View File

@ -1910,7 +1910,7 @@ data:
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

View File

@ -286,7 +286,7 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "4"
monitoring.bstein.dev/restart-rev: "1"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}

View File

@ -43,12 +43,6 @@ spec:
value: /var/run/secrets/vault-token-reviewer/token
- name: VAULT_K8S_ROLE_TTL
value: 1h
- name: VAULT_K8S_BOUND_AUDIENCES
value: "https://kubernetes.default.svc,https://kubernetes.default.svc.cluster.local,k3s"
- name: VAULT_K8S_ISSUER
value: https://kubernetes.default.svc.cluster.local
- name: VAULT_K8S_DISABLE_ISS_VALIDATION
value: "false"
volumeMounts:
- name: k8s-auth-config-script
mountPath: /scripts

View File

@ -53,8 +53,6 @@ ensure_token
k8s_host="https://${KUBERNETES_SERVICE_HOST}:443"
k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)"
k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
k8s_issuer="${VAULT_K8S_ISSUER:-}"
disable_iss_validation="${VAULT_K8S_DISABLE_ISS_VALIDATION:-true}"
role_ttl="${VAULT_K8S_ROLE_TTL:-1h}"
token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}"
@ -70,36 +68,11 @@ if ! vault_cmd auth list -format=json | grep -q '"kubernetes/"'; then
vault_cmd auth enable kubernetes
fi
ensure_default_policy_login() {
default_policy="$(vault_cmd policy read default)"
if printf '%s' "${default_policy}" | grep -q 'auth/kubernetes/login'; then
return
fi
log "updating default policy to allow kubernetes login"
default_policy="${default_policy}
path \"auth/kubernetes/login\" {
capabilities = [\"create\", \"update\"]
}
"
printf '%s\n' "${default_policy}" | vault_cmd policy write default -
}
log "configuring kubernetes auth"
if [ -n "${k8s_issuer}" ]; then
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}" \
issuer="${k8s_issuer}" \
disable_iss_validation="${disable_iss_validation}"
else
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}"
fi
ensure_default_policy_login
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}"
write_raw_policy() {
name="$1"
@ -114,7 +87,6 @@ write_policy_and_role() {
service_accounts="$3"
read_paths="$4"
write_paths="$5"
audiences="${VAULT_K8S_BOUND_AUDIENCES:-}"
policy_body=""
for path in ${read_paths}; do
@ -137,42 +109,11 @@ path \"kv/metadata/atlas/${path}\" {
}
"
done
if [ "${role}" = "maintenance" ]; then
policy_body="${policy_body}
path \"sys/auth\" {
capabilities = [\"read\"]
}
path \"sys/auth/*\" {
capabilities = [\"create\", \"update\", \"read\", \"sudo\"]
}
path \"auth/kubernetes/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"auth/oidc/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"sys/policies/acl\" {
capabilities = [\"list\"]
}
path \"sys/policies/acl/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
"
fi
log "writing policy ${role}"
printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" -
log "writing role ${role}"
if [ -n "${audiences}" ]; then
vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_audiences="${audiences}" \
bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \
policies="${role}" \
ttl="${role_ttl}"
return
fi
vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \
@ -277,8 +218,6 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
"nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "ai" "ai" "atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
"jenkins/* shared/harbor-pull" ""
write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
@ -292,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
write_policy_and_role "health" "health" "health-vault-sync" \
"health/*" ""
write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
"maintenance/ariadne-db maintenance/soteria-restic portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
"maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
write_policy_and_role "finance" "finance" "finance-vault" \
"finance/* shared/postmark-relay" ""
write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \