Compare commits
No commits in common. "main" and "feature/postgres-migration" have entirely different histories.
main ... feature/postgres-migration

2  .gitignore  (vendored)
@@ -6,5 +6,3 @@ __pycache__/
 *.py[cod]
 .pytest_cache
 .venv
-.venv-ci
-tmp/
77  Jenkinsfile  (vendored)
@@ -1,77 +0,0 @@
-// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery.
-pipeline {
-  agent {
-    kubernetes {
-      defaultContainer 'python'
-      yaml """
-apiVersion: v1
-kind: Pod
-spec:
-  nodeSelector:
-    hardware: rpi5
-    kubernetes.io/arch: arm64
-    node-role.kubernetes.io/worker: "true"
-  containers:
-    - name: python
-      image: python:3.12-slim
-      command:
-        - cat
-      tty: true
-"""
-    }
-  }
-  environment {
-    PIP_DISABLE_PIP_VERSION_CHECK = '1'
-    PYTHONUNBUFFERED = '1'
-  }
-  stages {
-    stage('Checkout') {
-      steps {
-        checkout scm
-      }
-    }
-    stage('Install deps') {
-      steps {
-        sh 'pip install --no-cache-dir -r ci/requirements.txt'
-      }
-    }
-    stage('Glue tests') {
-      steps {
-        sh 'pytest -q ci/tests/glue'
-      }
-    }
-    stage('Resolve Flux branch') {
-      steps {
-        script {
-          env.FLUX_BRANCH = sh(
-            returnStdout: true,
-            script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
-          ).trim()
-          if (!env.FLUX_BRANCH) {
-            error('Flux branch not found in gotk-sync.yaml')
-          }
-          echo "Flux branch: ${env.FLUX_BRANCH}"
-        }
-      }
-    }
-    stage('Promote') {
-      when {
-        expression {
-          def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
-          return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
-        }
-      }
-      steps {
-        withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
-          sh '''
-            set +x
-            git config user.email "jenkins@bstein.dev"
-            git config user.name "jenkins"
-            git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
-            git push origin HEAD:${FLUX_BRANCH}
-          '''
-        }
-      }
-    }
-  }
-}
@@ -1,76 +0,0 @@
-pipeline {
-  agent {
-    kubernetes {
-      defaultContainer 'python'
-      yaml """
-apiVersion: v1
-kind: Pod
-spec:
-  nodeSelector:
-    hardware: rpi5
-    kubernetes.io/arch: arm64
-    node-role.kubernetes.io/worker: "true"
-  containers:
-    - name: python
-      image: python:3.12-slim
-      command:
-        - cat
-      tty: true
-"""
-    }
-  }
-  environment {
-    PIP_DISABLE_PIP_VERSION_CHECK = '1'
-    PYTHONUNBUFFERED = '1'
-  }
-  stages {
-    stage('Checkout') {
-      steps {
-        checkout scm
-      }
-    }
-    stage('Install deps') {
-      steps {
-        sh 'pip install --no-cache-dir -r ci/requirements.txt'
-      }
-    }
-    stage('Glue tests') {
-      steps {
-        sh 'pytest -q ci/tests/glue'
-      }
-    }
-    stage('Resolve Flux branch') {
-      steps {
-        script {
-          env.FLUX_BRANCH = sh(
-            returnStdout: true,
-            script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
-          ).trim()
-          if (!env.FLUX_BRANCH) {
-            error('Flux branch not found in gotk-sync.yaml')
-          }
-          echo "Flux branch: ${env.FLUX_BRANCH}"
-        }
-      }
-    }
-    stage('Promote') {
-      when {
-        expression {
-          def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
-          return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
-        }
-      }
-      steps {
-        withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
-          sh '''
-            set +x
-            git config user.email "jenkins@bstein.dev"
-            git config user.name "jenkins"
-            git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
-            git push origin HEAD:${FLUX_BRANCH}
-          '''
-        }
-      }
-    }
-  }
-}
@@ -1,4 +0,0 @@
-pytest==8.3.4
-kubernetes==30.1.0
-PyYAML==6.0.2
-requests==2.32.3
@@ -1,16 +0,0 @@
-max_success_age_hours: 48
-allow_suspended:
-  - bstein-dev-home/vaultwarden-cred-sync
-  - comms/othrys-room-reset
-  - comms/pin-othrys-invite
-  - comms/seed-othrys-room
-  - finance/firefly-user-sync
-  - health/wger-admin-ensure
-  - health/wger-user-sync
-  - mailu-mailserver/mailu-sync-nightly
-  - nextcloud/nextcloud-mail-sync
-ariadne_schedule_tasks:
-  - schedule.mailu_sync
-  - schedule.nextcloud_sync
-  - schedule.vaultwarden_sync
-  - schedule.wger_admin
@@ -1,46 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime, timezone
-from pathlib import Path
-
-import yaml
-from kubernetes import client, config
-
-
-CONFIG_PATH = Path(__file__).with_name("config.yaml")
-
-
-def _load_config() -> dict:
-    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
-        return yaml.safe_load(handle) or {}
-
-
-def _load_kube():
-    try:
-        config.load_incluster_config()
-    except config.ConfigException:
-        config.load_kube_config()
-
-
-def test_glue_cronjobs_recent_success():
-    cfg = _load_config()
-    max_age_hours = int(cfg.get("max_success_age_hours", 48))
-    allow_suspended = set(cfg.get("allow_suspended", []))
-
-    _load_kube()
-    batch = client.BatchV1Api()
-    cronjobs = batch.list_cron_job_for_all_namespaces(label_selector="atlas.bstein.dev/glue=true").items
-
-    assert cronjobs, "No glue cronjobs found with atlas.bstein.dev/glue=true"
-
-    now = datetime.now(timezone.utc)
-    for cronjob in cronjobs:
-        name = f"{cronjob.metadata.namespace}/{cronjob.metadata.name}"
-        if cronjob.spec.suspend:
-            assert name in allow_suspended, f"{name} is suspended but not in allow_suspended"
-            continue
-
-        last_success = cronjob.status.last_successful_time
-        assert last_success is not None, f"{name} has no lastSuccessfulTime"
-        age_hours = (now - last_success).total_seconds() / 3600
-        assert age_hours <= max_age_hours, f"{name} last success {age_hours:.1f}h ago"
@@ -1,48 +0,0 @@
-from __future__ import annotations
-
-import os
-from pathlib import Path
-
-import requests
-import yaml
-
-
-VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
-CONFIG_PATH = Path(__file__).with_name("config.yaml")
-
-
-def _load_config() -> dict:
-    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
-        return yaml.safe_load(handle) or {}
-
-
-def _query(promql: str) -> list[dict]:
-    response = requests.get(f"{VM_URL}/api/v1/query", params={"query": promql}, timeout=10)
-    response.raise_for_status()
-    payload = response.json()
-    return payload.get("data", {}).get("result", [])
-
-
-def test_glue_metrics_present():
-    series = _query('kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}')
-    assert series, "No glue cronjob label series found"
-
-
-def test_glue_metrics_success_join():
-    query = (
-        "kube_cronjob_status_last_successful_time "
-        'and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
-    )
-    series = _query(query)
-    assert series, "No glue cronjob last success series found"
-
-
-def test_ariadne_schedule_metrics_present():
-    cfg = _load_config()
-    expected = cfg.get("ariadne_schedule_tasks", [])
-    if not expected:
-        return
-    series = _query("ariadne_schedule_next_run_timestamp_seconds")
-    tasks = {item.get("metric", {}).get("task") for item in series}
-    missing = [task for task in expected if task not in tasks]
-    assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"
13  clusters/atlas/applications/kustomization.yaml  (new file)
@@ -0,0 +1,13 @@
+# clusters/atlas/applications/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ../../services/crypto
+  - ../../services/gitea
+  - ../../services/jellyfin
+  - ../../services/comms
+  - ../../services/monitoring
+  - ../../services/logging
+  - ../../services/pegasus
+  - ../../services/vault
+  - ../../services/bstein-dev-home
@@ -1,17 +0,0 @@
-# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: bstein-dev-home-migrations
-  namespace: flux-system
-spec:
-  interval: 10m
-  path: ./services/bstein-dev-home/oneoffs/migrations
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-  targetNamespace: bstein-dev-home
-  wait: false
-  suspend: true
@@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: bstein-dev-home
-  namespace: bstein-dev-home
+  namespace: flux-system
 spec:
   interval: 1m0s
   sourceRef:
@@ -13,14 +13,14 @@ spec:
   git:
     checkout:
       ref:
-        branch: feature/ariadne
+        branch: main
     commit:
       author:
         email: ops@bstein.dev
        name: flux-bot
-      messageTemplate: "chore(bstein-dev-home): automated image update"
+      messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
     push:
-      branch: feature/ariadne
+      branch: main
   update:
     strategy: Setters
     path: services/bstein-dev-home
@@ -1,4 +1,4 @@
-# clusters/atlas/flux-system/applications/comms/kustomization.yaml
+# clusters/atlas/flux-system/applications/communication/kustomization.yaml
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
@@ -1,24 +0,0 @@
-# clusters/atlas/flux-system/applications/finance/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: finance
-  namespace: flux-system
-spec:
-  interval: 10m
-  path: ./services/finance
-  prune: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-  targetNamespace: finance
-  healthChecks:
-    - apiVersion: apps/v1
-      kind: Deployment
-      name: actual-budget
-      namespace: finance
-    - apiVersion: apps/v1
-      kind: Deployment
-      name: firefly
-      namespace: finance
-  wait: false
@@ -13,6 +13,11 @@ spec:
     kind: GitRepository
     name: flux-system
     namespace: flux-system
+  healthChecks:
+    - apiVersion: helm.toolkit.fluxcd.io/v2
+      kind: HelmRelease
+      name: harbor
+      namespace: harbor
   wait: false
   dependsOn:
     - name: core
@@ -1,25 +0,0 @@
-# clusters/atlas/flux-system/applications/health/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: health
-  namespace: flux-system
-spec:
-  interval: 10m
-  path: ./services/health
-  prune: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-  targetNamespace: health
-  dependsOn:
-    - name: keycloak
-    - name: postgres
-    - name: traefik
-    - name: vault
-  healthChecks:
-    - apiVersion: apps/v1
-      kind: Deployment
-      name: wger
-      namespace: health
-  wait: false
@@ -12,12 +12,10 @@ resources:
   - pegasus/image-automation.yaml
   - bstein-dev-home/kustomization.yaml
   - bstein-dev-home/image-automation.yaml
-  - bstein-dev-home-migrations/kustomization.yaml
   - harbor/kustomization.yaml
   - harbor/image-automation.yaml
   - jellyfin/kustomization.yaml
   - xmr-miner/kustomization.yaml
-  - wallet-monero-temp/kustomization.yaml
   - sui-metrics/kustomization.yaml
   - openldap/kustomization.yaml
   - keycloak/kustomization.yaml
@@ -29,5 +27,3 @@ resources:
   - nextcloud-mail-sync/kustomization.yaml
   - outline/kustomization.yaml
   - planka/kustomization.yaml
-  - finance/kustomization.yaml
-  - health/kustomization.yaml
@@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: pegasus
-  namespace: jellyfin
+  namespace: flux-system
 spec:
   interval: 1m0s
   sourceRef:
@@ -1,19 +0,0 @@
-# clusters/atlas/flux-system/applications/wallet-monero-temp/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: wallet-monero-temp
-  namespace: flux-system
-spec:
-  interval: 10m
-  path: ./services/crypto/wallet-monero-temp
-  targetNamespace: crypto
-  prune: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  dependsOn:
-    - name: crypto
-    - name: xmr-miner
-  wait: true
@@ -1,4 +1,3 @@
-# clusters/atlas/flux-system/gotk-components.yaml
 ---
 # This manifest was generated by flux. DO NOT EDIT.
 # Flux Version: v2.7.5
@@ -1,4 +1,3 @@
-# clusters/atlas/flux-system/gotk-sync.yaml
 # This manifest was generated by flux. DO NOT EDIT.
 ---
 apiVersion: source.toolkit.fluxcd.io/v1
@@ -9,7 +8,7 @@ metadata:
 spec:
   interval: 1m0s
   ref:
-    branch: feature/ariadne
+    branch: feature/sso-hardening
   secretRef:
     name: flux-system-gitea
   url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
@@ -1,17 +0,0 @@
-# clusters/atlas/flux-system/platform/cert-manager-cleanup/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: cert-manager-cleanup
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/cert-manager/cleanup
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  targetNamespace: cert-manager
-  wait: true
@@ -1,19 +0,0 @@
-# clusters/atlas/flux-system/platform/cert-manager/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: cert-manager
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/cert-manager
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  targetNamespace: cert-manager
-  dependsOn:
-    - name: helm
-  wait: true
@@ -4,17 +4,12 @@ kind: Kustomization
 resources:
   - core/kustomization.yaml
   - helm/kustomization.yaml
-  - cert-manager/kustomization.yaml
   - metallb/kustomization.yaml
   - traefik/kustomization.yaml
   - gitops-ui/kustomization.yaml
   - monitoring/kustomization.yaml
   - logging/kustomization.yaml
   - maintenance/kustomization.yaml
-  - maintenance/image-automation.yaml
-  - longhorn-adopt/kustomization.yaml
-  - longhorn/kustomization.yaml
   - longhorn-ui/kustomization.yaml
   - postgres/kustomization.yaml
   - ../platform/vault-csi/kustomization.yaml
-  - ../platform/vault-injector/kustomization.yaml
@@ -1,17 +0,0 @@
-# clusters/atlas/flux-system/platform/longhorn-adopt/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: longhorn-adopt
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/longhorn/adopt
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  targetNamespace: longhorn-system
-  wait: true
@@ -15,5 +15,4 @@ spec:
     namespace: flux-system
   dependsOn:
     - name: core
-    - name: longhorn
   wait: true
@@ -1,20 +0,0 @@
-# clusters/atlas/flux-system/platform/longhorn/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: longhorn
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/longhorn/core
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  targetNamespace: longhorn-system
-  dependsOn:
-    - name: helm
-    - name: longhorn-adopt
-  wait: false
@@ -1,26 +0,0 @@
-# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
-apiVersion: image.toolkit.fluxcd.io/v1
-kind: ImageUpdateAutomation
-metadata:
-  name: maintenance
-  namespace: maintenance
-spec:
-  interval: 1m0s
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  git:
-    checkout:
-      ref:
-        branch: feature/ariadne
-    commit:
-      author:
-        email: ops@bstein.dev
-        name: flux-bot
-      messageTemplate: "chore(maintenance): automated image update"
-    push:
-      branch: feature/ariadne
-  update:
-    strategy: Setters
-    path: services/maintenance
@@ -8,7 +8,6 @@ spec:
   interval: 10m
   path: ./services/maintenance
   prune: true
-  force: true
   sourceRef:
     kind: GitRepository
     name: flux-system
@@ -1,16 +0,0 @@
-# clusters/atlas/flux-system/platform/vault-injector/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: vault-injector
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/vault-injector
-  targetNamespace: vault
-  prune: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  wait: true
8  clusters/atlas/platform/kustomization.yaml  (new file)
@@ -0,0 +1,8 @@
+# clusters/atlas/platform/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ../../../infrastructure/modules/base
+  - ../../../infrastructure/modules/profiles/atlas-ha
+  - ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
+  - ../../../infrastructure/metallb
@@ -1,5 +0,0 @@
-FROM python:3.11-slim
-
-ENV PIP_DISABLE_PIP_VERSION_CHECK=1
-
-RUN pip install --no-cache-dir requests psycopg2-binary
@@ -1,9 +0,0 @@
-FROM registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
-
-USER root
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-USER harbor
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/harbor/entrypoint.sh"]
@@ -1,9 +0,0 @@
-FROM registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
-
-USER root
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-USER harbor
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/harbor/entrypoint.sh"]
@@ -1,9 +0,0 @@
-FROM registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64
-
-USER root
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-USER harbor
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/home/harbor/entrypoint.sh"]
@@ -1,9 +0,0 @@
-FROM registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
-
-USER root
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-USER harbor
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/home/harbor/start.sh"]
@@ -1,10 +0,0 @@
-FROM ghcr.io/element-hq/lk-jwt-service:0.3.0 AS base
-
-FROM alpine:3.20
-RUN apk add --no-cache ca-certificates
-COPY --from=base /lk-jwt-service /lk-jwt-service
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/lk-jwt-service"]
@@ -1,10 +0,0 @@
-FROM quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 AS base
-
-FROM alpine:3.20
-RUN apk add --no-cache ca-certificates
-COPY --from=base /bin/oauth2-proxy /bin/oauth2-proxy
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/bin/oauth2-proxy"]
@@ -1,10 +0,0 @@
-FROM registry.bstein.dev/streaming/pegasus:1.2.32 AS base
-
-FROM alpine:3.20
-RUN apk add --no-cache ca-certificates
-COPY --from=base /pegasus /pegasus
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/pegasus"]
@@ -1,34 +0,0 @@
-#!/bin/sh
-set -eu
-
-if [ -n "${VAULT_ENV_FILE:-}" ]; then
-  if [ -f "${VAULT_ENV_FILE}" ]; then
-    # shellcheck disable=SC1090
-    . "${VAULT_ENV_FILE}"
-  else
-    echo "Vault env file not found: ${VAULT_ENV_FILE}" >&2
-    exit 1
-  fi
-fi
-
-if [ -n "${VAULT_COPY_FILES:-}" ]; then
-  old_ifs="$IFS"
-  IFS=','
-  for pair in ${VAULT_COPY_FILES}; do
-    src="${pair%%:*}"
-    dest="${pair#*:}"
-    if [ -z "${src}" ] || [ -z "${dest}" ]; then
-      echo "Vault copy entry malformed: ${pair}" >&2
-      exit 1
-    fi
-    if [ ! -f "${src}" ]; then
-      echo "Vault file not found: ${src}" >&2
-      exit 1
-    fi
-    mkdir -p "$(dirname "${dest}")"
-    cp "${src}" "${dest}"
-  done
-  IFS="$old_ifs"
-fi
-
-exec "$@"
@@ -1,40 +0,0 @@
-# infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: cert-manager-cleanup-2
-  namespace: cert-manager
-spec:
-  backoffLimit: 1
-  template:
-    spec:
-      serviceAccountName: cert-manager-cleanup
-      restartPolicy: Never
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node-role.kubernetes.io/worker
-                    operator: Exists
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 100
-              preference:
-                matchExpressions:
-                  - key: kubernetes.io/arch
-                    operator: In
-                    values: ["arm64"]
-      containers:
-        - name: cleanup
-          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
-          command: ["/usr/bin/env", "bash"]
-          args: ["/scripts/cert_manager_cleanup.sh"]
-          volumeMounts:
-            - name: script
-              mountPath: /scripts
-              readOnly: true
-      volumes:
-        - name: script
-          configMap:
-            name: cert-manager-cleanup-script
-            defaultMode: 0555
@@ -1,58 +0,0 @@
-# infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: cert-manager-cleanup
-  namespace: cert-manager
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: cert-manager-cleanup
-rules:
-  - apiGroups: [""]
-    resources:
-      - pods
-      - services
-      - endpoints
-      - configmaps
-      - secrets
-      - serviceaccounts
-    verbs: ["get", "list", "watch", "delete"]
-  - apiGroups: ["apps"]
-    resources:
-      - deployments
-      - daemonsets
-      - statefulsets
-      - replicasets
-    verbs: ["get", "list", "watch", "delete"]
-  - apiGroups: ["batch"]
-    resources:
-      - jobs
-      - cronjobs
-    verbs: ["get", "list", "watch", "delete"]
-  - apiGroups: ["rbac.authorization.k8s.io"]
-    resources:
-      - roles
-      - rolebindings
-      - clusterroles
-      - clusterrolebindings
-    verbs: ["get", "list", "watch", "delete"]
-  - apiGroups: ["admissionregistration.k8s.io"]
-    resources:
-      - validatingwebhookconfigurations
-      - mutatingwebhookconfigurations
-    verbs: ["get", "list", "watch", "delete"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: cert-manager-cleanup
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: cert-manager-cleanup
-subjects:
-  - kind: ServiceAccount
-    name: cert-manager-cleanup
-    namespace: cert-manager
@@ -1,15 +0,0 @@
-# infrastructure/cert-manager/cleanup/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - cert-manager-cleanup-rbac.yaml
-  - cert-manager-cleanup-job.yaml
-
-configMapGenerator:
-  - name: cert-manager-cleanup-script
-    namespace: cert-manager
-    files:
-      - cert_manager_cleanup.sh=scripts/cert_manager_cleanup.sh
-    options:
-      disableNameSuffixHash: true
@@ -1,5 +0,0 @@
-# infrastructure/cert-manager/cleanup/namespace.yaml
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: cert-manager
@@ -1,37 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-namespace="cert-manager"
-selectors=(
-  "app.kubernetes.io/name=cert-manager"
-  "app.kubernetes.io/instance=cert-manager"
-  "app.kubernetes.io/instance=certmanager-prod"
-)
-
-delete_namespaced() {
-  local selector="$1"
-  kubectl -n "${namespace}" delete deployment,daemonset,statefulset,replicaset \
-    --selector "${selector}" --ignore-not-found --wait=false
-  kubectl -n "${namespace}" delete pod,service,endpoints,serviceaccount,configmap,secret \
-    --selector "${selector}" --ignore-not-found --wait=false
-  kubectl -n "${namespace}" delete role,rolebinding \
-    --selector "${selector}" --ignore-not-found --wait=false
-  kubectl -n "${namespace}" delete job,cronjob \
-    --selector "${selector}" --ignore-not-found --wait=false
-}
-
-delete_cluster_scoped() {
-  local selector="$1"
-  kubectl delete clusterrole,clusterrolebinding \
-    --selector "${selector}" --ignore-not-found --wait=false
-  kubectl delete mutatingwebhookconfiguration,validatingwebhookconfiguration \
-    --selector "${selector}" --ignore-not-found --wait=false
-}
-
-for selector in "${selectors[@]}"; do
-  delete_namespaced "${selector}"
-  delete_cluster_scoped "${selector}"
-done
-
-kubectl delete mutatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false
-kubectl delete validatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false
@@ -1,67 +0,0 @@
-# infrastructure/cert-manager/helmrelease.yaml
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: cert-manager
-  namespace: cert-manager
-spec:
-  interval: 30m
-  chart:
-    spec:
-      chart: cert-manager
-      version: v1.17.0
-      sourceRef:
-        kind: HelmRepository
-        name: jetstack
-        namespace: flux-system
-  install:
-    crds: CreateReplace
-    remediation: { retries: 3 }
-    timeout: 10m
-  upgrade:
-    crds: CreateReplace
-    remediation:
-      retries: 3
-      remediateLastFailure: true
-    cleanupOnFail: true
-    timeout: 10m
-  values:
-    installCRDs: true
-    nodeSelector:
-      node-role.kubernetes.io/worker: "true"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: hardware
-                  operator: In
-                  values:
-                    - rpi5
-                    - rpi4
-    webhook:
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
-    cainjector:
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
@@ -1,6 +0,0 @@
-# infrastructure/cert-manager/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - helmrelease.yaml
@@ -1,5 +0,0 @@
-# infrastructure/cert-manager/namespace.yaml
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: cert-manager
@@ -1,47 +0,0 @@
-# infrastructure/core/coredns-custom.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: coredns-custom
-  namespace: kube-system
-data:
-  bstein-dev.server: |
-    bstein.dev:53 {
-      errors
-      cache 30
-      hosts {
-        192.168.22.9 alerts.bstein.dev
-        192.168.22.9 auth.bstein.dev
-        192.168.22.9 bstein.dev
-        10.43.6.87 budget.bstein.dev
-        192.168.22.9 call.live.bstein.dev
-        192.168.22.9 cd.bstein.dev
-        192.168.22.9 chat.ai.bstein.dev
-        192.168.22.9 ci.bstein.dev
-        192.168.22.9 cloud.bstein.dev
-        192.168.22.9 health.bstein.dev
-        192.168.22.9 kit.live.bstein.dev
-        192.168.22.9 live.bstein.dev
-        192.168.22.9 logs.bstein.dev
-        192.168.22.9 longhorn.bstein.dev
-        192.168.22.4 mail.bstein.dev
-        192.168.22.9 matrix.live.bstein.dev
-        192.168.22.9 metrics.bstein.dev
-        192.168.22.9 monero.bstein.dev
-        10.43.6.87 money.bstein.dev
-        192.168.22.9 notes.bstein.dev
-        192.168.22.9 office.bstein.dev
-        192.168.22.9 pegasus.bstein.dev
-        3.136.224.193 pm-bounces.bstein.dev
-        3.150.68.49 pm-bounces.bstein.dev
-        18.189.137.81 pm-bounces.bstein.dev
-        192.168.22.9 registry.bstein.dev
-        192.168.22.9 scm.bstein.dev
-        192.168.22.9 secret.bstein.dev
-        192.168.22.9 sso.bstein.dev
-        192.168.22.9 stream.bstein.dev
-        192.168.22.9 tasks.bstein.dev
-        192.168.22.9 vault.bstein.dev
-        fallthrough
-      }
-    }
@@ -1,141 +0,0 @@
-# infrastructure/core/coredns-deployment.yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: coredns
-  namespace: kube-system
-  labels:
-    k8s-app: kube-dns
-    kubernetes.io/name: CoreDNS
-spec:
-  progressDeadlineSeconds: 600
-  replicas: 2
-  revisionHistoryLimit: 0
-  selector:
-    matchLabels:
-      k8s-app: kube-dns
-  strategy:
-    type: RollingUpdate
-    rollingUpdate:
-      maxSurge: 25%
-      maxUnavailable: 1
-  template:
-    metadata:
-      labels:
-        k8s-app: kube-dns
-    spec:
-      containers:
-        - name: coredns
-          image: registry.bstein.dev/infra/coredns:1.12.1
-          imagePullPolicy: IfNotPresent
-          args:
-            - -conf
-            - /etc/coredns/Corefile
-          ports:
-            - containerPort: 53
-              name: dns
-              protocol: UDP
-            - containerPort: 53
-              name: dns-tcp
-              protocol: TCP
-            - containerPort: 9153
-              name: metrics
-              protocol: TCP
-          livenessProbe:
-            httpGet:
-              path: /health
-              port: 8080
-              scheme: HTTP
-            initialDelaySeconds: 60
-            periodSeconds: 10
-            timeoutSeconds: 1
-            successThreshold: 1
-            failureThreshold: 3
-          readinessProbe:
-            httpGet:
-              path: /ready
-              port: 8181
-              scheme: HTTP
-            periodSeconds: 2
-            timeoutSeconds: 1
-            successThreshold: 1
-            failureThreshold: 3
-          resources:
-            limits:
-              memory: 170Mi
-            requests:
-              cpu: 100m
-              memory: 70Mi
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              add:
-                - NET_BIND_SERVICE
-              drop:
-                - all
-            readOnlyRootFilesystem: true
-          volumeMounts:
-            - name: config-volume
-              mountPath: /etc/coredns
-              readOnly: true
-            - name: custom-config-volume
-              mountPath: /etc/coredns/custom
-              readOnly: true
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
-                  - key: node-role.kubernetes.io/worker
-                    operator: In
-                    values:
-                      - "true"
-      dnsPolicy: Default
-      nodeSelector:
-        kubernetes.io/os: linux
-      priorityClassName: system-cluster-critical
-      restartPolicy: Always
-      schedulerName: default-scheduler
-      serviceAccountName: coredns
-      tolerations:
-        - key: CriticalAddonsOnly
-          operator: Exists
-        - key: node-role.kubernetes.io/control-plane
-          operator: Exists
-          effect: NoSchedule
-        - key: node-role.kubernetes.io/master
-          operator: Exists
-          effect: NoSchedule
-      topologySpreadConstraints:
-        - maxSkew: 1
-          topologyKey: kubernetes.io/hostname
-          whenUnsatisfiable: DoNotSchedule
-          labelSelector:
-            matchLabels:
-              k8s-app: kube-dns
-        - maxSkew: 1
-          topologyKey: topology.kubernetes.io/zone
-          whenUnsatisfiable: ScheduleAnyway
-          labelSelector:
-            matchLabels:
-              k8s-app: kube-dns
-      volumes:
-        - name: config-volume
-          configMap:
-            name: coredns
-            defaultMode: 420
-            items:
-              - key: Corefile
-                path: Corefile
-              - key: NodeHosts
-                path: NodeHosts
-        - name: custom-config-volume
-          configMap:
-            name: coredns-custom
-            optional: true
-            defaultMode: 420
@@ -4,8 +4,5 @@ kind: Kustomization
 resources:
   - ../modules/base
   - ../modules/profiles/atlas-ha
-  - coredns-custom.yaml
-  - coredns-deployment.yaml
-  - ntp-sync-daemonset.yaml
   - ../sources/cert-manager/letsencrypt.yaml
   - ../sources/cert-manager/letsencrypt-prod.yaml
@@ -1,50 +0,0 @@
-# infrastructure/core/ntp-sync-daemonset.yaml
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
-  name: ntp-sync
-  namespace: kube-system
-  labels:
-    app: ntp-sync
-spec:
-  selector:
-    matchLabels:
-      app: ntp-sync
-  template:
-    metadata:
-      labels:
-        app: ntp-sync
-    spec:
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node-role.kubernetes.io/control-plane
-                    operator: DoesNotExist
-                  - key: node-role.kubernetes.io/master
-                    operator: DoesNotExist
-      containers:
-        - name: ntp-sync
-          image: public.ecr.aws/docker/library/busybox:1.36.1
-          imagePullPolicy: IfNotPresent
-          command: ["/bin/sh", "-c"]
-          args:
-            - |
-              set -eu
-              while true; do
-                ntpd -q -p pool.ntp.org || true
-                sleep 300
-              done
-          securityContext:
-            capabilities:
-              add: ["SYS_TIME"]
-            runAsUser: 0
-            runAsGroup: 0
-          resources:
-            requests:
-              cpu: 10m
-              memory: 16Mi
-            limits:
-              cpu: 50m
-              memory: 64Mi
@@ -1,15 +0,0 @@
-# infrastructure/longhorn/adopt/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - longhorn-adopt-rbac.yaml
-  - longhorn-helm-adopt-job.yaml
-
-configMapGenerator:
-  - name: longhorn-helm-adopt-script
-    namespace: longhorn-system
-    files:
-      - longhorn_helm_adopt.sh=scripts/longhorn_helm_adopt.sh
-    options:
-      disableNameSuffixHash: true
@@ -1,56 +0,0 @@
-# infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: longhorn-helm-adopt
-  namespace: longhorn-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: longhorn-helm-adopt
-rules:
-  - apiGroups: [""]
-    resources:
-      - configmaps
-      - services
-      - serviceaccounts
-      - secrets
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["apps"]
-    resources:
-      - deployments
-      - daemonsets
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["batch"]
-    resources:
-      - jobs
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["rbac.authorization.k8s.io"]
-    resources:
-      - roles
-      - rolebindings
-      - clusterroles
-      - clusterrolebindings
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["apiextensions.k8s.io"]
-    resources:
-      - customresourcedefinitions
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["scheduling.k8s.io"]
-    resources:
-      - priorityclasses
-    verbs: ["get", "list", "watch", "patch", "update"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: longhorn-helm-adopt
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: longhorn-helm-adopt
-subjects:
-  - kind: ServiceAccount
-    name: longhorn-helm-adopt
-    namespace: longhorn-system
@@ -1,40 +0,0 @@
-# infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: longhorn-helm-adopt-2
-  namespace: longhorn-system
-spec:
-  backoffLimit: 1
-  template:
-    spec:
-      serviceAccountName: longhorn-helm-adopt
-      restartPolicy: Never
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node-role.kubernetes.io/worker
-                    operator: Exists
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 100
-              preference:
-                matchExpressions:
-                  - key: kubernetes.io/arch
-                    operator: In
-                    values: ["arm64"]
-      containers:
-        - name: adopt
-          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
-          command: ["/usr/bin/env", "bash"]
-          args: ["/scripts/longhorn_helm_adopt.sh"]
-          volumeMounts:
-            - name: script
-              mountPath: /scripts
-              readOnly: true
-      volumes:
-        - name: script
-          configMap:
-            name: longhorn-helm-adopt-script
-            defaultMode: 0555
@@ -1,5 +0,0 @@
-# infrastructure/longhorn/adopt/namespace.yaml
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: longhorn-system
@@ -1,52 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-release_name="longhorn"
-release_namespace="longhorn-system"
-selector="app.kubernetes.io/instance=${release_name}"
-
-annotate_and_label() {
-  local scope="$1"
-  local kind="$2"
-  if [ "${scope}" = "namespaced" ]; then
-    kubectl -n "${release_namespace}" annotate "${kind}" -l "${selector}" \
-      meta.helm.sh/release-name="${release_name}" \
-      meta.helm.sh/release-namespace="${release_namespace}" \
-      --overwrite >/dev/null 2>&1 || true
-    kubectl -n "${release_namespace}" label "${kind}" -l "${selector}" \
-      app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
-  else
-    kubectl annotate "${kind}" -l "${selector}" \
-      meta.helm.sh/release-name="${release_name}" \
-      meta.helm.sh/release-namespace="${release_namespace}" \
-      --overwrite >/dev/null 2>&1 || true
-    kubectl label "${kind}" -l "${selector}" \
-      app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
-  fi
-}
-
-namespaced_kinds=(
-  configmap
-  service
-  serviceaccount
-  deployment
-  daemonset
-  job
-  role
-  rolebinding
-)
-
-cluster_kinds=(
-  clusterrole
-  clusterrolebinding
-  customresourcedefinition
-  priorityclass
-)
-
-for kind in "${namespaced_kinds[@]}"; do
-  annotate_and_label "namespaced" "${kind}"
-done
-
-for kind in "${cluster_kinds[@]}"; do
-  annotate_and_label "cluster" "${kind}"
-done
@@ -1,80 +0,0 @@
-# infrastructure/longhorn/core/helmrelease.yaml
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: longhorn
-  namespace: longhorn-system
-spec:
-  interval: 30m
-  chart:
-    spec:
-      chart: longhorn
-      version: 1.8.2
-      sourceRef:
-        kind: HelmRepository
-        name: longhorn
-        namespace: flux-system
-  install:
-    crds: Skip
-    remediation: { retries: 3 }
-    timeout: 15m
-  upgrade:
-    crds: Skip
-    remediation:
-      retries: 3
-      remediateLastFailure: true
-    cleanupOnFail: true
-    timeout: 15m
-  values:
-    service:
-      ui:
-        type: NodePort
-        nodePort: 30824
-    privateRegistry:
-      createSecret: false
-      registrySecret: longhorn-registry
-    image:
-      pullPolicy: Always
-      longhorn:
-        engine:
-          repository: registry.bstein.dev/infra/longhorn-engine
-          tag: v1.8.2
-        manager:
-          repository: registry.bstein.dev/infra/longhorn-manager
-          tag: v1.8.2
-        ui:
-          repository: registry.bstein.dev/infra/longhorn-ui
-          tag: v1.8.2
-        instanceManager:
-          repository: registry.bstein.dev/infra/longhorn-instance-manager
-          tag: v1.8.2
-        shareManager:
-          repository: registry.bstein.dev/infra/longhorn-share-manager
-          tag: v1.8.2
-        backingImageManager:
-          repository: registry.bstein.dev/infra/longhorn-backing-image-manager
-          tag: v1.8.2
-        supportBundleKit:
-          repository: registry.bstein.dev/infra/longhorn-support-bundle-kit
-          tag: v0.0.56
-      csi:
-        attacher:
-          repository: registry.bstein.dev/infra/longhorn-csi-attacher
-          tag: v4.9.0
-        provisioner:
-          repository: registry.bstein.dev/infra/longhorn-csi-provisioner
-          tag: v5.3.0
-        nodeDriverRegistrar:
-          repository: registry.bstein.dev/infra/longhorn-csi-node-driver-registrar
-          tag: v2.14.0
-        resizer:
-          repository: registry.bstein.dev/infra/longhorn-csi-resizer
-          tag: v1.13.2
-        snapshotter:
-          repository: registry.bstein.dev/infra/longhorn-csi-snapshotter
-          tag: v8.2.0
-        livenessProbe:
-          repository: registry.bstein.dev/infra/longhorn-livenessprobe
-          tag: v2.16.0
-    defaultSettings:
-      systemManagedPodsImagePullPolicy: Always
@@ -1,18 +0,0 @@
-# infrastructure/longhorn/core/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - vault-serviceaccount.yaml
-  - secretproviderclass.yaml
-  - vault-sync-deployment.yaml
-  - helmrelease.yaml
-  - longhorn-settings-ensure-job.yaml
-
-configMapGenerator:
-  - name: longhorn-settings-ensure-script
-    files:
-      - longhorn_settings_ensure.sh=scripts/longhorn_settings_ensure.sh
-
-generatorOptions:
-  disableNameSuffixHash: true
@@ -1,36 +0,0 @@
-# infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: longhorn-settings-ensure-4
-  namespace: longhorn-system
-spec:
-  backoffLimit: 0
-  ttlSecondsAfterFinished: 3600
-  template:
-    spec:
-      serviceAccountName: longhorn-service-account
-      restartPolicy: Never
-      volumes:
-        - name: longhorn-settings-ensure-script
-          configMap:
-            name: longhorn-settings-ensure-script
-            defaultMode: 0555
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: kubernetes.io/arch
-                    operator: In
-                    values: ["arm64"]
-                  - key: node-role.kubernetes.io/worker
-                    operator: Exists
-      containers:
-        - name: apply
-          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
-          command: ["/scripts/longhorn_settings_ensure.sh"]
-          volumeMounts:
-            - name: longhorn-settings-ensure-script
-              mountPath: /scripts
-              readOnly: true
@@ -1,5 +0,0 @@
-# infrastructure/longhorn/core/namespace.yaml
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: longhorn-system
@@ -1,42 +0,0 @@
-#!/usr/bin/env sh
-set -eu
-
-# Longhorn blocks direct CR patches for some settings; use the internal API instead.
-
-api_base="http://longhorn-backend.longhorn-system.svc:9500/v1/settings"
-
-wait_for_api() {
-  attempts=30
-  while [ "${attempts}" -gt 0 ]; do
-    if curl -fsS "${api_base}" >/dev/null 2>&1; then
-      return 0
-    fi
-    attempts=$((attempts - 1))
-    sleep 2
-  done
-  echo "Longhorn API not ready after retries." >&2
-  return 1
-}
-
-update_setting() {
-  name="$1"
-  value="$2"
-
-  current="$(curl -fsS "${api_base}/${name}" || true)"
-  if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
-    echo "Setting ${name} already set."
-    return 0
-  fi
-
-  echo "Setting ${name} -> ${value}"
-  curl -fsS -X PUT \
-    -H "Content-Type: application/json" \
-    -d "{\"value\":\"${value}\"}" \
-    "${api_base}/${name}" >/dev/null
-}
-
-wait_for_api
-update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v1.8.2"
-update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
-update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
-update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"
@@ -1,21 +0,0 @@
-# infrastructure/longhorn/core/secretproviderclass.yaml
-apiVersion: secrets-store.csi.x-k8s.io/v1
-kind: SecretProviderClass
-metadata:
-  name: longhorn-vault
-  namespace: longhorn-system
-spec:
-  provider: vault
-  parameters:
-    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
-    roleName: "longhorn"
-    objects: |
-      - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/shared/harbor-pull"
-        secretKey: "dockerconfigjson"
-  secretObjects:
-    - secretName: longhorn-registry
-      type: kubernetes.io/dockerconfigjson
-      data:
-        - objectName: harbor-pull__dockerconfigjson
-          key: .dockerconfigjson
@@ -1,6 +0,0 @@
-# infrastructure/longhorn/core/vault-serviceaccount.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: longhorn-vault-sync
-  namespace: longhorn-system
@@ -1,45 +0,0 @@
-# infrastructure/longhorn/core/vault-sync-deployment.yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: longhorn-vault-sync
-  namespace: longhorn-system
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: longhorn-vault-sync
-  template:
-    metadata:
-      labels:
-        app: longhorn-vault-sync
-    spec:
-      serviceAccountName: longhorn-vault-sync
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-      affinity:
-        nodeAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 80
-              preference:
-                matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values: ["rpi5", "rpi4"]
-      containers:
-        - name: sync
-          image: alpine:3.20
-          command: ["/bin/sh", "-c"]
-          args:
-            - "sleep infinity"
-          volumeMounts:
-            - name: vault-secrets
-              mountPath: /vault/secrets
-              readOnly: true
-      volumes:
-        - name: vault-secrets
-          csi:
-            driver: secrets-store.csi.k8s.io
-            readOnly: true
-            volumeAttributes:
-              secretProviderClass: longhorn-vault
@@ -2,7 +2,6 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-  - serviceaccount.yaml
-  - oauth2-proxy-longhorn.yaml
   - middleware.yaml
   - ingress.yaml
+  - oauth2-proxy-longhorn.yaml
@@ -32,18 +32,7 @@ spec:
   metadata:
     labels:
       app: oauth2-proxy-longhorn
-    annotations:
-      vault.hashicorp.com/agent-inject: "true"
-      vault.hashicorp.com/role: "longhorn"
-      vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/longhorn/oauth2-proxy"
-      vault.hashicorp.com/agent-inject-template-oidc-config: |
-        {{- with secret "kv/data/atlas/longhorn/oauth2-proxy" -}}
-        client_id = "{{ .Data.data.client_id }}"
-        client_secret = "{{ .Data.data.client_secret }}"
-        cookie_secret = "{{ .Data.data.cookie_secret }}"
-        {{- end -}}
   spec:
-    serviceAccountName: longhorn-vault
     nodeSelector:
       node-role.kubernetes.io/worker: "true"
     affinity:
@@ -61,7 +50,6 @@ spec:
         imagePullPolicy: IfNotPresent
        args:
          - --provider=oidc
-          - --config=/vault/secrets/oidc-config
          - --redirect-url=https://longhorn.bstein.dev/oauth2/callback
          - --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
          - --scope=openid profile email groups
@@ -81,6 +69,22 @@ spec:
          - --skip-jwt-bearer-tokens=true
          - --oidc-groups-claim=groups
          - --cookie-domain=longhorn.bstein.dev
+        env:
+          - name: OAUTH2_PROXY_CLIENT_ID
+            valueFrom:
+              secretKeyRef:
+                name: oauth2-proxy-longhorn-oidc
+                key: client_id
+          - name: OAUTH2_PROXY_CLIENT_SECRET
+            valueFrom:
+              secretKeyRef:
+                name: oauth2-proxy-longhorn-oidc
+                key: client_secret
+          - name: OAUTH2_PROXY_COOKIE_SECRET
+            valueFrom:
+              secretKeyRef:
+                name: oauth2-proxy-longhorn-oidc
+                key: cookie_secret
        ports:
          - containerPort: 4180
            name: http
@@ -1,6 +0,0 @@
-# infrastructure/longhorn/ui-ingress/serviceaccount.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: longhorn-vault
-  namespace: longhorn-system
@@ -1,47 +0,0 @@
-# infrastructure/metallb/helmrelease.yaml
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: metallb
-  namespace: metallb-system
-spec:
-  interval: 30m
-  chart:
-    spec:
-      chart: metallb
-      version: 0.15.3
-      sourceRef:
-        kind: HelmRepository
-        name: metallb
-        namespace: flux-system
-  install:
-    crds: CreateReplace
-    remediation: { retries: 3 }
-    timeout: 10m
-  upgrade:
-    crds: CreateReplace
-    remediation:
-      retries: 3
-      remediateLastFailure: true
-    cleanupOnFail: true
-    timeout: 10m
-  values:
-    loadBalancerClass: metallb
-    prometheus:
-      metricsPort: 7472
-    controller:
-      logLevel: info
-      webhookMode: enabled
-      tlsMinVersion: VersionTLS12
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi4
-                      - rpi5
-    speaker:
-      logLevel: info
@@ -3,5 +3,8 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - namespace.yaml
-  - helmrelease.yaml
+  - metallb-rendered.yaml
   - ippool.yaml
+patchesStrategicMerge:
+  - patches/node-placement.yaml
+  - patches/speaker-loglevel.yaml

2411  infrastructure/metallb/metallb-rendered.yaml  (new file)
File diff suppressed because it is too large
27  infrastructure/metallb/patches/node-placement.yaml  (new file)
@@ -0,0 +1,27 @@
+# infrastructure/metallb/patches/node-placement.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: metallb-controller
+  namespace: metallb-system
+spec:
+  template:
+    spec:
+      containers:
+        - name: controller
+          args:
+            - --port=7472
+            - --log-level=info
+            - --webhook-mode=enabled
+            - --tls-min-version=VersionTLS12
+            - --lb-class=metallb
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: hardware
+                    operator: In
+                    values:
+                      - rpi4
+                      - rpi5
15  infrastructure/metallb/patches/speaker-loglevel.yaml  (new file)
@@ -0,0 +1,15 @@
+# infrastructure/metallb/patches/speaker-loglevel.yaml
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: metallb-speaker
+  namespace: metallb-system
+spec:
+  template:
+    spec:
+      containers:
+        - name: speaker
+          args:
+            - --port=7472
+            - --log-level=info
+            - --lb-class=metallb
@@ -1,24 +0,0 @@
-# infrastructure/modules/base/storageclass/asteria-encrypted.yaml
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
-  name: asteria-encrypted
-parameters:
-  diskSelector: asteria
-  fromBackup: ""
-  numberOfReplicas: "2"
-  staleReplicaTimeout: "30"
-  fsType: "ext4"
-  replicaAutoBalance: "least-effort"
-  dataLocality: "disabled"
-  encrypted: "true"
-  csi.storage.k8s.io/provisioner-secret-name: ${pvc.name}
-  csi.storage.k8s.io/provisioner-secret-namespace: ${pvc.namespace}
-  csi.storage.k8s.io/node-publish-secret-name: ${pvc.name}
-  csi.storage.k8s.io/node-publish-secret-namespace: ${pvc.namespace}
-  csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}
-  csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace}
-provisioner: driver.longhorn.io
-reclaimPolicy: Retain
-allowVolumeExpansion: true
-volumeBindingMode: Immediate
@@ -3,5 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - asteria.yaml
-  - asteria-encrypted.yaml
   - astreae.yaml
@@ -11,5 +11,5 @@ spec:
   roleName: "postgres"
   objects: |
     - objectName: "postgres_password"
-      secretPath: "kv/data/atlas/postgres/postgres-db"
+      secretPath: "kv/data/postgres"
      secretKey: "POSTGRES_PASSWORD"
@@ -4,10 +4,6 @@ kind: Service
 metadata:
   name: postgres-service
   namespace: postgres
-  annotations:
-    prometheus.io/scrape: "true"
-    prometheus.io/port: "9187"
-    prometheus.io/path: "/metrics"
 spec:
   clusterIP: None
   ports:
@@ -15,9 +11,5 @@ spec:
       port: 5432
      protocol: TCP
      targetPort: 5432
-    - name: metrics
-      port: 9187
-      protocol: TCP
-      targetPort: 9187
  selector:
    app: postgres
@ -58,23 +58,6 @@ spec:
|
||||
- name: vault-secrets
|
||||
mountPath: /mnt/vault
|
||||
readOnly: true
|
||||
- name: postgres-exporter
|
||||
image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9187
|
||||
protocol: TCP
|
||||
env:
|
||||
- name: DATA_SOURCE_URI
|
||||
value: "localhost:5432/postgres?sslmode=disable"
|
||||
- name: DATA_SOURCE_USER
|
||||
value: postgres
|
||||
- name: DATA_SOURCE_PASS_FILE
|
||||
value: /mnt/vault/postgres_password
|
||||
volumeMounts:
|
||||
- name: vault-secrets
|
||||
mountPath: /mnt/vault
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: vault-secrets
|
||||
csi:
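Note: the removed postgres-exporter sidecar was wired entirely through the three DATA_SOURCE_* variables above. A minimal sketch of the equivalent DSN assembly (illustrative only; names mirror the env vars, this is not the exporter's actual code):

from pathlib import Path

def build_dsn(uri: str = "localhost:5432/postgres?sslmode=disable",
              user: str = "postgres",
              pass_file: str = "/mnt/vault/postgres_password") -> str:
    # The password file is injected by the secrets-store CSI mount at /mnt/vault.
    password = Path(pass_file).read_text().strip()
    return f"postgresql://{user}:{password}@{uri}"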

@ -1,11 +1,10 @@
# infrastructure/sources/cert-manager/letsencrypt-prod.yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    email: brad@bstein.dev
    email: brad.stein@gmail.com
    server: https://acme-v02.api.letsencrypt.org/directory
    privateKeySecretRef:
      name: letsencrypt-prod-account-key

@ -1,11 +1,10 @@
# infrastructure/sources/cert-manager/letsencrypt.yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt
spec:
  acme:
    email: brad@bstein.dev
    email: brad.stein@gmail.com
    server: https://acme-v02.api.letsencrypt.org/directory
    privateKeySecretRef:
      name: letsencrypt-account-key
@ -1,9 +0,0 @@
# infrastructure/sources/helm/ananace.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: ananace
  namespace: flux-system
spec:
  interval: 1h
  url: https://ananace.gitlab.io/charts
@ -2,18 +2,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ananace.yaml
  - fluent-bit.yaml
  - grafana.yaml
  - hashicorp.yaml
  - jetstack.yaml
  - jenkins.yaml
  - mailu.yaml
  - metallb.yaml
  - opentelemetry.yaml
  - opensearch.yaml
  - harbor.yaml
  - longhorn.yaml
  - prometheus.yaml
  - victoria-metrics.yaml
  - secrets-store-csi.yaml
@ -1,9 +0,0 @@
# infrastructure/sources/helm/longhorn.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: longhorn
  namespace: flux-system
spec:
  interval: 30m
  url: https://charts.longhorn.io

@ -1,9 +0,0 @@
# infrastructure/sources/helm/metallb.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: metallb
  namespace: flux-system
spec:
  interval: 1h
  url: https://metallb.github.io/metallb

File diff suppressed because it is too large
@ -27,8 +27,6 @@ items:
      creationTimestamp: null
      labels:
        app: traefik
        app.kubernetes.io/instance: traefik-kube-system
        app.kubernetes.io/name: traefik
    spec:
      containers:
        - args:

@ -5,7 +5,6 @@ metadata:
  name: traefik
  namespace: flux-system
resources:
  - crds.yaml
  - deployment.yaml
  - serviceaccount.yaml
  - clusterrole.yaml
@ -3,10 +3,9 @@ apiVersion: v1
kind: Service
metadata:
  name: traefik
  namespace: traefik
  namespace: kube-system
  annotations:
    metallb.universe.tf/address-pool: communication-pool
    metallb.universe.tf/allow-shared-ip: traefik
spec:
  type: LoadBalancer
  loadBalancerClass: metallb
@ -21,4 +20,5 @@ spec:
      targetPort: websecure
      protocol: TCP
  selector:
    app: traefik
    app.kubernetes.io/instance: traefik-kube-system
    app.kubernetes.io/name: traefik

@ -17,5 +17,4 @@ spec:
  values:
    syncSecret:
      enabled: true
    enableSecretRotation: true
    rotationPollInterval: 2m
    enableSecretRotation: false
@ -1,43 +0,0 @@
# infrastructure/vault-injector/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: vault-injector
  namespace: vault
spec:
  interval: 30m
  chart:
    spec:
      chart: vault
      version: 0.31.0
      sourceRef:
        kind: HelmRepository
        name: hashicorp
        namespace: flux-system
  install:
    remediation: { retries: 3 }
    timeout: 10m
  upgrade:
    remediation:
      retries: 3
      remediateLastFailure: true
    cleanupOnFail: true
    timeout: 10m
  values:
    global:
      externalVaultAddr: http://vault.vault.svc.cluster.local:8200
      tlsDisable: true
    server:
      enabled: false
    csi:
      enabled: false
    injector:
      enabled: true
      replicas: 1
      agentImage:
        repository: hashicorp/vault
        tag: "1.17.6"
      webhook:
        failurePolicy: Ignore
      nodeSelector:
        node-role.kubernetes.io/worker: "true"

@ -1,5 +0,0 @@
# infrastructure/vault-injector/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrelease.yaml
@ -1,8 +1,8 @@
{
  "counts": {
    "helmrelease_host_hints": 19,
    "http_endpoints": 45,
    "services": 47,
    "workloads": 74
    "helmrelease_host_hints": 7,
    "http_endpoints": 35,
    "services": 44,
    "workloads": 49
  }
}

File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@ -17,11 +17,6 @@ flowchart LR
    host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
    wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
    svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
    host_budget_bstein_dev["budget.bstein.dev"]
    svc_finance_actual_budget["finance/actual-budget (Service)"]
    host_budget_bstein_dev --> svc_finance_actual_budget
    wl_finance_actual_budget["finance/actual-budget (Deployment)"]
    svc_finance_actual_budget --> wl_finance_actual_budget
    host_call_live_bstein_dev["call.live.bstein.dev"]
    svc_comms_element_call["comms/element-call (Service)"]
    host_call_live_bstein_dev --> svc_comms_element_call
@ -42,11 +37,6 @@ flowchart LR
    host_cloud_bstein_dev --> svc_nextcloud_nextcloud
    wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
    svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
    host_health_bstein_dev["health.bstein.dev"]
    svc_health_wger["health/wger (Service)"]
    host_health_bstein_dev --> svc_health_wger
    wl_health_wger["health/wger (Deployment)"]
    svc_health_wger --> wl_health_wger
    host_kit_live_bstein_dev["kit.live.bstein.dev"]
    svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
    host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@ -57,22 +47,15 @@ flowchart LR
    wl_comms_livekit["comms/livekit (Deployment)"]
    svc_comms_livekit --> wl_comms_livekit
    host_live_bstein_dev["live.bstein.dev"]
    svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
    host_live_bstein_dev --> svc_comms_othrys_element_element_web
    wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
    svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
    host_live_bstein_dev --> svc_comms_matrix_wellknown
    svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
    host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
    svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
    host_live_bstein_dev --> svc_comms_matrix_guest_register
    wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
    svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
    svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
    host_live_bstein_dev --> svc_comms_matrix_authentication_service
    wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
    svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
    host_logs_bstein_dev["logs.bstein.dev"]
    svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
    host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
    wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
    svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
    wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
    svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
    host_longhorn_bstein_dev["longhorn.bstein.dev"]
    svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
    host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
@ -82,25 +65,21 @@ flowchart LR
    svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
    host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
    host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
    svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
    host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
    wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
    svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
    host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
    host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
    svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
    host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
    wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
    svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
    host_monero_bstein_dev["monero.bstein.dev"]
    svc_crypto_monerod["crypto/monerod (Service)"]
    host_monero_bstein_dev --> svc_crypto_monerod
    wl_crypto_monerod["crypto/monerod (Deployment)"]
    svc_crypto_monerod --> wl_crypto_monerod
    host_money_bstein_dev["money.bstein.dev"]
    svc_finance_firefly["finance/firefly (Service)"]
    host_money_bstein_dev --> svc_finance_firefly
    wl_finance_firefly["finance/firefly (Deployment)"]
    svc_finance_firefly --> wl_finance_firefly
    host_notes_bstein_dev["notes.bstein.dev"]
    svc_outline_outline["outline/outline (Service)"]
    host_notes_bstein_dev --> svc_outline_outline
    wl_outline_outline["outline/outline (Deployment)"]
    svc_outline_outline --> wl_outline_outline
    host_office_bstein_dev["office.bstein.dev"]
    svc_nextcloud_collabora["nextcloud/collabora (Service)"]
    host_office_bstein_dev --> svc_nextcloud_collabora
@ -131,11 +110,6 @@ flowchart LR
    host_stream_bstein_dev --> svc_jellyfin_jellyfin
    wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
    svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
    host_tasks_bstein_dev["tasks.bstein.dev"]
    svc_planka_planka["planka/planka (Service)"]
    host_tasks_bstein_dev --> svc_planka_planka
    wl_planka_planka["planka/planka (Deployment)"]
    svc_planka_planka --> wl_planka_planka
    host_vault_bstein_dev["vault.bstein.dev"]
    svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
    host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
@ -159,30 +133,23 @@ flowchart LR
        wl_comms_livekit_token_service
        svc_comms_livekit
        wl_comms_livekit
        svc_comms_othrys_element_element_web
        wl_comms_othrys_element_element_web
        svc_comms_othrys_synapse_matrix_synapse
        svc_comms_matrix_guest_register
        wl_comms_matrix_guest_register
        wl_comms_othrys_synapse_matrix_synapse
        svc_comms_matrix_authentication_service
        wl_comms_matrix_authentication_service
        svc_comms_matrix_guest_register
        wl_comms_matrix_guest_register
    end
    subgraph crypto[crypto]
        svc_crypto_monerod
        wl_crypto_monerod
    end
    subgraph finance[finance]
        svc_finance_actual_budget
        wl_finance_actual_budget
        svc_finance_firefly
        wl_finance_firefly
    end
    subgraph gitea[gitea]
        svc_gitea_gitea
        wl_gitea_gitea
    end
    subgraph health[health]
        svc_health_wger
        wl_health_wger
    end
    subgraph jellyfin[jellyfin]
        svc_jellyfin_pegasus
        wl_jellyfin_pegasus
@ -193,10 +160,6 @@ flowchart LR
        svc_jenkins_jenkins
        wl_jenkins_jenkins
    end
    subgraph logging[logging]
        svc_logging_oauth2_proxy_logs
        wl_logging_oauth2_proxy_logs
    end
    subgraph longhorn_system[longhorn-system]
        svc_longhorn_system_oauth2_proxy_longhorn
        wl_longhorn_system_oauth2_proxy_longhorn
@ -210,14 +173,6 @@ flowchart LR
        svc_nextcloud_collabora
        wl_nextcloud_collabora
    end
    subgraph outline[outline]
        svc_outline_outline
        wl_outline_outline
    end
    subgraph planka[planka]
        svc_planka_planka
        wl_planka_planka
    end
    subgraph sso[sso]
        svc_sso_oauth2_proxy
        wl_sso_oauth2_proxy
@ -70,7 +70,6 @@ WORKER_NODES = [
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-16",
    "titan-17",
    "titan-18",
    "titan-19",
@ -86,17 +85,19 @@ WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
    "kube-.*",
    ".*-system",
    "traefik",
INFRA_NAMESPACES = [
    "kube-system",
    "longhorn-system",
    "metallb-system",
    "monitoring",
    "logging",
    "cert-manager",
    "flux-system",
    "traefik",
    "maintenance",
    "postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
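Note: swapping the glob-style INFRA_PATTERNS for an explicit INFRA_NAMESPACES list tightens the anchored regex. A small self-check of the joined pattern (values recomputed from the list above):

import re

INFRA_NAMESPACES = [
    "kube-system", "longhorn-system", "metallb-system", "monitoring", "logging",
    "cert-manager", "flux-system", "traefik", "maintenance", "postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"

assert re.fullmatch(INFRA_REGEX, "kube-system")
assert not re.fullmatch(INFRA_REGEX, "kube-public")   # the old "kube-.*" pattern matched this
assert not re.fullmatch(INFRA_REGEX, "vault-system")  # the old ".*-system" pattern matched this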

@ -208,66 +209,7 @@ def namespace_ram_raw(scope_var):


def namespace_gpu_usage_instant(scope_var):
    return gpu_usage_by_namespace(scope_var)


def jetson_gpu_util_by_node():
    return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'


def dcgm_gpu_util_by_node():
    dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
    dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
    return (
        "avg by (node) ("
        f"{dcgm_ns} * on(namespace,pod) group_left(node) "
        'kube_pod_info{namespace="monitoring"}'
        ")"
    )


def gpu_util_by_node():
    return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"


def gpu_util_by_hostname():
    return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'


def gpu_node_labels():
    return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'


def gpu_requests_by_namespace_node(scope_var):
    return (
        "sum by (namespace,node) ("
        f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
        "* on(namespace,pod) group_left(node) kube_pod_info "
        f"* on(node) group_left() ({gpu_node_labels()})"
        ")"
    )


def gpu_usage_by_namespace(scope_var):
    requests_by_ns = gpu_requests_by_namespace_node(scope_var)
    total_by_node = f"sum by (node) ({requests_by_ns})"
    return (
        "sum by (namespace) ("
        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
        f"* on(node) group_left() ({gpu_util_by_node()})"
        ")"
    )


def jetson_gpu_usage_by_namespace(scope_var):
    requests_by_ns = jetson_gpu_requests(scope_var)
    total_by_node = f"sum by (node) ({requests_by_ns})"
    return (
        "sum by (namespace) ("
        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
        f"* on(node) group_left() {jetson_gpu_util_by_node()}"
        ")"
    )
    return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"


def namespace_share_expr(resource_expr):
@ -287,7 +229,7 @@ def namespace_gpu_share_expr(scope_var):
    usage = namespace_gpu_usage_instant(scope_var)
    total = f"(sum({usage}) or on() vector(0))"
    share = f"100 * ({usage}) / clamp_min({total}, 1)"
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
    return f"({share}) or ({idle})"
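Note: the idle-series change is behavioral, not cosmetic. The old form multiplies by scalar(total == bool 0), which always emits an "idle" series (value 0 when GPUs are busy); the new `and on() (total == 0)` drops it entirely unless total is zero. Modeled on plain Python values (a sketch of the semantics, not PromQL itself):

def idle_series(total: float, new_semantics: bool):
    if new_semantics:
        # `and on() (total == 0)`: the series exists only when total is 0
        return 100.0 if total == 0 else None
    # old `* scalar(total == bool 0)`: a 0-valued series is still emitted
    return 100.0 * (1 if total == 0 else 0)

assert idle_series(0.0, True) == 100.0
assert idle_series(3.5, True) is None
assert idle_series(3.5, False) == 0.0  # the stray zero "idle" bar the new form avoids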

@ -377,76 +319,6 @@ NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})"
GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})"
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
GLUE_STALE_WINDOW_SEC = 36 * 3600
GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)"
GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)"
GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)"
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))'
ARIADNE_TASK_WARNINGS_SERIES = (
    'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
ARIADNE_TEST_SUCCESS_RATE = (
    "100 * "
    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
    "/ clamp_min("
    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
)
ARIADNE_TEST_FAILURES_24H = (
    'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
)
POSTGRES_CONN_USED = (
    'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
    'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
)
POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
ONEOFF_JOB_OWNER = (
    'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
)
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
ONEOFF_JOB_POD_AGE_HOURS = (
    '((time() - kube_pod_start_time{pod!=""}) / 3600) '
    f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
    '* on(namespace,pod) group_left(phase) '
    'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
)
GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600"
GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600"
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
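Note: these constants nest by f-string interpolation, so each one expands into a complete PromQL expression. Recomputing a few of the definitions above shows the final shape of the staleness check (same values, printed standalone):

GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
print(f"({GLUE_LAST_SUCCESS_AGE} > bool {36 * 3600})")
# ((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob)
#   kube_cronjob_labels{label_atlas_bstein_dev_glue="true"})) > bool 129600)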

@ -624,7 +496,6 @@ def timeseries_panel(
    grid,
    *,
    unit="none",
    max_value=None,
    legend=None,
    legend_display="table",
    legend_placement="bottom",
@ -649,8 +520,6 @@ def timeseries_panel(
            "tooltip": {"mode": "multi"},
        },
    }
    if max_value is not None:
        panel["fieldConfig"]["defaults"]["max"] = max_value
    if legend:
        panel["targets"][0]["legendFormat"] = legend
    if legend_calcs:
@ -802,22 +671,13 @@ def bargauge_panel(
    grid,
    *,
    unit="none",
    legend=None,
    links=None,
    limit=None,
    sort_order="desc",
    thresholds=None,
    decimals=None,
    instant=False,
    overrides=None,
):
    """Return a bar gauge panel with label-aware reduction."""
    cleaned_expr = expr.strip()
    if not cleaned_expr.startswith(("sort(", "sort_desc(")):
        if sort_order == "desc":
            expr = f"sort_desc({expr})"
        elif sort_order == "asc":
            expr = f"sort({expr})"
    panel = {
        "id": panel_id,
        "type": "bargauge",
@ -825,12 +685,7 @@ def bargauge_panel(
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [
            {
                "expr": expr,
                "refId": "A",
                "legendFormat": legend or "{{node}}",
                **({"instant": True} if instant else {}),
            }
            {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
        ],
        "fieldConfig": {
            "defaults": {
@ -860,8 +715,6 @@ def bargauge_panel(
            },
        },
    }
    if overrides:
        panel["fieldConfig"]["overrides"].extend(overrides)
    if decimals is not None:
        panel["fieldConfig"]["defaults"]["decimals"] = decimals
    if links:
@ -870,7 +723,7 @@ def bargauge_panel(
    panel["transformations"] = [
        {
            "id": "sortBy",
            "options": {"fields": ["Value"], "order": sort_order},
            "options": {"fields": ["Value"], "order": "desc"},
        }
    ]
    if limit:
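Note: bargauge_panel sorts twice on purpose: once in PromQL via the wrapper above, and once client-side via the sortBy transformation. The wrapping guard, isolated (same logic, standalone):

def wrap_sort(expr: str, sort_order: str = "desc") -> str:
    if expr.strip().startswith(("sort(", "sort_desc(")):
        return expr  # query already sorted upstream; leave untouched
    if sort_order == "desc":
        return f"sort_desc({expr})"
    if sort_order == "asc":
        return f"sort({expr})"
    return expr

assert wrap_sort("topk(5, up)") == "sort_desc(topk(5, up))"
assert wrap_sort("sort_desc(up)") == "sort_desc(up)"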

@ -910,15 +763,6 @@ def build_overview():
            {"color": "red", "value": 3},
        ],
    }
    age_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 6},
            {"color": "orange", "value": 24},
            {"color": "red", "value": 48},
        ],
    }

    row1_stats = [
        {
@ -1121,7 +965,7 @@ def build_overview():
            30,
            "Mail Sent (1d)",
            'max(postmark_outbound_sent{window="1d"})',
            {"h": 3, "w": 4, "x": 0, "y": 8},
            {"h": 2, "w": 6, "x": 0, "y": 8},
            unit="none",
            links=link_to("atlas-mail"),
        )
@ -1132,7 +976,7 @@ def build_overview():
            "type": "stat",
            "title": "Mail Bounces (1d)",
            "datasource": PROM_DS,
            "gridPos": {"h": 3, "w": 4, "x": 8, "y": 8},
            "gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
            "targets": [
                {
                    "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@ -1178,7 +1022,7 @@ def build_overview():
            32,
            "Mail Success Rate (1d)",
            'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
            {"h": 3, "w": 4, "x": 4, "y": 8},
            {"h": 2, "w": 6, "x": 6, "y": 8},
            unit="percent",
            thresholds=mail_success_thresholds,
            decimals=1,
@ -1190,38 +1034,13 @@ def build_overview():
            33,
            "Mail Limit Used (30d)",
            "max(postmark_sending_limit_used_percent)",
            {"h": 3, "w": 4, "x": 12, "y": 8},
            {"h": 2, "w": 6, "x": 18, "y": 8},
            unit="percent",
            thresholds=mail_limit_thresholds,
            decimals=1,
            links=link_to("atlas-mail"),
        )
    )
    panels.append(
        stat_panel(
            34,
            "Postgres Connections Used",
            POSTGRES_CONN_USED,
            {"h": 3, "w": 4, "x": 16, "y": 8},
            decimals=0,
            text_mode="name_and_value",
            legend="{{conn}}",
            instant=True,
        )
    )
    panels.append(
        stat_panel(
            35,
            "Postgres Hottest Connections",
            POSTGRES_CONN_HOTTEST,
            {"h": 3, "w": 4, "x": 20, "y": 8},
            unit="none",
            decimals=0,
            text_mode="name_and_value",
            legend="{{datname}}",
            instant=True,
        )
    )

    storage_panels = [
        (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
@ -1235,104 +1054,13 @@ def build_overview():
                panel_id,
                title,
                expr,
                {"h": 3, "w": 6, "x": 6 * idx, "y": 11},
                {"h": 6, "w": 6, "x": 6 * idx, "y": 10},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                links=link_to("atlas-storage"),
            )
        )

    panels.append(
        bargauge_panel(
            40,
            "One-off Job Pods (age hours)",
            ONEOFF_JOB_POD_AGE_HOURS,
            {"h": 6, "w": 6, "x": 0, "y": 14},
            unit="h",
            instant=True,
            legend="{{namespace}}/{{pod}}",
            thresholds=age_thresholds,
            limit=8,
            decimals=2,
        )
    )
    panels.append(
        {
            "id": 41,
            "type": "timeseries",
            "title": "Ariadne Attempts / Failures",
            "datasource": PROM_DS,
            "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
            "targets": [
                {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
            ],
            "fieldConfig": {
                "defaults": {"unit": "none"},
                "overrides": [
                    {
                        "matcher": {"id": "byName", "options": "Attempts"},
                        "properties": [
                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
                        ],
                    },
                    {
                        "matcher": {"id": "byName", "options": "Failures"},
                        "properties": [
                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
                        ],
                    },
                ],
            },
            "options": {
                "legend": {"displayMode": "table", "placement": "right"},
                "tooltip": {"mode": "multi"},
            },
        }
    )
    panels.append(
        timeseries_panel(
            42,
            "Ariadne Test Success Rate",
            ARIADNE_TEST_SUCCESS_RATE,
            {"h": 6, "w": 6, "x": 12, "y": 14},
            unit="percent",
            max_value=100,
            legend=None,
            legend_display="list",
        )
    )
    panels.append(
        bargauge_panel(
            43,
            "Tests with Failures (24h)",
            ARIADNE_TEST_FAILURES_24H,
            {"h": 6, "w": 6, "x": 18, "y": 14},
            unit="none",
            instant=True,
            legend="{{result}}",
            overrides=[
                {
                    "matcher": {"id": "byName", "options": "error"},
                    "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
                },
                {
                    "matcher": {"id": "byName", "options": "failed"},
                    "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
                },
            ],
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 5},
                    {"color": "red", "value": 10},
                ],
            },
        )
    )

    cpu_scope = "$namespace_scope_cpu"
    gpu_scope = "$namespace_scope_gpu"
    ram_scope = "$namespace_scope_ram"
@ -1342,9 +1070,9 @@ def build_overview():
            11,
            "Namespace CPU Share",
            namespace_cpu_share_expr(cpu_scope),
            {"h": 9, "w": 8, "x": 0, "y": 20},
            {"h": 9, "w": 8, "x": 0, "y": 16},
            links=namespace_scope_links("namespace_scope_cpu"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
            description="Values are normalized within the selected scope; use panel links to switch scope.",
        )
    )
    panels.append(
@ -1352,9 +1080,9 @@ def build_overview():
            12,
            "Namespace GPU Share",
            namespace_gpu_share_expr(gpu_scope),
            {"h": 9, "w": 8, "x": 8, "y": 20},
            {"h": 9, "w": 8, "x": 8, "y": 16},
            links=namespace_scope_links("namespace_scope_gpu"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
            description="Values are normalized within the selected scope; use panel links to switch scope.",
        )
    )
    panels.append(
@ -1362,9 +1090,9 @@ def build_overview():
            13,
            "Namespace RAM Share",
            namespace_ram_share_expr(ram_scope),
            {"h": 9, "w": 8, "x": 16, "y": 20},
            {"h": 9, "w": 8, "x": 16, "y": 16},
            links=namespace_scope_links("namespace_scope_ram"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
            description="Values are normalized within the selected scope; use panel links to switch scope.",
        )
    )

@ -1374,7 +1102,7 @@ def build_overview():
            14,
            "Worker Node CPU",
            node_cpu_expr(worker_filter),
            {"h": 12, "w": 12, "x": 0, "y": 36},
            {"h": 12, "w": 12, "x": 0, "y": 32},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
@ -1388,7 +1116,7 @@ def build_overview():
            15,
            "Worker Node RAM",
            node_mem_expr(worker_filter),
            {"h": 12, "w": 12, "x": 12, "y": 36},
            {"h": 12, "w": 12, "x": 12, "y": 32},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
@ -1403,7 +1131,7 @@ def build_overview():
            16,
            "Control plane CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 10, "w": 12, "x": 0, "y": 48},
            {"h": 10, "w": 12, "x": 0, "y": 44},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
@ -1415,7 +1143,7 @@ def build_overview():
            17,
            "Control plane RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 10, "w": 12, "x": 12, "y": 48},
            {"h": 10, "w": 12, "x": 12, "y": 44},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
@ -1428,7 +1156,7 @@ def build_overview():
            28,
            "Node Pod Share",
            '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
            {"h": 10, "w": 12, "x": 0, "y": 58},
            {"h": 10, "w": 12, "x": 0, "y": 54},
        )
    )
    panels.append(
@ -1436,7 +1164,7 @@ def build_overview():
            29,
            "Top Nodes by Pod Count",
            'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
            {"h": 10, "w": 12, "x": 12, "y": 58},
            {"h": 10, "w": 12, "x": 12, "y": 54},
            unit="none",
            limit=12,
            decimals=0,
@ -1458,7 +1186,7 @@ def build_overview():
            18,
            "Cluster Ingress Throughput",
            NET_INGRESS_EXPR,
            {"h": 7, "w": 8, "x": 0, "y": 29},
            {"h": 7, "w": 8, "x": 0, "y": 25},
            unit="Bps",
            legend="Ingress (Traefik)",
            legend_display="list",
@ -1471,7 +1199,7 @@ def build_overview():
            19,
            "Cluster Egress Throughput",
            NET_EGRESS_EXPR,
            {"h": 7, "w": 8, "x": 8, "y": 29},
            {"h": 7, "w": 8, "x": 8, "y": 25},
            unit="Bps",
            legend="Egress (Traefik)",
            legend_display="list",
@ -1484,7 +1212,7 @@ def build_overview():
            20,
            "Intra-Cluster Throughput",
            NET_INTERNAL_EXPR,
            {"h": 7, "w": 8, "x": 16, "y": 29},
            {"h": 7, "w": 8, "x": 16, "y": 25},
            unit="Bps",
            legend="Internal traffic",
            legend_display="list",
@ -1498,7 +1226,7 @@ def build_overview():
            21,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 16, "w": 12, "x": 0, "y": 68},
            {"h": 16, "w": 12, "x": 0, "y": 64},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
@ -1513,7 +1241,7 @@ def build_overview():
            22,
            "Nodes Closest to Full Root Disks",
            f"topk(12, {root_usage_expr()})",
            {"h": 16, "w": 12, "x": 12, "y": 68},
            {"h": 16, "w": 12, "x": 12, "y": 64},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
            links=link_to("atlas-storage"),
@ -1999,7 +1727,7 @@ def build_storage_dashboard():
        stat_panel(
            31,
            "Maintenance Cron Freshness (s)",
            'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})',
            'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
            {"h": 4, "w": 12, "x": 12, "y": 44},
            unit="s",
            thresholds={
@ -2408,285 +2136,6 @@ def build_mail_dashboard():
    }


def build_jobs_dashboard():
    panels = []
    age_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 6},
            {"color": "orange", "value": 24},
            {"color": "red", "value": 48},
        ],
    }
    recent_error_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "red", "value": None},
            {"color": "orange", "value": 1},
            {"color": "yellow", "value": 6},
            {"color": "green", "value": 24},
        ],
    }

    task_error_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 1},
            {"color": "orange", "value": 3},
            {"color": "red", "value": 5},
        ],
    }

    panels.append(
        bargauge_panel(
            1,
            "Ariadne Task Errors (range)",
            ARIADNE_TASK_ERRORS_RANGE,
            {"h": 7, "w": 8, "x": 0, "y": 0},
            unit="none",
            instant=True,
            legend="{{task}}",
            thresholds=task_error_thresholds,
        )
    )
    panels.append(
        {
            "id": 2,
            "type": "timeseries",
            "title": "Ariadne Attempts / Failures",
            "datasource": PROM_DS,
            "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
            "targets": [
                {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
            ],
            "fieldConfig": {
                "defaults": {"unit": "none"},
                "overrides": [
                    {
                        "matcher": {"id": "byName", "options": "Attempts"},
                        "properties": [
                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
                        ],
                    },
                    {
                        "matcher": {"id": "byName", "options": "Failures"},
                        "properties": [
                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
                        ],
                    },
                ],
            },
            "options": {
                "legend": {"displayMode": "table", "placement": "right"},
                "tooltip": {"mode": "multi"},
            },
        }
    )
    panels.append(
        bargauge_panel(
            3,
            "One-off Job Pods (age hours)",
            ONEOFF_JOB_POD_AGE_HOURS,
            {"h": 7, "w": 8, "x": 16, "y": 0},
            unit="h",
            instant=True,
            legend="{{namespace}}/{{pod}}",
            thresholds=age_thresholds,
            limit=12,
            decimals=2,
        )
    )
    panels.append(
        stat_panel(
            4,
            "Glue Jobs Stale (>36h)",
            GLUE_STALE_COUNT,
            {"h": 4, "w": 4, "x": 0, "y": 7},
            unit="none",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 3},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            5,
            "Glue Jobs Missing Success",
            GLUE_MISSING_COUNT,
            {"h": 4, "w": 4, "x": 4, "y": 7},
            unit="none",
        )
    )
    panels.append(
        stat_panel(
            6,
            "Glue Jobs Suspended",
            GLUE_SUSPENDED_COUNT,
            {"h": 4, "w": 4, "x": 8, "y": 7},
            unit="none",
        )
    )
    panels.append(
        stat_panel(
            7,
            "Ariadne Task Errors (1h)",
            ARIADNE_TASK_ERRORS_1H_TOTAL,
            {"h": 4, "w": 4, "x": 12, "y": 7},
            unit="none",
        )
    )
    panels.append(
        stat_panel(
            8,
            "Ariadne Task Errors (24h)",
            ARIADNE_TASK_ERRORS_24H_TOTAL,
            {"h": 4, "w": 4, "x": 16, "y": 7},
            unit="none",
        )
    )
    panels.append(
        stat_panel(
            9,
            "Ariadne Task Runs (1h)",
            ARIADNE_TASK_RUNS_1H_TOTAL,
            {"h": 4, "w": 4, "x": 20, "y": 7},
            unit="none",
        )
    )
    panels.append(
        bargauge_panel(
            10,
            "Ariadne Schedule Last Error (hours ago)",
            ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
            {"h": 6, "w": 12, "x": 0, "y": 17},
            unit="h",
            instant=True,
            legend="{{task}}",
            thresholds=recent_error_thresholds,
            decimals=2,
        )
    )
    panels.append(
        bargauge_panel(
            11,
            "Ariadne Schedule Last Success (hours ago)",
            ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
            {"h": 6, "w": 12, "x": 12, "y": 17},
            unit="h",
            instant=True,
            legend="{{task}}",
            thresholds=age_thresholds,
            decimals=2,
        )
    )
    panels.append(
        bargauge_panel(
            12,
            "Glue Jobs Last Success (hours ago)",
            GLUE_LAST_SUCCESS_RANGE_HOURS,
            {"h": 6, "w": 12, "x": 0, "y": 23},
            unit="h",
            instant=True,
            legend="{{namespace}}/{{cronjob}}",
            thresholds=age_thresholds,
            decimals=2,
        )
    )
    panels.append(
        bargauge_panel(
            13,
            "Glue Jobs Last Schedule (hours ago)",
            GLUE_LAST_SCHEDULE_RANGE_HOURS,
            {"h": 6, "w": 12, "x": 12, "y": 23},
            unit="h",
            instant=True,
            legend="{{namespace}}/{{cronjob}}",
            thresholds=age_thresholds,
            decimals=2,
        )
    )
    panels.append(
        bargauge_panel(
            14,
            "Ariadne Task Errors (1h)",
            ARIADNE_TASK_ERRORS_1H,
            {"h": 6, "w": 12, "x": 0, "y": 29},
            unit="none",
            instant=True,
            legend="{{task}}",
            thresholds=task_error_thresholds,
        )
    )
    panels.append(
        bargauge_panel(
            15,
            "Ariadne Task Errors (30d)",
            ARIADNE_TASK_ERRORS_30D,
            {"h": 6, "w": 12, "x": 12, "y": 29},
            unit="none",
            instant=True,
            legend="{{task}}",
            thresholds=task_error_thresholds,
        )
    )
    panels.append(
        bargauge_panel(
            16,
            "Ariadne Access Requests",
            ARIADNE_ACCESS_REQUESTS,
            {"h": 6, "w": 8, "x": 0, "y": 11},
            unit="none",
            instant=True,
            legend="{{status}}",
        )
    )
    panels.append(
        stat_panel(
            17,
            "Ariadne CI Coverage (%)",
            ARIADNE_CI_COVERAGE,
            {"h": 6, "w": 4, "x": 8, "y": 11},
            unit="percent",
            decimals=1,
            instant=True,
            legend="{{branch}}",
        )
    )
    panels.append(
        table_panel(
            18,
            "Ariadne CI Tests (latest)",
            ARIADNE_CI_TESTS,
            {"h": 6, "w": 12, "x": 12, "y": 11},
            unit="none",
            transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
            instant=True,
        )
    )

    return {
        "uid": "atlas-jobs",
        "title": "Atlas Jobs",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-7d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "jobs", "glue"],
    }


def build_gpu_dashboard():
    panels = []
    gpu_scope = "$namespace_scope_gpu"
@ -2697,7 +2146,7 @@ def build_gpu_dashboard():
            namespace_gpu_share_expr(gpu_scope),
            {"h": 8, "w": 12, "x": 0, "y": 0},
            links=namespace_scope_links("namespace_scope_gpu"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
            description="Values are normalized within the selected scope; use panel links to switch scope.",
        )
    )
    panels.append(
@ -2716,7 +2165,7 @@ def build_gpu_dashboard():
        timeseries_panel(
            3,
            "GPU Util by Node",
            gpu_util_by_hostname(),
            'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
            {"h": 8, "w": 12, "x": 0, "y": 8},
            unit="percent",
            legend="{{Hostname}}",
@ -2780,10 +2229,6 @@ DASHBOARDS = {
        "builder": build_mail_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
    },
    "atlas-jobs": {
        "builder": build_jobs_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
    },
    "atlas-gpu": {
        "builder": build_gpu_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",

@ -20,13 +20,11 @@ import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
import shutil
from typing import Any, Iterable

import yaml

REPO_ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards"

CLUSTER_SCOPED_KINDS = {
    "Namespace",
@ -62,70 +60,6 @@ def _run(cmd: list[str], *, cwd: Path) -> str:
    return res.stdout


def _sync_tree(source: Path, dest: Path) -> None:
    if dest.exists():
        shutil.rmtree(dest)
    shutil.copytree(source, dest)


def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]:
    panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else []
    for panel in panels:
        if not isinstance(panel, dict):
            continue
        if panel.get("type") == "row" and isinstance(panel.get("panels"), list):
            yield from _iter_dashboard_panels({"panels": panel.get("panels")})
            continue
        yield panel


def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]:
    index: list[dict[str, Any]] = []
    for path in sorted(dashboard_dir.glob("*.json")):
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict):
            continue
        dash_title = data.get("title") or path.stem
        dash_tags = data.get("tags") or []
        for panel in _iter_dashboard_panels(data):
            targets = panel.get("targets")
            if not isinstance(targets, list):
                continue
            exprs: list[str] = []
            for target in targets:
                if not isinstance(target, dict):
                    continue
                expr = target.get("expr")
                if isinstance(expr, str) and expr.strip():
                    exprs.append(expr.strip())
            if not exprs:
                continue
            datasource = panel.get("datasource") or {}
            if isinstance(datasource, dict):
                ds_uid = datasource.get("uid")
                ds_type = datasource.get("type")
            else:
                ds_uid = None
                ds_type = None
            index.append(
                {
                    "dashboard": dash_title,
                    "panel_title": panel.get("title") or "",
                    "panel_id": panel.get("id"),
                    "panel_type": panel.get("type"),
                    "description": panel.get("description") or "",
                    "tags": dash_tags,
                    "datasource_uid": ds_uid,
                    "datasource_type": ds_type,
                    "exprs": exprs,
                }
            )
    return index


def kustomize_build(path: Path) -> str:
    rel = path.relative_to(REPO_ROOT)
    try:
@ -538,11 +472,6 @@ def main() -> int:
        action="store_true",
        help="Write generated files (otherwise just print a summary).",
    )
    ap.add_argument(
        "--sync-comms",
        action="store_true",
        help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
    )
    args = ap.parse_args()

    out_dir = REPO_ROOT / args.out
@ -575,11 +504,8 @@ def main() -> int:
    summary_path = out_dir / "catalog" / "atlas-summary.json"
    diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
    runbooks_json_path = out_dir / "catalog" / "runbooks.json"
    metrics_json_path = out_dir / "catalog" / "metrics.json"

    catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix()
    catalog_path.write_text(
        f"# {catalog_rel}\n"
        "# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)\n"
        + yaml.safe_dump(catalog, sort_keys=False),
        encoding="utf-8",
@ -589,14 +515,9 @@ def main() -> int:
    diagram_path.write_text(diagram, encoding="utf-8")

    # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
    runbook_dirs = [
        out_dir / "runbooks",
        out_dir / "software",
    ]
    runbooks_dir = out_dir / "runbooks"
    runbooks: list[dict[str, Any]] = []
    for runbooks_dir in runbook_dirs:
        if not runbooks_dir.exists():
            continue
    if runbooks_dir.exists():
        for md_file in sorted(runbooks_dir.glob("*.md")):
            raw = md_file.read_text(encoding="utf-8")
            fm: dict[str, Any] = {}
@ -620,22 +541,12 @@ def main() -> int:
            }
        )
    runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    metrics_index = _extract_metrics_index(DASHBOARD_DIR)
    metrics_json_path.write_text(
        json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8"
    )

    print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")

    if args.sync_comms:
        comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
        _sync_tree(out_dir, comms_dir)
        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
    return 0
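Note: the runbook loop above reads markdown with optional YAML frontmatter into `fm`, but the parsing body is elided from this hunk. A hedged sketch of the conventional split it presumably performs:

import yaml

def split_frontmatter(raw: str) -> tuple[dict, str]:
    # "---\n<yaml>\n---\n<body>" -> (metadata dict, markdown body)
    if raw.startswith("---\n"):
        header, _, body = raw[4:].partition("\n---\n")
        return (yaml.safe_load(header) or {}, body)
    return ({}, raw)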

@ -7,8 +7,6 @@ test accounts created via the bstein-dev-home onboarding portal.
Targets (best-effort):
- Keycloak users in realm "atlas"
- Atlas portal Postgres rows (access_requests + dependent tables)
- Mailu mailboxes created for test users
- Nextcloud Mail accounts created for test users
- Vaultwarden users/invites created by the portal

Safety:
@ -58,19 +56,6 @@ class VaultwardenUser:
    status: int


@dataclass(frozen=True)
class MailuUser:
    email: str
    localpart: str
    domain: str


@dataclass(frozen=True)
class NextcloudMailAccount:
    account_id: str
    email: str


def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
    proc = subprocess.run(
        cmd,
@ -85,19 +70,6 @@ def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
    return proc.stdout.decode("utf-8", errors="replace")


def _run_capture(cmd: list[str], *, input_bytes: bytes | None = None) -> tuple[int, str, str]:
    proc = subprocess.run(
        cmd,
        input=input_bytes,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
    )
    stdout = proc.stdout.decode("utf-8", errors="replace")
    stderr = proc.stderr.decode("utf-8", errors="replace")
    return proc.returncode, stdout, stderr


def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str:
    raw_b64 = _run(
        [
@ -138,21 +110,6 @@ def _kubectl_first_pod(namespace: str) -> str:
    return pod_name


def _kubectl_exec(namespace: str, target: str, cmd: list[str]) -> tuple[int, str, str]:
    return _run_capture(
        [
            "kubectl",
            "-n",
            namespace,
            "exec",
            "-i",
            target,
            "--",
            *cmd,
        ]
    )


def _validate_prefixes(prefixes: list[str]) -> list[str]:
    cleaned: list[str] = []
    for prefix in prefixes:
@ -230,62 +187,6 @@ def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) ->
        raise


def _sql_quote(value: str) -> str:
    return "'" + value.replace("'", "''") + "'"


def _psql_exec(db_name: str, sql: str, *, user: str = "postgres") -> str:
    postgres_pod = _kubectl_first_pod("postgres")
    return _run(
        [
            "kubectl",
            "-n",
            "postgres",
            "exec",
            "-i",
            postgres_pod,
            "--",
            "psql",
            "-U",
            user,
            "-d",
            db_name,
            "-c",
            sql,
        ]
    )


def _psql_tsv(db_name: str, sql: str, *, user: str = "postgres") -> list[list[str]]:
    postgres_pod = _kubectl_first_pod("postgres")
    out = _run(
        [
            "kubectl",
            "-n",
            "postgres",
            "exec",
            "-i",
            postgres_pod,
            "--",
            "psql",
            "-U",
            user,
            "-d",
            db_name,
            "-At",
            "-F",
            "\t",
            "-c",
            sql,
        ]
    )
    rows: list[list[str]] = []
    for line in out.splitlines():
        parts = line.split("\t")
        rows.append(parts)
    return rows


def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]:
    postgres_pod = _kubectl_first_pod("postgres")
    out = _run(
@ -355,89 +256,6 @@ def _portal_delete_requests(portal_db_url: str, prefixes: list[str]) -> int:
    return int(match.group(1)) if match else 0


def _mailu_list_users(prefixes: list[str], domain: str, db_name: str, protected: set[str]) -> list[MailuUser]:
    if not prefixes or not domain:
        return []
    clauses = " OR ".join([f"localpart LIKE '{p}%'" for p in prefixes])
    sql = (
        'SELECT email, localpart, domain_name '
        'FROM "user" '
        f"WHERE domain_name = {_sql_quote(domain)} AND ({clauses}) "
        "ORDER BY email;"
    )
    rows = _psql_tsv(db_name, sql)
    users: list[MailuUser] = []
    for row in rows:
        if len(row) < 3:
            continue
        email = row[0].strip()
        if not email or email in protected:
            continue
        users.append(MailuUser(email=email, localpart=row[1].strip(), domain=row[2].strip()))
    return users


def _mailu_delete_users(db_name: str, emails: list[str]) -> int:
    if not emails:
        return 0
    email_list = ",".join(_sql_quote(e) for e in emails)
    sql = f'DELETE FROM "user" WHERE email IN ({email_list});'
    out = _psql_exec(db_name, sql)
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
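Note: psql reports row counts via its command tag (e.g. "DELETE 3"), which the raw-string pattern above captures; standalone:

import re

assert re.search(r"DELETE\s+(\d+)", "DELETE 3").group(1) == "3"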
|
||||
|
||||
|
||||
_NEXTCLOUD_ACCOUNT_RE = re.compile(r"^Account\\s+(\\d+):")
|
||||
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+")
|
||||
|
||||
|
||||
def _nextcloud_exec(cmd: list[str]) -> tuple[int, str, str]:
|
||||
namespace = os.getenv("NEXTCLOUD_NAMESPACE", "nextcloud").strip() or "nextcloud"
|
||||
target = os.getenv("NEXTCLOUD_EXEC_TARGET", "deploy/nextcloud").strip() or "deploy/nextcloud"
|
||||
return _kubectl_exec(namespace, target, cmd)
|
||||
|
||||
|
||||
def _parse_nextcloud_mail_accounts(export_output: str) -> list[NextcloudMailAccount]:
|
||||
accounts: list[NextcloudMailAccount] = []
|
||||
current_id = ""
|
||||
for line in export_output.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
match = _NEXTCLOUD_ACCOUNT_RE.match(line)
|
||||
if match:
|
||||
current_id = match.group(1)
|
||||
continue
|
||||
if not current_id or "@" not in line:
|
||||
continue
|
||||
email_match = _EMAIL_RE.search(line)
|
||||
if not email_match:
|
||||
continue
|
||||
accounts.append(NextcloudMailAccount(account_id=current_id, email=email_match.group(0)))
|
||||
current_id = ""
|
||||
return accounts
|

def _nextcloud_list_mail_accounts(username: str) -> list[NextcloudMailAccount]:
    occ_path = os.getenv("NEXTCLOUD_OCC_PATH", "/var/www/html/occ").strip() or "/var/www/html/occ"
    rc, out, err = _nextcloud_exec(["php", occ_path, "mail:account:export", username])
    if rc != 0:
        message = (err or out).strip()
        lowered = message.lower()
        if any(token in lowered for token in ("not found", "does not exist", "no such user", "unknown user")):
            return []
        raise RuntimeError(f"nextcloud mail export failed for {username}: {message}")
    return _parse_nextcloud_mail_accounts(out)


def _nextcloud_delete_mail_account(account_id: str) -> None:
    occ_path = os.getenv("NEXTCLOUD_OCC_PATH", "/var/www/html/occ").strip() or "/var/www/html/occ"
    rc, out, err = _nextcloud_exec(["php", occ_path, "mail:account:delete", "-q", account_id])
    if rc != 0:
        message = (err or out).strip()
        raise RuntimeError(f"nextcloud mail delete failed for account {account_id}: {message}")


def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str:
    data = urllib.parse.urlencode({"token": admin_token}).encode("utf-8")
    req = urllib.request.Request(f"{base_url}/admin", data=data, method="POST")
@ -538,8 +356,6 @@ def main() -> int:
        ),
    )
    parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.")
    parser.add_argument("--skip-mailu", action="store_true", help="Skip Mailu mailbox cleanup.")
    parser.add_argument("--skip-nextcloud-mail", action="store_true", help="Skip Nextcloud Mail account cleanup.")
    parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.")
    parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.")
    parser.add_argument(
@ -548,18 +364,6 @@ def main() -> int:
        default=[],
        help="Keycloak usernames that must never be deleted (repeatable).",
    )
    parser.add_argument(
        "--protect-mailu-email",
        action="append",
        default=[],
        help="Mailu emails that must never be deleted (repeatable).",
    )
    parser.add_argument(
        "--protect-nextcloud-username",
        action="append",
        default=[],
        help="Nextcloud usernames that must never be touched (repeatable).",
    )
    parser.add_argument(
        "--protect-vaultwarden-email",
        action="append",
@ -572,11 +376,7 @@ def main() -> int:
    apply = bool(args.apply)
    expected_confirm = ",".join(prefixes)
    protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]}
    protected_mailu = {e.strip() for e in args.protect_mailu_email if e.strip()}
    protected_nextcloud = {u.strip() for u in args.protect_nextcloud_username if u.strip()}
    protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()}
    mailu_domain = os.getenv("MAILU_DOMAIN", "bstein.dev").strip() or "bstein.dev"
    mailu_db_name = os.getenv("MAILU_DB_NAME", "mailu").strip() or "mailu"

    if apply and args.confirm != expected_confirm:
        raise SystemExit(
@ -588,29 +388,23 @@ def main() -> int:
    print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)")
    if protected_keycloak:
        print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak)))
    if protected_mailu:
        print("protected mailu emails:", ", ".join(sorted(protected_mailu)))
    if protected_nextcloud:
        print("protected nextcloud usernames:", ", ".join(sorted(protected_nextcloud)))
    if protected_vaultwarden:
        print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden)))
    print()
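The --confirm guard only matters with --apply: the value must equal the comma-joined prefix list exactly, or the script exits before touching anything. Illustrative invocations (the script filename and the flag that supplies prefixes are assumptions; --apply, --confirm, and the --skip-*/--protect-* flags are from the parser above):

    # Dry run: list matches across portal DB, Keycloak, Mailu, Nextcloud Mail, Vaultwarden.
    #   python cleanup_users.py --prefix zz-load --prefix zz-smoke
    # Destructive run: --confirm must be ",".join(prefixes), i.e. "zz-load,zz-smoke".
    #   python cleanup_users.py --prefix zz-load --prefix zz-smoke --apply \
    #       --confirm zz-load,zz-smoke --protect-mailu-email ops@bstein.dev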

    portal_requests: list[PortalRequestRow] = []
    if not args.skip_portal_db:
        portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL")
        portal_requests = _portal_list_requests(portal_db_url, prefixes)
        print(f"Portal DB: {len(portal_requests)} access_requests matched")
        for row in portal_requests[:50]:
        requests = _portal_list_requests(portal_db_url, prefixes)
        print(f"Portal DB: {len(requests)} access_requests matched")
        for row in requests[:50]:
            print(f" {row.request_code}\t{row.status}\t{row.username}")
        if len(portal_requests) > 50:
            print(f" ... and {len(portal_requests) - 50} more")
        if apply and portal_requests:
        if len(requests) > 50:
            print(f" ... and {len(requests) - 50} more")
        if apply and requests:
            deleted = _portal_delete_requests(portal_db_url, prefixes)
            print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).")
        print()

    keycloak_users: list[KeycloakUser] = []
    if not args.skip_keycloak:
        kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/")
        kc_realm = os.getenv("KEYCLOAK_REALM", "atlas")
@ -627,63 +421,18 @@ def main() -> int:
            if user.username in protected_keycloak:
                continue
            found[user.user_id] = user
        keycloak_users = list(found.values())
        keycloak_users.sort(key=lambda u: u.username)
        print(f"Keycloak: {len(keycloak_users)} users matched")
        for user in keycloak_users[:50]:
        users = list(found.values())
        users.sort(key=lambda u: u.username)
        print(f"Keycloak: {len(users)} users matched")
        for user in users[:50]:
            email = user.email or "-"
            print(f" {user.username}\t{email}\t{user.user_id}")
        if len(keycloak_users) > 50:
            print(f" ... and {len(keycloak_users) - 50} more")
        if apply and keycloak_users:
            for user in keycloak_users:
        if len(users) > 50:
            print(f" ... and {len(users) - 50} more")
        if apply and users:
            for user in users:
                _keycloak_delete_user(kc_server, kc_realm, token, user.user_id)
            print(f"Keycloak: deleted {len(keycloak_users)} users.")
        print()

    if not args.skip_mailu:
        mailu_users = _mailu_list_users(prefixes, mailu_domain, mailu_db_name, protected_mailu)
        print(f"Mailu: {len(mailu_users)} mailboxes matched (domain={mailu_domain})")
        for user in mailu_users[:50]:
            print(f" {user.email}\t{user.localpart}\t{user.domain}")
        if len(mailu_users) > 50:
            print(f" ... and {len(mailu_users) - 50} more")
        if apply and mailu_users:
            deleted = _mailu_delete_users(mailu_db_name, [u.email for u in mailu_users])
            print(f"Mailu: deleted {deleted} mailboxes.")
        print()

    if not args.skip_nextcloud_mail:
        nextcloud_usernames = {row.username for row in portal_requests if row.username}
        nextcloud_usernames.update({u.username for u in keycloak_users if u.username})
        nextcloud_usernames = {u for u in nextcloud_usernames if _starts_with_any(u, prefixes)}
        nextcloud_usernames = {u for u in nextcloud_usernames if u not in protected_nextcloud}

        matches: list[tuple[str, NextcloudMailAccount]] = []
        for username in sorted(nextcloud_usernames):
            accounts = _nextcloud_list_mail_accounts(username)
            for account in accounts:
                email = account.email.strip()
                if not email:
                    continue
                if not email.lower().endswith(f"@{mailu_domain.lower()}"):
                    continue
                localpart = email.split("@", 1)[0]
                if not _starts_with_any(localpart, prefixes):
                    continue
                if email in protected_mailu:
                    continue
                matches.append((username, account))

        print(f"Nextcloud Mail: {len(matches)} accounts matched")
        for username, account in matches[:50]:
            print(f" {username}\t{account.account_id}\t{account.email}")
        if len(matches) > 50:
            print(f" ... and {len(matches) - 50} more")
        if apply and matches:
            for _, account in matches:
                _nextcloud_delete_mail_account(account.account_id)
            print(f"Nextcloud Mail: deleted {len(matches)} accounts.")
            print(f"Keycloak: deleted {len(users)} users.")
        print()

    if not args.skip_vaultwarden:

@ -55,11 +55,11 @@ class _FakeResponse:


class _FakeSession:
    def __init__(self, put_resp, get_resps):
    def __init__(self, put_resp, get_resp):
        self.put_resp = put_resp
        self.get_resps = list(get_resps)
        self.get_resp = get_resp
        self.put_called = False
        self.get_calls = 0
        self.get_called = False

    def post(self, *args, **kwargs):
        return _FakeResponse({"access_token": "dummy"})
@ -69,26 +69,22 @@ class _FakeSession:
        return self.put_resp

    def get(self, *args, **kwargs):
        self.get_calls += 1
        if self.get_resps:
            return self.get_resps.pop(0)
        return _FakeResponse({})
        self.get_called = True
        return self.get_resp
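The list-of-responses _FakeSession variant mirrors a read-merge-write-verify flow: one GET for the current attributes, a PUT with the merge, then a second GET to confirm the attribute persisted (hence get_calls == 2). The real kc_update_attributes is outside this diff; a plausible shape, with the URL and merge details as assumptions:

    def kc_update_attributes_sketch(session, token, user, attrs):
        # Hypothetical reconstruction for illustration; not the module's actual code.
        url = f"/admin/realms/atlas/users/{user['id']}"
        headers = {"Authorization": f"Bearer {token}"}
        current = session.get(url, headers=headers).json().get("attributes", {})  # first GET
        current.update({k: [v] for k, v in attrs.items()})
        session.put(url, headers=headers, json={"attributes": current})
        verify = session.get(url, headers=headers).json().get("attributes", {})  # second GET
        missing = [k for k in attrs if k not in verify]
        if missing:
            raise RuntimeError(f"attributes not persisted: {missing}")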

def test_kc_update_attributes_succeeds(monkeypatch):
    sync = load_sync_module(monkeypatch)
    current_resp = _FakeResponse({"attributes": {}})
    ok_resp = _FakeResponse({"attributes": {"mailu_app_password": ["abc"]}})
    sync.SESSION = _FakeSession(_FakeResponse({}), [current_resp, ok_resp])
    sync.SESSION = _FakeSession(_FakeResponse({}), ok_resp)
    sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
    assert sync.SESSION.put_called and sync.SESSION.get_calls == 2
    assert sync.SESSION.put_called and sync.SESSION.get_called


def test_kc_update_attributes_raises_without_attribute(monkeypatch):
    sync = load_sync_module(monkeypatch)
    current_resp = _FakeResponse({"attributes": {}})
    missing_attr_resp = _FakeResponse({"attributes": {}}, status=200)
    sync.SESSION = _FakeSession(_FakeResponse({}), [current_resp, missing_attr_resp])
    sync.SESSION = _FakeSession(_FakeResponse({}), missing_attr_resp)
    with pytest.raises(Exception):
        sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})

@ -148,25 +144,9 @@ def test_main_generates_password_and_upserts(monkeypatch):
    sync = load_sync_module(monkeypatch)
    monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
    users = [
        {
            "id": "u1",
            "username": "user1",
            "email": "user1@example.com",
            "attributes": {"mailu_enabled": ["true"]},
        },
        {
            "id": "u2",
            "username": "user2",
            "email": "user2@example.com",
            "attributes": {"mailu_app_password": ["keepme"], "mailu_enabled": ["true"]},
        },
        {
            "id": "u3",
            "username": "user3",
            "email": "user3@example.com",
            "attributes": {"mailu_email": ["user3@example.com"]},
        },
        {"id": "u4", "username": "user4", "email": "user4@other.com", "attributes": {}},
        {"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
        {"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
        {"id": "u3", "username": "user3", "email": "user3@other.com", "attributes": {}},
    ]
    updated = []

@ -205,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch):

    sync.main()

    # Only mail-enabled users (or legacy users with a mailbox) are synced and backfilled.
    # Always backfill mailu_email, even if Keycloak recovery email is external.
    assert len(updated) == 3
    assert conns and len(conns[0]._cursor.executions) == 3

@ -20,9 +20,8 @@ spec:
      labels:
        app: ollama
      annotations:
        ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
        ai.bstein.dev/gpu: GPU pool (titan-22/24)
        ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
        ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
        ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
    spec:
      affinity:
        nodeAffinity:
@ -32,6 +31,8 @@ spec:
            - key: kubernetes.io/hostname
              operator: In
              values:
              - titan-20
              - titan-21
              - titan-22
              - titan-24
      runtimeClassName: nvidia
@ -41,7 +42,7 @@ spec:
          claimName: ollama-models
      initContainers:
      - name: warm-model
        image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
        image: ollama/ollama:latest
        env:
        - name: OLLAMA_HOST
          value: 0.0.0.0
@ -52,7 +53,7 @@ spec:
        - name: OLLAMA_MODELS
          value: /root/.ollama
        - name: OLLAMA_MODEL
          value: qwen2.5:14b-instruct-q4_0
          value: qwen2.5-coder:7b-instruct-q4_0
        command:
        - /bin/sh
        - -c
@ -67,14 +68,14 @@ spec:
          mountPath: /root/.ollama
        resources:
          requests:
            cpu: 500m
            memory: 2Gi
            cpu: 250m
            memory: 1Gi
            nvidia.com/gpu.shared: 1
          limits:
            nvidia.com/gpu.shared: 1
      containers:
      - name: ollama
        image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
        image: ollama/ollama:latest
        imagePullPolicy: IfNotPresent
        ports:
        - name: http
@ -95,10 +96,10 @@ spec:
          mountPath: /root/.ollama
        resources:
          requests:
            cpu: "4"
            memory: 16Gi
            cpu: "2"
            memory: 8Gi
            nvidia.com/gpu.shared: 1
          limits:
            cpu: "8"
            memory: 24Gi
            cpu: "4"
            memory: 12Gi
            nvidia.com/gpu.shared: 1
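The warm-model init container pulls the model and primes it so the main container can answer immediately. A rough Python equivalent of that warm-up against Ollama's HTTP API (/api/pull and /api/generate are Ollama's documented endpoints; the address and model value are simply taken from the manifest):

    import json
    import urllib.request

    OLLAMA = "http://127.0.0.1:11434"  # assumed in-pod address; OLLAMA_HOST above binds 0.0.0.0
    MODEL = "qwen2.5-coder:7b-instruct-q4_0"

    def _post(path: str, payload: dict) -> None:
        req = urllib.request.Request(
            f"{OLLAMA}{path}",
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req) as resp:
            resp.read()  # drain the response body

    _post("/api/pull", {"name": MODEL, "stream": False})  # fetch weights into /root/.ollama
    _post("/api/generate", {"model": MODEL, "prompt": "ping", "stream": False})  # load onto the GPU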
Some files were not shown because too many files have changed in this diff.