Compare commits

No commits in common. "main" and "feature/postgres-migration" have entirely different histories.

main ... feature/postgres-migration
.gitignore (vendored)
@@ -6,5 +6,3 @@ __pycache__/
 *.py[cod]
 .pytest_cache
 .venv
-.venv-ci
-tmp/
Jenkinsfile (vendored, 77 lines deleted)
@@ -1,77 +0,0 @@
-// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery.
-pipeline {
-  agent {
-    kubernetes {
-      defaultContainer 'python'
-      yaml """
-apiVersion: v1
-kind: Pod
-spec:
-  nodeSelector:
-    hardware: rpi5
-    kubernetes.io/arch: arm64
-    node-role.kubernetes.io/worker: "true"
-  containers:
-    - name: python
-      image: python:3.12-slim
-      command:
-        - cat
-      tty: true
-"""
-    }
-  }
-  environment {
-    PIP_DISABLE_PIP_VERSION_CHECK = '1'
-    PYTHONUNBUFFERED = '1'
-  }
-  stages {
-    stage('Checkout') {
-      steps {
-        checkout scm
-      }
-    }
-    stage('Install deps') {
-      steps {
-        sh 'pip install --no-cache-dir -r ci/requirements.txt'
-      }
-    }
-    stage('Glue tests') {
-      steps {
-        sh 'pytest -q ci/tests/glue'
-      }
-    }
-    stage('Resolve Flux branch') {
-      steps {
-        script {
-          env.FLUX_BRANCH = sh(
-            returnStdout: true,
-            script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
-          ).trim()
-          if (!env.FLUX_BRANCH) {
-            error('Flux branch not found in gotk-sync.yaml')
-          }
-          echo "Flux branch: ${env.FLUX_BRANCH}"
-        }
-      }
-    }
-    stage('Promote') {
-      when {
-        expression {
-          def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
-          return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
-        }
-      }
-      steps {
-        withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
-          sh '''
-            set +x
-            git config user.email "jenkins@bstein.dev"
-            git config user.name "jenkins"
-            git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
-            git push origin HEAD:${FLUX_BRANCH}
-          '''
-        }
-      }
-    }
-  }
-}
ci/Jenkinsfile.titan-iac (file name not captured; inferred from the mirror comment above)
@@ -1,76 +0,0 @@
-pipeline {
-  agent {
-    kubernetes {
-      defaultContainer 'python'
-      yaml """
-apiVersion: v1
-kind: Pod
-spec:
-  nodeSelector:
-    hardware: rpi5
-    kubernetes.io/arch: arm64
-    node-role.kubernetes.io/worker: "true"
-  containers:
-    - name: python
-      image: python:3.12-slim
-      command:
-        - cat
-      tty: true
-"""
-    }
-  }
-  environment {
-    PIP_DISABLE_PIP_VERSION_CHECK = '1'
-    PYTHONUNBUFFERED = '1'
-  }
-  stages {
-    stage('Checkout') {
-      steps {
-        checkout scm
-      }
-    }
-    stage('Install deps') {
-      steps {
-        sh 'pip install --no-cache-dir -r ci/requirements.txt'
-      }
-    }
-    stage('Glue tests') {
-      steps {
-        sh 'pytest -q ci/tests/glue'
-      }
-    }
-    stage('Resolve Flux branch') {
-      steps {
-        script {
-          env.FLUX_BRANCH = sh(
-            returnStdout: true,
-            script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
-          ).trim()
-          if (!env.FLUX_BRANCH) {
-            error('Flux branch not found in gotk-sync.yaml')
-          }
-          echo "Flux branch: ${env.FLUX_BRANCH}"
-        }
-      }
-    }
-    stage('Promote') {
-      when {
-        expression {
-          def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
-          return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
-        }
-      }
-      steps {
-        withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
-          sh '''
-            set +x
-            git config user.email "jenkins@bstein.dev"
-            git config user.name "jenkins"
-            git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
-            git push origin HEAD:${FLUX_BRANCH}
-          '''
-        }
-      }
-    }
-  }
-}
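Aside: the "Resolve Flux branch" stage shells out to awk to grab the first `branch:` value in gotk-sync.yaml (and note that inside a double-quoted Groovy string, `$2` normally has to be escaped as `\$2` to reach awk intact). Below is a minimal Python sketch of the same lookup using the PyYAML pin from the requirements that follow; the assumption that the branch lives at spec.ref.branch of a GitRepository document matches the gotk-sync.yaml hunk later in this diff.

import yaml

def flux_branch(path: str = "clusters/atlas/flux-system/gotk-sync.yaml") -> str:
    # Walk the multi-document manifest and return the GitRepository's tracked branch.
    with open(path, encoding="utf-8") as handle:
        for doc in yaml.safe_load_all(handle):
            if doc and doc.get("kind") == "GitRepository":
                return doc["spec"]["ref"]["branch"]
    raise RuntimeError("Flux branch not found in gotk-sync.yaml")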
ci/requirements.txt (file name not captured; inferred from the pipeline's install step)
@@ -1,4 +0,0 @@
-pytest==8.3.4
-kubernetes==30.1.0
-PyYAML==6.0.2
-requests==2.32.3
ci/tests/glue/config.yaml (file name not captured; inferred from CONFIG_PATH in the tests below)
@@ -1,16 +0,0 @@
-max_success_age_hours: 48
-allow_suspended:
-  - bstein-dev-home/vaultwarden-cred-sync
-  - comms/othrys-room-reset
-  - comms/pin-othrys-invite
-  - comms/seed-othrys-room
-  - finance/firefly-user-sync
-  - health/wger-admin-ensure
-  - health/wger-user-sync
-  - mailu-mailserver/mailu-sync-nightly
-  - nextcloud/nextcloud-mail-sync
-ariadne_schedule_tasks:
-  - schedule.mailu_sync
-  - schedule.nextcloud_sync
-  - schedule.vaultwarden_sync
-  - schedule.wger_admin
ci/tests/glue cronjob freshness test (file name not captured)
@@ -1,46 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime, timezone
-from pathlib import Path
-
-import yaml
-from kubernetes import client, config
-
-
-CONFIG_PATH = Path(__file__).with_name("config.yaml")
-
-
-def _load_config() -> dict:
-    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
-        return yaml.safe_load(handle) or {}
-
-
-def _load_kube():
-    try:
-        config.load_incluster_config()
-    except config.ConfigException:
-        config.load_kube_config()
-
-
-def test_glue_cronjobs_recent_success():
-    cfg = _load_config()
-    max_age_hours = int(cfg.get("max_success_age_hours", 48))
-    allow_suspended = set(cfg.get("allow_suspended", []))
-
-    _load_kube()
-    batch = client.BatchV1Api()
-    cronjobs = batch.list_cron_job_for_all_namespaces(label_selector="atlas.bstein.dev/glue=true").items
-
-    assert cronjobs, "No glue cronjobs found with atlas.bstein.dev/glue=true"
-
-    now = datetime.now(timezone.utc)
-    for cronjob in cronjobs:
-        name = f"{cronjob.metadata.namespace}/{cronjob.metadata.name}"
-        if cronjob.spec.suspend:
-            assert name in allow_suspended, f"{name} is suspended but not in allow_suspended"
-            continue
-
-        last_success = cronjob.status.last_successful_time
-        assert last_success is not None, f"{name} has no lastSuccessfulTime"
-        age_hours = (now - last_success).total_seconds() / 3600
-        assert age_hours <= max_age_hours, f"{name} last success {age_hours:.1f}h ago"
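Aside: the freshness rule above is plain timestamp arithmetic against the 48h default in config.yaml. A self-contained sketch with a hypothetical stale job (the name and timestamp are made up, not from the cluster):

from datetime import datetime, timedelta, timezone

max_age_hours = 48  # default from config.yaml
now = datetime.now(timezone.utc)
last_success = now - timedelta(hours=53.5)  # hypothetical lastSuccessfulTime

age_hours = (now - last_success).total_seconds() / 3600
if age_hours > max_age_hours:
    # this is the condition the test's assert turns into a failure
    print(f"comms/example-job last success {age_hours:.1f}h ago")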
ci/tests/glue metrics test (file name not captured)
@@ -1,48 +0,0 @@
-from __future__ import annotations
-
-import os
-from pathlib import Path
-
-import requests
-import yaml
-
-
-VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
-CONFIG_PATH = Path(__file__).with_name("config.yaml")
-
-
-def _load_config() -> dict:
-    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
-        return yaml.safe_load(handle) or {}
-
-
-def _query(promql: str) -> list[dict]:
-    response = requests.get(f"{VM_URL}/api/v1/query", params={"query": promql}, timeout=10)
-    response.raise_for_status()
-    payload = response.json()
-    return payload.get("data", {}).get("result", [])
-
-
-def test_glue_metrics_present():
-    series = _query('kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}')
-    assert series, "No glue cronjob label series found"
-
-
-def test_glue_metrics_success_join():
-    query = (
-        "kube_cronjob_status_last_successful_time "
-        'and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
-    )
-    series = _query(query)
-    assert series, "No glue cronjob last success series found"
-
-
-def test_ariadne_schedule_metrics_present():
-    cfg = _load_config()
-    expected = cfg.get("ariadne_schedule_tasks", [])
-    if not expected:
-        return
-    series = _query("ariadne_schedule_next_run_timestamp_seconds")
-    tasks = {item.get("metric", {}).get("task") for item in series}
-    missing = [task for task in expected if task not in tasks]
-    assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"
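Aside: the same PromQL can be run by hand against the tests' VictoriaMetrics endpoint. A sketch assuming local access via a hypothetical port-forward (kubectl port-forward svc/victoria-metrics-single-server 8428:8428):

import requests

resp = requests.get(
    "http://localhost:8428/api/v1/query",
    params={"query": 'kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'},
    timeout=10,
)
resp.raise_for_status()
for item in resp.json().get("data", {}).get("result", []):
    metric = item.get("metric", {})
    # one line per glue-labelled CronJob series
    print(metric.get("namespace"), metric.get("cronjob"))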
clusters/atlas/applications/kustomization.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
+# clusters/atlas/applications/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ../../services/crypto
+  - ../../services/gitea
+  - ../../services/jellyfin
+  - ../../services/comms
+  - ../../services/monitoring
+  - ../../services/logging
+  - ../../services/pegasus
+  - ../../services/vault
+  - ../../services/bstein-dev-home
clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml (deleted)
@@ -1,17 +0,0 @@
-# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: bstein-dev-home-migrations
-  namespace: flux-system
-spec:
-  interval: 10m
-  path: ./services/bstein-dev-home/oneoffs/migrations
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-  targetNamespace: bstein-dev-home
-  wait: false
-  suspend: true
bstein-dev-home image automation (file name not captured)
@@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: bstein-dev-home
-  namespace: bstein-dev-home
+  namespace: flux-system
 spec:
   interval: 1m0s
   sourceRef:
@@ -13,14 +13,14 @@ spec:
   git:
     checkout:
       ref:
-        branch: feature/ariadne
+        branch: main
     commit:
       author:
         email: ops@bstein.dev
         name: flux-bot
-      messageTemplate: "chore(bstein-dev-home): automated image update"
+      messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
     push:
-      branch: feature/ariadne
+      branch: main
   update:
     strategy: Setters
     path: services/bstein-dev-home
comms kustomization (file name not captured)
@@ -1,4 +1,4 @@
-# clusters/atlas/flux-system/applications/comms/kustomization.yaml
+# clusters/atlas/flux-system/applications/communication/kustomization.yaml
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
clusters/atlas/flux-system/applications/finance/kustomization.yaml (deleted)
@@ -1,24 +0,0 @@
-# clusters/atlas/flux-system/applications/finance/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: finance
-  namespace: flux-system
-spec:
-  interval: 10m
-  path: ./services/finance
-  prune: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-  targetNamespace: finance
-  healthChecks:
-    - apiVersion: apps/v1
-      kind: Deployment
-      name: actual-budget
-      namespace: finance
-    - apiVersion: apps/v1
-      kind: Deployment
-      name: firefly
-      namespace: finance
-  wait: false
harbor Flux Kustomization (file name not captured)
@@ -13,6 +13,11 @@
     kind: GitRepository
     name: flux-system
     namespace: flux-system
+  healthChecks:
+    - apiVersion: helm.toolkit.fluxcd.io/v2
+      kind: HelmRelease
+      name: harbor
+      namespace: harbor
   wait: false
   dependsOn:
     - name: core
clusters/atlas/flux-system/applications/health/kustomization.yaml (deleted)
@@ -1,25 +0,0 @@
-# clusters/atlas/flux-system/applications/health/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: health
-  namespace: flux-system
-spec:
-  interval: 10m
-  path: ./services/health
-  prune: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-  targetNamespace: health
-  dependsOn:
-    - name: keycloak
-    - name: postgres
-    - name: traefik
-    - name: vault
-  healthChecks:
-    - apiVersion: apps/v1
-      kind: Deployment
-      name: wger
-      namespace: health
-  wait: false
applications kustomization (file name not captured)
@@ -12,12 +12,10 @@ resources:
 - pegasus/image-automation.yaml
 - bstein-dev-home/kustomization.yaml
 - bstein-dev-home/image-automation.yaml
-- bstein-dev-home-migrations/kustomization.yaml
 - harbor/kustomization.yaml
 - harbor/image-automation.yaml
 - jellyfin/kustomization.yaml
 - xmr-miner/kustomization.yaml
-- wallet-monero-temp/kustomization.yaml
 - sui-metrics/kustomization.yaml
 - openldap/kustomization.yaml
 - keycloak/kustomization.yaml
@@ -29,5 +27,3 @@ resources:
 - nextcloud-mail-sync/kustomization.yaml
 - outline/kustomization.yaml
 - planka/kustomization.yaml
-- finance/kustomization.yaml
-- health/kustomization.yaml
pegasus image automation (file name not captured)
@@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: pegasus
-  namespace: jellyfin
+  namespace: flux-system
 spec:
   interval: 1m0s
   sourceRef:
clusters/atlas/flux-system/applications/wallet-monero-temp/kustomization.yaml (deleted)
@@ -1,19 +0,0 @@
-# clusters/atlas/flux-system/applications/wallet-monero-temp/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: wallet-monero-temp
-  namespace: flux-system
-spec:
-  interval: 10m
-  path: ./services/crypto/wallet-monero-temp
-  targetNamespace: crypto
-  prune: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  dependsOn:
-    - name: crypto
-    - name: xmr-miner
-  wait: true
clusters/atlas/flux-system/gotk-components.yaml
@@ -1,4 +1,3 @@
-# clusters/atlas/flux-system/gotk-components.yaml
 ---
 # This manifest was generated by flux. DO NOT EDIT.
 # Flux Version: v2.7.5
clusters/atlas/flux-system/gotk-sync.yaml
@@ -1,4 +1,3 @@
-# clusters/atlas/flux-system/gotk-sync.yaml
 # This manifest was generated by flux. DO NOT EDIT.
 ---
 apiVersion: source.toolkit.fluxcd.io/v1
@@ -9,7 +8,7 @@ metadata:
 spec:
   interval: 1m0s
   ref:
-    branch: feature/ariadne
+    branch: feature/sso-hardening
   secretRef:
     name: flux-system-gitea
   url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
clusters/atlas/flux-system/platform/cert-manager-cleanup/kustomization.yaml (deleted)
@@ -1,17 +0,0 @@
-# clusters/atlas/flux-system/platform/cert-manager-cleanup/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: cert-manager-cleanup
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/cert-manager/cleanup
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  targetNamespace: cert-manager
-  wait: true

clusters/atlas/flux-system/platform/cert-manager/kustomization.yaml (deleted)
@@ -1,19 +0,0 @@
-# clusters/atlas/flux-system/platform/cert-manager/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: cert-manager
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/cert-manager
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  targetNamespace: cert-manager
-  dependsOn:
-    - name: helm
-  wait: true
platform kustomization (file name not captured)
@@ -4,17 +4,12 @@ kind: Kustomization
 resources:
 - core/kustomization.yaml
 - helm/kustomization.yaml
-- cert-manager/kustomization.yaml
 - metallb/kustomization.yaml
 - traefik/kustomization.yaml
 - gitops-ui/kustomization.yaml
 - monitoring/kustomization.yaml
 - logging/kustomization.yaml
 - maintenance/kustomization.yaml
-- maintenance/image-automation.yaml
-- longhorn-adopt/kustomization.yaml
-- longhorn/kustomization.yaml
 - longhorn-ui/kustomization.yaml
 - postgres/kustomization.yaml
 - ../platform/vault-csi/kustomization.yaml
-- ../platform/vault-injector/kustomization.yaml
clusters/atlas/flux-system/platform/longhorn-adopt/kustomization.yaml (deleted)
@@ -1,17 +0,0 @@
-# clusters/atlas/flux-system/platform/longhorn-adopt/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: longhorn-adopt
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/longhorn/adopt
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  targetNamespace: longhorn-system
-  wait: true
longhorn-ui Flux Kustomization (file name not captured)
@@ -15,5 +15,4 @@ spec:
     namespace: flux-system
   dependsOn:
     - name: core
-    - name: longhorn
   wait: true
clusters/atlas/flux-system/platform/longhorn/kustomization.yaml (deleted)
@@ -1,20 +0,0 @@
-# clusters/atlas/flux-system/platform/longhorn/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: longhorn
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/longhorn/core
-  prune: true
-  force: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  targetNamespace: longhorn-system
-  dependsOn:
-    - name: helm
-    - name: longhorn-adopt
-  wait: false
clusters/atlas/flux-system/platform/maintenance/image-automation.yaml (deleted)
@@ -1,26 +0,0 @@
-# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
-apiVersion: image.toolkit.fluxcd.io/v1
-kind: ImageUpdateAutomation
-metadata:
-  name: maintenance
-  namespace: maintenance
-spec:
-  interval: 1m0s
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  git:
-    checkout:
-      ref:
-        branch: feature/ariadne
-    commit:
-      author:
-        email: ops@bstein.dev
-        name: flux-bot
-      messageTemplate: "chore(maintenance): automated image update"
-    push:
-      branch: feature/ariadne
-  update:
-    strategy: Setters
-    path: services/maintenance
maintenance Flux Kustomization (file name not captured)
@@ -8,7 +8,6 @@ spec:
   interval: 10m
   path: ./services/maintenance
   prune: true
-  force: true
   sourceRef:
     kind: GitRepository
     name: flux-system
clusters/atlas/flux-system/platform/vault-injector/kustomization.yaml (deleted)
@@ -1,16 +0,0 @@
-# clusters/atlas/flux-system/platform/vault-injector/kustomization.yaml
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
-  name: vault-injector
-  namespace: flux-system
-spec:
-  interval: 30m
-  path: ./infrastructure/vault-injector
-  targetNamespace: vault
-  prune: true
-  sourceRef:
-    kind: GitRepository
-    name: flux-system
-    namespace: flux-system
-  wait: true
clusters/atlas/platform/kustomization.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
+# clusters/atlas/platform/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ../../../infrastructure/modules/base
+  - ../../../infrastructure/modules/profiles/atlas-ha
+  - ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
+  - ../../../infrastructure/metallb
deleted Dockerfile (file name not captured)
@@ -1,5 +0,0 @@
-FROM python:3.11-slim
-
-ENV PIP_DISABLE_PIP_VERSION_CHECK=1
-
-RUN pip install --no-cache-dir requests psycopg2-binary
deleted Dockerfiles wrapping upstream images with the Vault entrypoint (file names not captured)
@@ -1,9 +0,0 @@
-FROM registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
-
-USER root
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-USER harbor
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/harbor/entrypoint.sh"]
@@ -1,9 +0,0 @@
-FROM registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
-
-USER root
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-USER harbor
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/harbor/entrypoint.sh"]
@@ -1,9 +0,0 @@
-FROM registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64
-
-USER root
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-USER harbor
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/home/harbor/entrypoint.sh"]
@@ -1,9 +0,0 @@
-FROM registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
-
-USER root
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-USER harbor
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/home/harbor/start.sh"]
@@ -1,10 +0,0 @@
-FROM ghcr.io/element-hq/lk-jwt-service:0.3.0 AS base
-
-FROM alpine:3.20
-RUN apk add --no-cache ca-certificates
-COPY --from=base /lk-jwt-service /lk-jwt-service
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/lk-jwt-service"]
@@ -1,10 +0,0 @@
-FROM quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 AS base
-
-FROM alpine:3.20
-RUN apk add --no-cache ca-certificates
-COPY --from=base /bin/oauth2-proxy /bin/oauth2-proxy
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/bin/oauth2-proxy"]
@@ -1,10 +0,0 @@
-FROM registry.bstein.dev/streaming/pegasus:1.2.32 AS base
-
-FROM alpine:3.20
-RUN apk add --no-cache ca-certificates
-COPY --from=base /pegasus /pegasus
-COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
-RUN chmod 0755 /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
-CMD ["/pegasus"]
dockerfiles/vault-entrypoint.sh (deleted; path from the COPY lines above)
@@ -1,34 +0,0 @@
-#!/bin/sh
-set -eu
-
-if [ -n "${VAULT_ENV_FILE:-}" ]; then
-  if [ -f "${VAULT_ENV_FILE}" ]; then
-    # shellcheck disable=SC1090
-    . "${VAULT_ENV_FILE}"
-  else
-    echo "Vault env file not found: ${VAULT_ENV_FILE}" >&2
-    exit 1
-  fi
-fi
-
-if [ -n "${VAULT_COPY_FILES:-}" ]; then
-  old_ifs="$IFS"
-  IFS=','
-  for pair in ${VAULT_COPY_FILES}; do
-    src="${pair%%:*}"
-    dest="${pair#*:}"
-    if [ -z "${src}" ] || [ -z "${dest}" ]; then
-      echo "Vault copy entry malformed: ${pair}" >&2
-      exit 1
-    fi
-    if [ ! -f "${src}" ]; then
-      echo "Vault file not found: ${src}" >&2
-      exit 1
-    fi
-    mkdir -p "$(dirname "${dest}")"
-    cp "${src}" "${dest}"
-  done
-  IFS="$old_ifs"
-fi
-
-exec "$@"
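Aside: the VAULT_COPY_FILES contract the entrypoint enforces is a comma-separated list of src:dest pairs. A sketch of the same parsing in Python; the example value is hypothetical:

# hypothetical value, same shape the entrypoint expects
value = "/vault/secrets/tls.crt:/etc/app/tls.crt,/vault/secrets/tls.key:/etc/app/tls.key"

for pair in value.split(","):
    src, sep, dest = pair.partition(":")
    if not sep or not src or not dest:
        raise SystemExit(f"Vault copy entry malformed: {pair}")
    print(f"copy {src} -> {dest}")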
infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml (deleted)
@@ -1,40 +0,0 @@
-# infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: cert-manager-cleanup-2
-  namespace: cert-manager
-spec:
-  backoffLimit: 1
-  template:
-    spec:
-      serviceAccountName: cert-manager-cleanup
-      restartPolicy: Never
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node-role.kubernetes.io/worker
-                    operator: Exists
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 100
-              preference:
-                matchExpressions:
-                  - key: kubernetes.io/arch
-                    operator: In
-                    values: ["arm64"]
-      containers:
-        - name: cleanup
-          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
-          command: ["/usr/bin/env", "bash"]
-          args: ["/scripts/cert_manager_cleanup.sh"]
-          volumeMounts:
-            - name: script
-              mountPath: /scripts
-              readOnly: true
-      volumes:
-        - name: script
-          configMap:
-            name: cert-manager-cleanup-script
-            defaultMode: 0555
infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml (deleted)
@@ -1,58 +0,0 @@
-# infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: cert-manager-cleanup
-  namespace: cert-manager
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: cert-manager-cleanup
-rules:
-  - apiGroups: [""]
-    resources:
-      - pods
-      - services
-      - endpoints
-      - configmaps
-      - secrets
-      - serviceaccounts
-    verbs: ["get", "list", "watch", "delete"]
-  - apiGroups: ["apps"]
-    resources:
-      - deployments
-      - daemonsets
-      - statefulsets
-      - replicasets
-    verbs: ["get", "list", "watch", "delete"]
-  - apiGroups: ["batch"]
-    resources:
-      - jobs
-      - cronjobs
-    verbs: ["get", "list", "watch", "delete"]
-  - apiGroups: ["rbac.authorization.k8s.io"]
-    resources:
-      - roles
-      - rolebindings
-      - clusterroles
-      - clusterrolebindings
-    verbs: ["get", "list", "watch", "delete"]
-  - apiGroups: ["admissionregistration.k8s.io"]
-    resources:
-      - validatingwebhookconfigurations
-      - mutatingwebhookconfigurations
-    verbs: ["get", "list", "watch", "delete"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: cert-manager-cleanup
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: cert-manager-cleanup
-subjects:
-  - kind: ServiceAccount
-    name: cert-manager-cleanup
-    namespace: cert-manager
infrastructure/cert-manager/cleanup/kustomization.yaml (deleted)
@@ -1,15 +0,0 @@
-# infrastructure/cert-manager/cleanup/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - cert-manager-cleanup-rbac.yaml
-  - cert-manager-cleanup-job.yaml
-
-configMapGenerator:
-  - name: cert-manager-cleanup-script
-    namespace: cert-manager
-    files:
-      - cert_manager_cleanup.sh=scripts/cert_manager_cleanup.sh
-    options:
-      disableNameSuffixHash: true

infrastructure/cert-manager/cleanup/namespace.yaml (deleted)
@@ -1,5 +0,0 @@
-# infrastructure/cert-manager/cleanup/namespace.yaml
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: cert-manager
infrastructure/cert-manager/cleanup/scripts/cert_manager_cleanup.sh (deleted; path from the configMapGenerator above)
@@ -1,37 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-namespace="cert-manager"
-selectors=(
-  "app.kubernetes.io/name=cert-manager"
-  "app.kubernetes.io/instance=cert-manager"
-  "app.kubernetes.io/instance=certmanager-prod"
-)
-
-delete_namespaced() {
-  local selector="$1"
-  kubectl -n "${namespace}" delete deployment,daemonset,statefulset,replicaset \
-    --selector "${selector}" --ignore-not-found --wait=false
-  kubectl -n "${namespace}" delete pod,service,endpoints,serviceaccount,configmap,secret \
-    --selector "${selector}" --ignore-not-found --wait=false
-  kubectl -n "${namespace}" delete role,rolebinding \
-    --selector "${selector}" --ignore-not-found --wait=false
-  kubectl -n "${namespace}" delete job,cronjob \
-    --selector "${selector}" --ignore-not-found --wait=false
-}
-
-delete_cluster_scoped() {
-  local selector="$1"
-  kubectl delete clusterrole,clusterrolebinding \
-    --selector "${selector}" --ignore-not-found --wait=false
-  kubectl delete mutatingwebhookconfiguration,validatingwebhookconfiguration \
-    --selector "${selector}" --ignore-not-found --wait=false
-}
-
-for selector in "${selectors[@]}"; do
-  delete_namespaced "${selector}"
-  delete_cluster_scoped "${selector}"
-done
-
-kubectl delete mutatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false
-kubectl delete validatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false
infrastructure/cert-manager/helmrelease.yaml (deleted)
@@ -1,67 +0,0 @@
-# infrastructure/cert-manager/helmrelease.yaml
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: cert-manager
-  namespace: cert-manager
-spec:
-  interval: 30m
-  chart:
-    spec:
-      chart: cert-manager
-      version: v1.17.0
-      sourceRef:
-        kind: HelmRepository
-        name: jetstack
-        namespace: flux-system
-  install:
-    crds: CreateReplace
-    remediation: { retries: 3 }
-    timeout: 10m
-  upgrade:
-    crds: CreateReplace
-    remediation:
-      retries: 3
-      remediateLastFailure: true
-    cleanupOnFail: true
-    timeout: 10m
-  values:
-    installCRDs: true
-    nodeSelector:
-      node-role.kubernetes.io/worker: "true"
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-            - matchExpressions:
-                - key: hardware
-                  operator: In
-                  values:
-                    - rpi5
-                    - rpi4
-    webhook:
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
-    cainjector:
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
infrastructure/cert-manager/kustomization.yaml (deleted)
@@ -1,6 +0,0 @@
-# infrastructure/cert-manager/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - helmrelease.yaml

infrastructure/cert-manager/namespace.yaml (deleted)
@@ -1,5 +0,0 @@
-# infrastructure/cert-manager/namespace.yaml
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: cert-manager
infrastructure/core/coredns-custom.yaml (deleted)
@@ -1,47 +0,0 @@
-# infrastructure/core/coredns-custom.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: coredns-custom
-  namespace: kube-system
-data:
-  bstein-dev.server: |
-    bstein.dev:53 {
-      errors
-      cache 30
-      hosts {
-        192.168.22.9 alerts.bstein.dev
-        192.168.22.9 auth.bstein.dev
-        192.168.22.9 bstein.dev
-        10.43.6.87 budget.bstein.dev
-        192.168.22.9 call.live.bstein.dev
-        192.168.22.9 cd.bstein.dev
-        192.168.22.9 chat.ai.bstein.dev
-        192.168.22.9 ci.bstein.dev
-        192.168.22.9 cloud.bstein.dev
-        192.168.22.9 health.bstein.dev
-        192.168.22.9 kit.live.bstein.dev
-        192.168.22.9 live.bstein.dev
-        192.168.22.9 logs.bstein.dev
-        192.168.22.9 longhorn.bstein.dev
-        192.168.22.4 mail.bstein.dev
-        192.168.22.9 matrix.live.bstein.dev
-        192.168.22.9 metrics.bstein.dev
-        192.168.22.9 monero.bstein.dev
-        10.43.6.87 money.bstein.dev
-        192.168.22.9 notes.bstein.dev
-        192.168.22.9 office.bstein.dev
-        192.168.22.9 pegasus.bstein.dev
-        3.136.224.193 pm-bounces.bstein.dev
-        3.150.68.49 pm-bounces.bstein.dev
-        18.189.137.81 pm-bounces.bstein.dev
-        192.168.22.9 registry.bstein.dev
-        192.168.22.9 scm.bstein.dev
-        192.168.22.9 secret.bstein.dev
-        192.168.22.9 sso.bstein.dev
-        192.168.22.9 stream.bstein.dev
-        192.168.22.9 tasks.bstein.dev
-        192.168.22.9 vault.bstein.dev
-        fallthrough
-      }
-    }
infrastructure/core/coredns-deployment.yaml (deleted)
@@ -1,141 +0,0 @@
-# infrastructure/core/coredns-deployment.yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: coredns
-  namespace: kube-system
-  labels:
-    k8s-app: kube-dns
-    kubernetes.io/name: CoreDNS
-spec:
-  progressDeadlineSeconds: 600
-  replicas: 2
-  revisionHistoryLimit: 0
-  selector:
-    matchLabels:
-      k8s-app: kube-dns
-  strategy:
-    type: RollingUpdate
-    rollingUpdate:
-      maxSurge: 25%
-      maxUnavailable: 1
-  template:
-    metadata:
-      labels:
-        k8s-app: kube-dns
-    spec:
-      containers:
-        - name: coredns
-          image: registry.bstein.dev/infra/coredns:1.12.1
-          imagePullPolicy: IfNotPresent
-          args:
-            - -conf
-            - /etc/coredns/Corefile
-          ports:
-            - containerPort: 53
-              name: dns
-              protocol: UDP
-            - containerPort: 53
-              name: dns-tcp
-              protocol: TCP
-            - containerPort: 9153
-              name: metrics
-              protocol: TCP
-          livenessProbe:
-            httpGet:
-              path: /health
-              port: 8080
-              scheme: HTTP
-            initialDelaySeconds: 60
-            periodSeconds: 10
-            timeoutSeconds: 1
-            successThreshold: 1
-            failureThreshold: 3
-          readinessProbe:
-            httpGet:
-              path: /ready
-              port: 8181
-              scheme: HTTP
-            periodSeconds: 2
-            timeoutSeconds: 1
-            successThreshold: 1
-            failureThreshold: 3
-          resources:
-            limits:
-              memory: 170Mi
-            requests:
-              cpu: 100m
-              memory: 70Mi
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              add:
-                - NET_BIND_SERVICE
-              drop:
-                - all
-            readOnlyRootFilesystem: true
-          volumeMounts:
-            - name: config-volume
-              mountPath: /etc/coredns
-              readOnly: true
-            - name: custom-config-volume
-              mountPath: /etc/coredns/custom
-              readOnly: true
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
-                  - key: node-role.kubernetes.io/worker
-                    operator: In
-                    values:
-                      - "true"
-      dnsPolicy: Default
-      nodeSelector:
-        kubernetes.io/os: linux
-      priorityClassName: system-cluster-critical
-      restartPolicy: Always
-      schedulerName: default-scheduler
-      serviceAccountName: coredns
-      tolerations:
-        - key: CriticalAddonsOnly
-          operator: Exists
-        - key: node-role.kubernetes.io/control-plane
-          operator: Exists
-          effect: NoSchedule
-        - key: node-role.kubernetes.io/master
-          operator: Exists
-          effect: NoSchedule
-      topologySpreadConstraints:
-        - maxSkew: 1
-          topologyKey: kubernetes.io/hostname
-          whenUnsatisfiable: DoNotSchedule
-          labelSelector:
-            matchLabels:
-              k8s-app: kube-dns
-        - maxSkew: 1
-          topologyKey: topology.kubernetes.io/zone
-          whenUnsatisfiable: ScheduleAnyway
-          labelSelector:
-            matchLabels:
-              k8s-app: kube-dns
-      volumes:
-        - name: config-volume
-          configMap:
-            name: coredns
-            defaultMode: 420
-            items:
-              - key: Corefile
-                path: Corefile
-              - key: NodeHosts
-                path: NodeHosts
-        - name: custom-config-volume
-          configMap:
-            name: coredns-custom
-            optional: true
-            defaultMode: 420
infrastructure/core/kustomization.yaml (file name not captured; inferred from the removed entries)
@@ -4,8 +4,5 @@ kind: Kustomization
 resources:
 - ../modules/base
 - ../modules/profiles/atlas-ha
-- coredns-custom.yaml
-- coredns-deployment.yaml
-- ntp-sync-daemonset.yaml
 - ../sources/cert-manager/letsencrypt.yaml
 - ../sources/cert-manager/letsencrypt-prod.yaml
infrastructure/core/ntp-sync-daemonset.yaml (deleted)
@@ -1,50 +0,0 @@
-# infrastructure/core/ntp-sync-daemonset.yaml
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
-  name: ntp-sync
-  namespace: kube-system
-  labels:
-    app: ntp-sync
-spec:
-  selector:
-    matchLabels:
-      app: ntp-sync
-  template:
-    metadata:
-      labels:
-        app: ntp-sync
-    spec:
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node-role.kubernetes.io/control-plane
-                    operator: DoesNotExist
-                  - key: node-role.kubernetes.io/master
-                    operator: DoesNotExist
-      containers:
-        - name: ntp-sync
-          image: public.ecr.aws/docker/library/busybox:1.36.1
-          imagePullPolicy: IfNotPresent
-          command: ["/bin/sh", "-c"]
-          args:
-            - |
-              set -eu
-              while true; do
-                ntpd -q -p pool.ntp.org || true
-                sleep 300
-              done
-          securityContext:
-            capabilities:
-              add: ["SYS_TIME"]
-            runAsUser: 0
-            runAsGroup: 0
-          resources:
-            requests:
-              cpu: 10m
-              memory: 16Mi
-            limits:
-              cpu: 50m
-              memory: 64Mi
infrastructure/longhorn/adopt/kustomization.yaml (deleted)
@@ -1,15 +0,0 @@
-# infrastructure/longhorn/adopt/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - longhorn-adopt-rbac.yaml
-  - longhorn-helm-adopt-job.yaml
-
-configMapGenerator:
-  - name: longhorn-helm-adopt-script
-    namespace: longhorn-system
-    files:
-      - longhorn_helm_adopt.sh=scripts/longhorn_helm_adopt.sh
-    options:
-      disableNameSuffixHash: true

infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml (deleted)
@@ -1,56 +0,0 @@
-# infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: longhorn-helm-adopt
-  namespace: longhorn-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: longhorn-helm-adopt
-rules:
-  - apiGroups: [""]
-    resources:
-      - configmaps
-      - services
-      - serviceaccounts
-      - secrets
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["apps"]
-    resources:
-      - deployments
-      - daemonsets
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["batch"]
-    resources:
-      - jobs
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["rbac.authorization.k8s.io"]
-    resources:
-      - roles
-      - rolebindings
-      - clusterroles
-      - clusterrolebindings
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["apiextensions.k8s.io"]
-    resources:
-      - customresourcedefinitions
-    verbs: ["get", "list", "watch", "patch", "update"]
-  - apiGroups: ["scheduling.k8s.io"]
-    resources:
-      - priorityclasses
-    verbs: ["get", "list", "watch", "patch", "update"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: longhorn-helm-adopt
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: longhorn-helm-adopt
-subjects:
-  - kind: ServiceAccount
-    name: longhorn-helm-adopt
-    namespace: longhorn-system
infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml (deleted)
@@ -1,40 +0,0 @@
-# infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: longhorn-helm-adopt-2
-  namespace: longhorn-system
-spec:
-  backoffLimit: 1
-  template:
-    spec:
-      serviceAccountName: longhorn-helm-adopt
-      restartPolicy: Never
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: node-role.kubernetes.io/worker
-                    operator: Exists
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 100
-              preference:
-                matchExpressions:
-                  - key: kubernetes.io/arch
-                    operator: In
-                    values: ["arm64"]
-      containers:
-        - name: adopt
-          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
-          command: ["/usr/bin/env", "bash"]
-          args: ["/scripts/longhorn_helm_adopt.sh"]
-          volumeMounts:
-            - name: script
-              mountPath: /scripts
-              readOnly: true
-      volumes:
-        - name: script
-          configMap:
-            name: longhorn-helm-adopt-script
-            defaultMode: 0555

infrastructure/longhorn/adopt/namespace.yaml (deleted)
@@ -1,5 +0,0 @@
-# infrastructure/longhorn/adopt/namespace.yaml
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: longhorn-system
infrastructure/longhorn/adopt/scripts/longhorn_helm_adopt.sh (deleted; path from the configMapGenerator above)
@@ -1,52 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-release_name="longhorn"
-release_namespace="longhorn-system"
-selector="app.kubernetes.io/instance=${release_name}"
-
-annotate_and_label() {
-  local scope="$1"
-  local kind="$2"
-  if [ "${scope}" = "namespaced" ]; then
-    kubectl -n "${release_namespace}" annotate "${kind}" -l "${selector}" \
-      meta.helm.sh/release-name="${release_name}" \
-      meta.helm.sh/release-namespace="${release_namespace}" \
-      --overwrite >/dev/null 2>&1 || true
-    kubectl -n "${release_namespace}" label "${kind}" -l "${selector}" \
-      app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
-  else
-    kubectl annotate "${kind}" -l "${selector}" \
-      meta.helm.sh/release-name="${release_name}" \
-      meta.helm.sh/release-namespace="${release_namespace}" \
-      --overwrite >/dev/null 2>&1 || true
-    kubectl label "${kind}" -l "${selector}" \
-      app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
-  fi
-}
-
-namespaced_kinds=(
-  configmap
-  service
-  serviceaccount
-  deployment
-  daemonset
-  job
-  role
-  rolebinding
-)
-
-cluster_kinds=(
-  clusterrole
-  clusterrolebinding
-  customresourcedefinition
-  priorityclass
-)
-
-for kind in "${namespaced_kinds[@]}"; do
-  annotate_and_label "namespaced" "${kind}"
-done
-
-for kind in "${cluster_kinds[@]}"; do
-  annotate_and_label "cluster" "${kind}"
-done
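Aside: the script applies the standard Helm adoption pattern: stamp pre-existing objects with Helm's ownership metadata so a Helm release (here, Flux's HelmRelease) can take them over without "invalid ownership metadata" errors. A rough Python equivalent via subprocess, mirroring the script's selector and release values:

import subprocess

def adopt(kind: str, namespace: str | None = None) -> None:
    # Annotate and label everything matching the Longhorn instance selector.
    base = ["kubectl"] + (["-n", namespace] if namespace else [])
    selector = ["-l", "app.kubernetes.io/instance=longhorn"]
    subprocess.run(base + ["annotate", kind, *selector,
                           "meta.helm.sh/release-name=longhorn",
                           "meta.helm.sh/release-namespace=longhorn-system",
                           "--overwrite"], check=False)
    subprocess.run(base + ["label", kind, *selector,
                           "app.kubernetes.io/managed-by=Helm",
                           "--overwrite"], check=False)

adopt("deployment", "longhorn-system")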
infrastructure/longhorn/core/helmrelease.yaml (deleted)
@@ -1,80 +0,0 @@
-# infrastructure/longhorn/core/helmrelease.yaml
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: longhorn
-  namespace: longhorn-system
-spec:
-  interval: 30m
-  chart:
-    spec:
-      chart: longhorn
-      version: 1.8.2
-      sourceRef:
-        kind: HelmRepository
-        name: longhorn
-        namespace: flux-system
-  install:
-    crds: Skip
-    remediation: { retries: 3 }
-    timeout: 15m
-  upgrade:
-    crds: Skip
-    remediation:
-      retries: 3
-      remediateLastFailure: true
-    cleanupOnFail: true
-    timeout: 15m
-  values:
-    service:
-      ui:
-        type: NodePort
-        nodePort: 30824
-    privateRegistry:
-      createSecret: false
-      registrySecret: longhorn-registry
-    image:
-      pullPolicy: Always
-      longhorn:
-        engine:
-          repository: registry.bstein.dev/infra/longhorn-engine
-          tag: v1.8.2
-        manager:
-          repository: registry.bstein.dev/infra/longhorn-manager
-          tag: v1.8.2
-        ui:
-          repository: registry.bstein.dev/infra/longhorn-ui
-          tag: v1.8.2
-        instanceManager:
-          repository: registry.bstein.dev/infra/longhorn-instance-manager
-          tag: v1.8.2
-        shareManager:
-          repository: registry.bstein.dev/infra/longhorn-share-manager
-          tag: v1.8.2
-        backingImageManager:
-          repository: registry.bstein.dev/infra/longhorn-backing-image-manager
-          tag: v1.8.2
-        supportBundleKit:
-          repository: registry.bstein.dev/infra/longhorn-support-bundle-kit
-          tag: v0.0.56
-      csi:
-        attacher:
-          repository: registry.bstein.dev/infra/longhorn-csi-attacher
-          tag: v4.9.0
-        provisioner:
-          repository: registry.bstein.dev/infra/longhorn-csi-provisioner
-          tag: v5.3.0
-        nodeDriverRegistrar:
-          repository: registry.bstein.dev/infra/longhorn-csi-node-driver-registrar
-          tag: v2.14.0
-        resizer:
-          repository: registry.bstein.dev/infra/longhorn-csi-resizer
-          tag: v1.13.2
-        snapshotter:
-          repository: registry.bstein.dev/infra/longhorn-csi-snapshotter
-          tag: v8.2.0
-        livenessProbe:
-          repository: registry.bstein.dev/infra/longhorn-livenessprobe
-          tag: v2.16.0
-    defaultSettings:
-      systemManagedPodsImagePullPolicy: Always
infrastructure/longhorn/core/kustomization.yaml (deleted)
@@ -1,18 +0,0 @@
-# infrastructure/longhorn/core/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - vault-serviceaccount.yaml
-  - secretproviderclass.yaml
-  - vault-sync-deployment.yaml
-  - helmrelease.yaml
-  - longhorn-settings-ensure-job.yaml
-
-configMapGenerator:
-  - name: longhorn-settings-ensure-script
-    files:
-      - longhorn_settings_ensure.sh=scripts/longhorn_settings_ensure.sh
-
-generatorOptions:
-  disableNameSuffixHash: true
infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml (deleted)
@@ -1,36 +0,0 @@
-# infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: longhorn-settings-ensure-4
-  namespace: longhorn-system
-spec:
-  backoffLimit: 0
-  ttlSecondsAfterFinished: 3600
-  template:
-    spec:
-      serviceAccountName: longhorn-service-account
-      restartPolicy: Never
-      volumes:
-        - name: longhorn-settings-ensure-script
-          configMap:
-            name: longhorn-settings-ensure-script
-            defaultMode: 0555
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: kubernetes.io/arch
-                    operator: In
-                    values: ["arm64"]
-                  - key: node-role.kubernetes.io/worker
-                    operator: Exists
-      containers:
-        - name: apply
-          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
-          command: ["/scripts/longhorn_settings_ensure.sh"]
-          volumeMounts:
-            - name: longhorn-settings-ensure-script
-              mountPath: /scripts
-              readOnly: true

infrastructure/longhorn/core/namespace.yaml (deleted)
@@ -1,5 +0,0 @@
-# infrastructure/longhorn/core/namespace.yaml
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: longhorn-system
infrastructure/longhorn/core/scripts/longhorn_settings_ensure.sh (deleted; path from the configMapGenerator above)
@@ -1,42 +0,0 @@
-#!/usr/bin/env sh
-set -eu
-
-# Longhorn blocks direct CR patches for some settings; use the internal API instead.
-
-api_base="http://longhorn-backend.longhorn-system.svc:9500/v1/settings"
-
-wait_for_api() {
-  attempts=30
-  while [ "${attempts}" -gt 0 ]; do
-    if curl -fsS "${api_base}" >/dev/null 2>&1; then
-      return 0
-    fi
-    attempts=$((attempts - 1))
-    sleep 2
-  done
-  echo "Longhorn API not ready after retries." >&2
-  return 1
-}
-
-update_setting() {
-  name="$1"
-  value="$2"
-
-  current="$(curl -fsS "${api_base}/${name}" || true)"
-  if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
-    echo "Setting ${name} already set."
-    return 0
-  fi
-
-  echo "Setting ${name} -> ${value}"
-  curl -fsS -X PUT \
-    -H "Content-Type: application/json" \
-    -d "{\"value\":\"${value}\"}" \
-    "${api_base}/${name}" >/dev/null
-}
-
-wait_for_api
-update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v1.8.2"
-update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
-update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
-update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"
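Aside: update_setting above is an idempotent read-compare-PUT against Longhorn's settings API. The same round trip in Python, assuming the same in-cluster service URL; the setting shown is one of those the script applies:

import requests

API_BASE = "http://longhorn-backend.longhorn-system.svc:9500/v1/settings"

def update_setting(name: str, value: str) -> None:
    # Skip the write when the setting already holds the desired value.
    current = requests.get(f"{API_BASE}/{name}", timeout=10).json()
    if current.get("value") == value:
        print(f"Setting {name} already set.")
        return
    print(f"Setting {name} -> {value}")
    resp = requests.put(f"{API_BASE}/{name}", json={"value": value}, timeout=10)
    resp.raise_for_status()

update_setting("default-engine-image", "registry.bstein.dev/infra/longhorn-engine:v1.8.2")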
@@ -1,21 +0,0 @@
-# infrastructure/longhorn/core/secretproviderclass.yaml
-apiVersion: secrets-store.csi.x-k8s.io/v1
-kind: SecretProviderClass
-metadata:
-  name: longhorn-vault
-  namespace: longhorn-system
-spec:
-  provider: vault
-  parameters:
-    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
-    roleName: "longhorn"
-    objects: |
-      - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/shared/harbor-pull"
-        secretKey: "dockerconfigjson"
-  secretObjects:
-    - secretName: longhorn-registry
-      type: kubernetes.io/dockerconfigjson
-      data:
-        - objectName: harbor-pull__dockerconfigjson
-          key: .dockerconfigjson
@@ -1,6 +0,0 @@
-# infrastructure/longhorn/core/vault-serviceaccount.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: longhorn-vault-sync
-  namespace: longhorn-system
@@ -1,45 +0,0 @@
-# infrastructure/longhorn/core/vault-sync-deployment.yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: longhorn-vault-sync
-  namespace: longhorn-system
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: longhorn-vault-sync
-  template:
-    metadata:
-      labels:
-        app: longhorn-vault-sync
-    spec:
-      serviceAccountName: longhorn-vault-sync
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-      affinity:
-        nodeAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 80
-              preference:
-                matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values: ["rpi5", "rpi4"]
-      containers:
-        - name: sync
-          image: alpine:3.20
-          command: ["/bin/sh", "-c"]
-          args:
-            - "sleep infinity"
-          volumeMounts:
-            - name: vault-secrets
-              mountPath: /vault/secrets
-              readOnly: true
-      volumes:
-        - name: vault-secrets
-          csi:
-            driver: secrets-store.csi.k8s.io
-            readOnly: true
-            volumeAttributes:
-              secretProviderClass: longhorn-vault
@@ -2,7 +2,6 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-  - serviceaccount.yaml
-  - oauth2-proxy-longhorn.yaml
   - middleware.yaml
   - ingress.yaml
+  - oauth2-proxy-longhorn.yaml
@@ -32,18 +32,7 @@ spec:
     metadata:
       labels:
         app: oauth2-proxy-longhorn
-      annotations:
-        vault.hashicorp.com/agent-inject: "true"
-        vault.hashicorp.com/role: "longhorn"
-        vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/longhorn/oauth2-proxy"
-        vault.hashicorp.com/agent-inject-template-oidc-config: |
-          {{- with secret "kv/data/atlas/longhorn/oauth2-proxy" -}}
-          client_id = "{{ .Data.data.client_id }}"
-          client_secret = "{{ .Data.data.client_secret }}"
-          cookie_secret = "{{ .Data.data.cookie_secret }}"
-          {{- end -}}
     spec:
-      serviceAccountName: longhorn-vault
       nodeSelector:
        node-role.kubernetes.io/worker: "true"
      affinity:
@@ -61,7 +50,6 @@ spec:
        imagePullPolicy: IfNotPresent
        args:
        - --provider=oidc
-        - --config=/vault/secrets/oidc-config
        - --redirect-url=https://longhorn.bstein.dev/oauth2/callback
        - --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
        - --scope=openid profile email groups
@@ -81,6 +69,22 @@ spec:
        - --skip-jwt-bearer-tokens=true
        - --oidc-groups-claim=groups
        - --cookie-domain=longhorn.bstein.dev
+        env:
+        - name: OAUTH2_PROXY_CLIENT_ID
+          valueFrom:
+            secretKeyRef:
+              name: oauth2-proxy-longhorn-oidc
+              key: client_id
+        - name: OAUTH2_PROXY_CLIENT_SECRET
+          valueFrom:
+            secretKeyRef:
+              name: oauth2-proxy-longhorn-oidc
+              key: client_secret
+        - name: OAUTH2_PROXY_COOKIE_SECRET
+          valueFrom:
+            secretKeyRef:
+              name: oauth2-proxy-longhorn-oidc
+              key: cookie_secret
        ports:
        - containerPort: 4180
          name: http
@@ -1,6 +0,0 @@
-# infrastructure/longhorn/ui-ingress/serviceaccount.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: longhorn-vault
-  namespace: longhorn-system
@@ -1,47 +0,0 @@
-# infrastructure/metallb/helmrelease.yaml
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: metallb
-  namespace: metallb-system
-spec:
-  interval: 30m
-  chart:
-    spec:
-      chart: metallb
-      version: 0.15.3
-      sourceRef:
-        kind: HelmRepository
-        name: metallb
-        namespace: flux-system
-  install:
-    crds: CreateReplace
-    remediation: { retries: 3 }
-    timeout: 10m
-  upgrade:
-    crds: CreateReplace
-    remediation:
-      retries: 3
-      remediateLastFailure: true
-    cleanupOnFail: true
-    timeout: 10m
-  values:
-    loadBalancerClass: metallb
-    prometheus:
-      metricsPort: 7472
-    controller:
-      logLevel: info
-      webhookMode: enabled
-      tlsMinVersion: VersionTLS12
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi4
-                      - rpi5
-    speaker:
-      logLevel: info
@@ -3,5 +3,8 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - namespace.yaml
-  - helmrelease.yaml
+  - metallb-rendered.yaml
   - ippool.yaml
+patchesStrategicMerge:
+  - patches/node-placement.yaml
+  - patches/speaker-loglevel.yaml
2411  infrastructure/metallb/metallb-rendered.yaml  Normal file (diff suppressed because it is too large)
27  infrastructure/metallb/patches/node-placement.yaml  Normal file
@@ -0,0 +1,27 @@
+# infrastructure/metallb/patches/node-placement.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: metallb-controller
+  namespace: metallb-system
+spec:
+  template:
+    spec:
+      containers:
+        - name: controller
+          args:
+            - --port=7472
+            - --log-level=info
+            - --webhook-mode=enabled
+            - --tls-min-version=VersionTLS12
+            - --lb-class=metallb
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: hardware
+                    operator: In
+                    values:
+                      - rpi4
+                      - rpi5
15  infrastructure/metallb/patches/speaker-loglevel.yaml  Normal file
@@ -0,0 +1,15 @@
+# infrastructure/metallb/patches/speaker-loglevel.yaml
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: metallb-speaker
+  namespace: metallb-system
+spec:
+  template:
+    spec:
+      containers:
+        - name: speaker
+          args:
+            - --port=7472
+            - --log-level=info
+            - --lb-class=metallb
@@ -1,24 +0,0 @@
-# infrastructure/modules/base/storageclass/asteria-encrypted.yaml
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
-  name: asteria-encrypted
-parameters:
-  diskSelector: asteria
-  fromBackup: ""
-  numberOfReplicas: "2"
-  staleReplicaTimeout: "30"
-  fsType: "ext4"
-  replicaAutoBalance: "least-effort"
-  dataLocality: "disabled"
-  encrypted: "true"
-  csi.storage.k8s.io/provisioner-secret-name: ${pvc.name}
-  csi.storage.k8s.io/provisioner-secret-namespace: ${pvc.namespace}
-  csi.storage.k8s.io/node-publish-secret-name: ${pvc.name}
-  csi.storage.k8s.io/node-publish-secret-namespace: ${pvc.namespace}
-  csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}
-  csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace}
-provisioner: driver.longhorn.io
-reclaimPolicy: Retain
-allowVolumeExpansion: true
-volumeBindingMode: Immediate
@@ -3,5 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - asteria.yaml
-  - asteria-encrypted.yaml
   - astreae.yaml
@@ -11,5 +11,5 @@ spec:
     roleName: "postgres"
     objects: |
       - objectName: "postgres_password"
-        secretPath: "kv/data/atlas/postgres/postgres-db"
+        secretPath: "kv/data/postgres"
         secretKey: "POSTGRES_PASSWORD"
@@ -4,10 +4,6 @@ kind: Service
 metadata:
   name: postgres-service
   namespace: postgres
-  annotations:
-    prometheus.io/scrape: "true"
-    prometheus.io/port: "9187"
-    prometheus.io/path: "/metrics"
 spec:
   clusterIP: None
   ports:
@@ -15,9 +11,5 @@ spec:
       port: 5432
       protocol: TCP
       targetPort: 5432
-    - name: metrics
-      port: 9187
-      protocol: TCP
-      targetPort: 9187
   selector:
     app: postgres
@@ -58,23 +58,6 @@ spec:
             - name: vault-secrets
               mountPath: /mnt/vault
               readOnly: true
-        - name: postgres-exporter
-          image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0
-          ports:
-            - name: metrics
-              containerPort: 9187
-              protocol: TCP
-          env:
-            - name: DATA_SOURCE_URI
-              value: "localhost:5432/postgres?sslmode=disable"
-            - name: DATA_SOURCE_USER
-              value: postgres
-            - name: DATA_SOURCE_PASS_FILE
-              value: /mnt/vault/postgres_password
-          volumeMounts:
-            - name: vault-secrets
-              mountPath: /mnt/vault
-              readOnly: true
       volumes:
         - name: vault-secrets
           csi:
@@ -1,11 +1,10 @@
-# infrastructure/sources/cert-manager/letsencrypt-prod.yaml
 apiVersion: cert-manager.io/v1
 kind: ClusterIssuer
 metadata:
   name: letsencrypt-prod
 spec:
   acme:
-    email: brad@bstein.dev
+    email: brad.stein@gmail.com
     server: https://acme-v02.api.letsencrypt.org/directory
     privateKeySecretRef:
       name: letsencrypt-prod-account-key
@@ -1,11 +1,10 @@
-# infrastructure/sources/cert-manager/letsencrypt.yaml
 apiVersion: cert-manager.io/v1
 kind: ClusterIssuer
 metadata:
   name: letsencrypt
 spec:
   acme:
-    email: brad@bstein.dev
+    email: brad.stein@gmail.com
     server: https://acme-v02.api.letsencrypt.org/directory
     privateKeySecretRef:
       name: letsencrypt-account-key
@@ -1,9 +0,0 @@
-# infrastructure/sources/helm/ananace.yaml
-apiVersion: source.toolkit.fluxcd.io/v1
-kind: HelmRepository
-metadata:
-  name: ananace
-  namespace: flux-system
-spec:
-  interval: 1h
-  url: https://ananace.gitlab.io/charts
@@ -2,18 +2,15 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-  - ananace.yaml
   - fluent-bit.yaml
   - grafana.yaml
   - hashicorp.yaml
   - jetstack.yaml
   - jenkins.yaml
   - mailu.yaml
-  - metallb.yaml
   - opentelemetry.yaml
   - opensearch.yaml
   - harbor.yaml
-  - longhorn.yaml
   - prometheus.yaml
   - victoria-metrics.yaml
   - secrets-store-csi.yaml
@@ -1,9 +0,0 @@
-# infrastructure/sources/helm/longhorn.yaml
-apiVersion: source.toolkit.fluxcd.io/v1
-kind: HelmRepository
-metadata:
-  name: longhorn
-  namespace: flux-system
-spec:
-  interval: 30m
-  url: https://charts.longhorn.io
@@ -1,9 +0,0 @@
-# infrastructure/sources/helm/metallb.yaml
-apiVersion: source.toolkit.fluxcd.io/v1
-kind: HelmRepository
-metadata:
-  name: metallb
-  namespace: flux-system
-spec:
-  interval: 1h
-  url: https://metallb.github.io/metallb
File diff suppressed because it is too large
@@ -27,8 +27,6 @@ items:
       creationTimestamp: null
       labels:
         app: traefik
-        app.kubernetes.io/instance: traefik-kube-system
-        app.kubernetes.io/name: traefik
     spec:
       containers:
       - args:
@@ -5,7 +5,6 @@ metadata:
   name: traefik
   namespace: flux-system
 resources:
-  - crds.yaml
   - deployment.yaml
   - serviceaccount.yaml
   - clusterrole.yaml
@@ -3,10 +3,9 @@ apiVersion: v1
 kind: Service
 metadata:
   name: traefik
-  namespace: traefik
+  namespace: kube-system
   annotations:
     metallb.universe.tf/address-pool: communication-pool
-    metallb.universe.tf/allow-shared-ip: traefik
 spec:
   type: LoadBalancer
   loadBalancerClass: metallb
@@ -21,4 +20,5 @@ spec:
       targetPort: websecure
       protocol: TCP
   selector:
-    app: traefik
+    app.kubernetes.io/instance: traefik-kube-system
+    app.kubernetes.io/name: traefik
@@ -17,5 +17,4 @@ spec:
   values:
     syncSecret:
       enabled: true
-    enableSecretRotation: true
-    rotationPollInterval: 2m
+    enableSecretRotation: false
@@ -1,43 +0,0 @@
-# infrastructure/vault-injector/helmrelease.yaml
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: vault-injector
-  namespace: vault
-spec:
-  interval: 30m
-  chart:
-    spec:
-      chart: vault
-      version: 0.31.0
-      sourceRef:
-        kind: HelmRepository
-        name: hashicorp
-        namespace: flux-system
-  install:
-    remediation: { retries: 3 }
-    timeout: 10m
-  upgrade:
-    remediation:
-      retries: 3
-      remediateLastFailure: true
-    cleanupOnFail: true
-    timeout: 10m
-  values:
-    global:
-      externalVaultAddr: http://vault.vault.svc.cluster.local:8200
-      tlsDisable: true
-    server:
-      enabled: false
-    csi:
-      enabled: false
-    injector:
-      enabled: true
-      replicas: 1
-      agentImage:
-        repository: hashicorp/vault
-        tag: "1.17.6"
-      webhook:
-        failurePolicy: Ignore
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
@@ -1,5 +0,0 @@
-# infrastructure/vault-injector/kustomization.yaml
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - helmrelease.yaml
@@ -1,8 +1,8 @@
 {
   "counts": {
-    "helmrelease_host_hints": 19,
-    "http_endpoints": 45,
-    "services": 47,
-    "workloads": 74
+    "helmrelease_host_hints": 7,
+    "http_endpoints": 35,
+    "services": 44,
+    "workloads": 49
   }
 }
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -17,11 +17,6 @@ flowchart LR
     host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
     wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
     svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
-    host_budget_bstein_dev["budget.bstein.dev"]
-    svc_finance_actual_budget["finance/actual-budget (Service)"]
-    host_budget_bstein_dev --> svc_finance_actual_budget
-    wl_finance_actual_budget["finance/actual-budget (Deployment)"]
-    svc_finance_actual_budget --> wl_finance_actual_budget
     host_call_live_bstein_dev["call.live.bstein.dev"]
     svc_comms_element_call["comms/element-call (Service)"]
     host_call_live_bstein_dev --> svc_comms_element_call
@@ -42,11 +37,6 @@ flowchart LR
     host_cloud_bstein_dev --> svc_nextcloud_nextcloud
     wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
     svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
-    host_health_bstein_dev["health.bstein.dev"]
-    svc_health_wger["health/wger (Service)"]
-    host_health_bstein_dev --> svc_health_wger
-    wl_health_wger["health/wger (Deployment)"]
-    svc_health_wger --> wl_health_wger
     host_kit_live_bstein_dev["kit.live.bstein.dev"]
     svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
     host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@@ -57,22 +47,15 @@ flowchart LR
     wl_comms_livekit["comms/livekit (Deployment)"]
     svc_comms_livekit --> wl_comms_livekit
     host_live_bstein_dev["live.bstein.dev"]
+    svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
+    host_live_bstein_dev --> svc_comms_othrys_element_element_web
+    wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
+    svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
     host_live_bstein_dev --> svc_comms_matrix_wellknown
     svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
     host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
-    svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
-    host_live_bstein_dev --> svc_comms_matrix_guest_register
-    wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
-    svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
-    svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
-    host_live_bstein_dev --> svc_comms_matrix_authentication_service
-    wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
-    svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
-    host_logs_bstein_dev["logs.bstein.dev"]
-    svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
-    host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
-    wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
-    svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
+    wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
+    svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
     host_longhorn_bstein_dev["longhorn.bstein.dev"]
     svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
     host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
@@ -82,25 +65,21 @@ flowchart LR
     svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
     host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
     host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
+    svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
     host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
+    wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
+    svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
     host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
     host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
+    svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
     host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
+    wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
+    svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
     host_monero_bstein_dev["monero.bstein.dev"]
     svc_crypto_monerod["crypto/monerod (Service)"]
     host_monero_bstein_dev --> svc_crypto_monerod
     wl_crypto_monerod["crypto/monerod (Deployment)"]
     svc_crypto_monerod --> wl_crypto_monerod
-    host_money_bstein_dev["money.bstein.dev"]
-    svc_finance_firefly["finance/firefly (Service)"]
-    host_money_bstein_dev --> svc_finance_firefly
-    wl_finance_firefly["finance/firefly (Deployment)"]
-    svc_finance_firefly --> wl_finance_firefly
-    host_notes_bstein_dev["notes.bstein.dev"]
-    svc_outline_outline["outline/outline (Service)"]
-    host_notes_bstein_dev --> svc_outline_outline
-    wl_outline_outline["outline/outline (Deployment)"]
-    svc_outline_outline --> wl_outline_outline
     host_office_bstein_dev["office.bstein.dev"]
     svc_nextcloud_collabora["nextcloud/collabora (Service)"]
     host_office_bstein_dev --> svc_nextcloud_collabora
@@ -131,11 +110,6 @@ flowchart LR
     host_stream_bstein_dev --> svc_jellyfin_jellyfin
     wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
     svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
-    host_tasks_bstein_dev["tasks.bstein.dev"]
-    svc_planka_planka["planka/planka (Service)"]
-    host_tasks_bstein_dev --> svc_planka_planka
-    wl_planka_planka["planka/planka (Deployment)"]
-    svc_planka_planka --> wl_planka_planka
     host_vault_bstein_dev["vault.bstein.dev"]
     svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
     host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
@@ -159,30 +133,23 @@ flowchart LR
         wl_comms_livekit_token_service
         svc_comms_livekit
         wl_comms_livekit
+        svc_comms_othrys_element_element_web
+        wl_comms_othrys_element_element_web
         svc_comms_othrys_synapse_matrix_synapse
-        svc_comms_matrix_guest_register
-        wl_comms_matrix_guest_register
+        wl_comms_othrys_synapse_matrix_synapse
         svc_comms_matrix_authentication_service
         wl_comms_matrix_authentication_service
+        svc_comms_matrix_guest_register
+        wl_comms_matrix_guest_register
     end
     subgraph crypto[crypto]
         svc_crypto_monerod
         wl_crypto_monerod
     end
-    subgraph finance[finance]
-        svc_finance_actual_budget
-        wl_finance_actual_budget
-        svc_finance_firefly
-        wl_finance_firefly
-    end
     subgraph gitea[gitea]
         svc_gitea_gitea
         wl_gitea_gitea
     end
-    subgraph health[health]
-        svc_health_wger
-        wl_health_wger
-    end
     subgraph jellyfin[jellyfin]
         svc_jellyfin_pegasus
         wl_jellyfin_pegasus
@@ -193,10 +160,6 @@ flowchart LR
         svc_jenkins_jenkins
         wl_jenkins_jenkins
     end
-    subgraph logging[logging]
-        svc_logging_oauth2_proxy_logs
-        wl_logging_oauth2_proxy_logs
-    end
     subgraph longhorn_system[longhorn-system]
         svc_longhorn_system_oauth2_proxy_longhorn
         wl_longhorn_system_oauth2_proxy_longhorn
@@ -210,14 +173,6 @@ flowchart LR
         svc_nextcloud_collabora
         wl_nextcloud_collabora
     end
-    subgraph outline[outline]
-        svc_outline_outline
-        wl_outline_outline
-    end
-    subgraph planka[planka]
-        svc_planka_planka
-        wl_planka_planka
-    end
     subgraph sso[sso]
         svc_sso_oauth2_proxy
         wl_sso_oauth2_proxy
@@ -70,7 +70,6 @@ WORKER_NODES = [
     "titan-13",
     "titan-14",
     "titan-15",
-    "titan-16",
     "titan-17",
     "titan-18",
     "titan-19",
@@ -86,17 +85,19 @@ WORKER_TOTAL = len(WORKER_NODES)
 CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
 WORKER_SUFFIX = f"/{WORKER_TOTAL}"
 # Namespaces considered infrastructure (excluded from workload counts)
-INFRA_PATTERNS = [
-    "kube-.*",
-    ".*-system",
-    "traefik",
+INFRA_NAMESPACES = [
+    "kube-system",
+    "longhorn-system",
+    "metallb-system",
     "monitoring",
     "logging",
     "cert-manager",
+    "flux-system",
+    "traefik",
     "maintenance",
     "postgres",
 ]
-INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
+INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
 # Namespaces allowed on control plane without counting as workloads
 CP_ALLOWED_NS = INFRA_REGEX
 LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
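Worth a note on that change: the old wildcard patterns (`kube-.*`, `.*-system`) silently classified any future `*-system` namespace as infrastructure, while the explicit list only matches what is named. A quick Python sketch of the difference; the `media-system` namespace here is a made-up example, not one from the repo:

import re

old = re.compile(r"^(kube-.*|.*-system|traefik|monitoring)$")
new = re.compile(r"^(kube-system|longhorn-system|metallb-system|monitoring)$")

for ns in ("kube-system", "kube-public", "media-system"):
    # old matches all three; new matches only kube-system
    print(ns, bool(old.match(ns)), bool(new.match(ns)))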
@@ -208,66 +209,7 @@ def namespace_ram_raw(scope_var):


 def namespace_gpu_usage_instant(scope_var):
-    return gpu_usage_by_namespace(scope_var)
+    return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
-
-
-def jetson_gpu_util_by_node():
-    return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
-
-
-def dcgm_gpu_util_by_node():
-    dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
-    dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
-    return (
-        "avg by (node) ("
-        f"{dcgm_ns} * on(namespace,pod) group_left(node) "
-        'kube_pod_info{namespace="monitoring"}'
-        ")"
-    )
-
-
-def gpu_util_by_node():
-    return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"
-
-
-def gpu_util_by_hostname():
-    return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'
-
-
-def gpu_node_labels():
-    return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
-
-
-def gpu_requests_by_namespace_node(scope_var):
-    return (
-        "sum by (namespace,node) ("
-        f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
-        "* on(namespace,pod) group_left(node) kube_pod_info "
-        f"* on(node) group_left() ({gpu_node_labels()})"
-        ")"
-    )
-
-
-def gpu_usage_by_namespace(scope_var):
-    requests_by_ns = gpu_requests_by_namespace_node(scope_var)
-    total_by_node = f"sum by (node) ({requests_by_ns})"
-    return (
-        "sum by (namespace) ("
-        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
-        f"* on(node) group_left() ({gpu_util_by_node()})"
-        ")"
-    )
-
-
-def jetson_gpu_usage_by_namespace(scope_var):
-    requests_by_ns = jetson_gpu_requests(scope_var)
-    total_by_node = f"sum by (node) ({requests_by_ns})"
-    return (
-        "sum by (namespace) ("
-        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
-        f"* on(node) group_left() {jetson_gpu_util_by_node()}"
-        ")"
-    )


 def namespace_share_expr(resource_expr):
@@ -287,7 +229,7 @@ def namespace_gpu_share_expr(scope_var):
     usage = namespace_gpu_usage_instant(scope_var)
     total = f"(sum({usage}) or on() vector(0))"
     share = f"100 * ({usage}) / clamp_min({total}, 1)"
-    idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
+    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
     return f"({share}) or ({idle})"
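The idle-series change is subtle but meaningful: the old `* scalar(total == bool 0)` form always emitted an "idle" series, valued 0 whenever GPUs were busy, so it cluttered legends; `and on() (total == 0)` keeps the 100%-idle series only when total is actually zero. A standalone Python sketch of the two forms, with USAGE as a stand-in for the real usage expression:

# PromQL strings built the same way the generator builds them.
USAGE = "sum(DCGM_FI_DEV_GPU_UTIL) by (namespace)"  # placeholder
total = f"(sum({USAGE}) or on() vector(0))"

# Old: idle is always present; evaluates to 0 whenever total > 0.
idle_old = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"

# New: `and on()` drops the idle series entirely unless the right-hand
# filter (total == 0, a comparison without `bool`) returns a sample.
idle_new = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"

print(idle_old)
print(idle_new)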
@@ -377,76 +319,6 @@ NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
 NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
 NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
 NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
-GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
-GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
-GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
-GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
-GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})"
-GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
-GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})"
-GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
-GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
-GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
-GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
-GLUE_STALE_WINDOW_SEC = 36 * 3600
-GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
-GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
-GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
-GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
-GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)"
-GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)"
-GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)"
-ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
-ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
-ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
-ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
-ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
-ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
-ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
-ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
-ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
-ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))'
-ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))'
-ARIADNE_TASK_WARNINGS_SERIES = (
-    'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
-)
-ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
-ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
-ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
-    "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600"
-)
-ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
-    "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
-)
-ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
-ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
-ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
-ARIADNE_TEST_SUCCESS_RATE = (
-    "100 * "
-    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
-    "/ clamp_min("
-    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
-)
-ARIADNE_TEST_FAILURES_24H = (
-    'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
-)
-POSTGRES_CONN_USED = (
-    'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
-    'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
-)
-POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
-ONEOFF_JOB_OWNER = (
-    'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
-)
-ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
-ONEOFF_JOB_POD_AGE_HOURS = (
-    '((time() - kube_pod_start_time{pod!=""}) / 3600) '
-    f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
-    '* on(namespace,pod) group_left(phase) '
-    'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
-)
-GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600"
-GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600"
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
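The removed GLUE_* block is a nice example of composing kube-state-metrics cronjob series with `unless on(...)` to flag stale or never-succeeded jobs while ignoring suspended ones. A condensed Python sketch of that composition, using only the label and the 36h window that appear in the deleted constants:

# Flag glue-labelled cronjobs whose last success is older than 36h,
# excluding suspended ones; condensed from the deleted GLUE_* constants.
GLUE_JOBS = 'kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
STALE = f"((time() - {LAST_SUCCESS}) > bool {36 * 3600})"
# `unless` drops any cronjob that also appears in the suspended set.
STALE_ACTIVE = f"({STALE} unless on(namespace,cronjob) {SUSPENDED})"
print(STALE_ACTIVE)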
@@ -624,7 +496,6 @@ def timeseries_panel(
     grid,
     *,
     unit="none",
-    max_value=None,
     legend=None,
     legend_display="table",
     legend_placement="bottom",
@@ -649,8 +520,6 @@ def timeseries_panel(
             "tooltip": {"mode": "multi"},
         },
     }
-    if max_value is not None:
-        panel["fieldConfig"]["defaults"]["max"] = max_value
     if legend:
         panel["targets"][0]["legendFormat"] = legend
     if legend_calcs:
@@ -802,22 +671,13 @@ def bargauge_panel(
     grid,
     *,
     unit="none",
-    legend=None,
     links=None,
     limit=None,
-    sort_order="desc",
     thresholds=None,
     decimals=None,
     instant=False,
-    overrides=None,
 ):
     """Return a bar gauge panel with label-aware reduction."""
-    cleaned_expr = expr.strip()
-    if not cleaned_expr.startswith(("sort(", "sort_desc(")):
-        if sort_order == "desc":
-            expr = f"sort_desc({expr})"
-        elif sort_order == "asc":
-            expr = f"sort({expr})"
     panel = {
         "id": panel_id,
         "type": "bargauge",
@@ -825,12 +685,7 @@ def bargauge_panel(
         "datasource": PROM_DS,
         "gridPos": grid,
         "targets": [
-            {
-                "expr": expr,
-                "refId": "A",
-                "legendFormat": legend or "{{node}}",
-                **({"instant": True} if instant else {}),
-            }
+            {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
         ],
         "fieldConfig": {
             "defaults": {
@@ -860,8 +715,6 @@ def bargauge_panel(
             },
         },
     }
-    if overrides:
-        panel["fieldConfig"]["overrides"].extend(overrides)
     if decimals is not None:
         panel["fieldConfig"]["defaults"]["decimals"] = decimals
     if links:
@@ -870,7 +723,7 @@ def bargauge_panel(
     panel["transformations"] = [
         {
             "id": "sortBy",
-            "options": {"fields": ["Value"], "order": sort_order},
+            "options": {"fields": ["Value"], "order": "desc"},
         }
     ]
     if limit:
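Sorting responsibility moves from PromQL to Grafana here: the deleted wrapper (see the `@@ -802` hunk above) wrapped expressions in `sort_desc()`/`sort()` unless they were already sorted, while the new code leaves queries untouched and always attaches a `sortBy` transformation with a fixed descending order. A side-by-side sketch of the two approaches, reconstructed from the hunks above:

# Old approach (removed): sort on the query side, honoring a caller-chosen order.
def apply_sort(expr: str, sort_order: str = "desc") -> str:
    if expr.strip().startswith(("sort(", "sort_desc(")):
        return expr  # caller already sorted the query
    return f"sort_desc({expr})" if sort_order == "desc" else f"sort({expr})"

# New approach: leave the query alone and let Grafana sort the table.
SORT_TRANSFORM = {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}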
@@ -910,15 +763,6 @@ def build_overview():
             {"color": "red", "value": 3},
         ],
     }
-    age_thresholds = {
-        "mode": "absolute",
-        "steps": [
-            {"color": "green", "value": None},
-            {"color": "yellow", "value": 6},
-            {"color": "orange", "value": 24},
-            {"color": "red", "value": 48},
-        ],
-    }

     row1_stats = [
         {
@@ -1121,7 +965,7 @@ def build_overview():
             30,
             "Mail Sent (1d)",
             'max(postmark_outbound_sent{window="1d"})',
-            {"h": 3, "w": 4, "x": 0, "y": 8},
+            {"h": 2, "w": 6, "x": 0, "y": 8},
             unit="none",
             links=link_to("atlas-mail"),
         )
@@ -1132,7 +976,7 @@ def build_overview():
             "type": "stat",
             "title": "Mail Bounces (1d)",
             "datasource": PROM_DS,
-            "gridPos": {"h": 3, "w": 4, "x": 8, "y": 8},
+            "gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
             "targets": [
                 {
                     "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@@ -1178,7 +1022,7 @@ def build_overview():
             32,
             "Mail Success Rate (1d)",
             'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
-            {"h": 3, "w": 4, "x": 4, "y": 8},
+            {"h": 2, "w": 6, "x": 6, "y": 8},
             unit="percent",
             thresholds=mail_success_thresholds,
             decimals=1,
@@ -1190,38 +1034,13 @@ def build_overview():
             33,
             "Mail Limit Used (30d)",
             "max(postmark_sending_limit_used_percent)",
-            {"h": 3, "w": 4, "x": 12, "y": 8},
+            {"h": 2, "w": 6, "x": 18, "y": 8},
             unit="percent",
             thresholds=mail_limit_thresholds,
             decimals=1,
             links=link_to("atlas-mail"),
         )
     )
-    panels.append(
-        stat_panel(
-            34,
-            "Postgres Connections Used",
-            POSTGRES_CONN_USED,
-            {"h": 3, "w": 4, "x": 16, "y": 8},
-            decimals=0,
-            text_mode="name_and_value",
-            legend="{{conn}}",
-            instant=True,
-        )
-    )
-    panels.append(
-        stat_panel(
-            35,
-            "Postgres Hottest Connections",
-            POSTGRES_CONN_HOTTEST,
-            {"h": 3, "w": 4, "x": 20, "y": 8},
-            unit="none",
-            decimals=0,
-            text_mode="name_and_value",
-            legend="{{datname}}",
-            instant=True,
-        )
-    )

     storage_panels = [
         (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
@@ -1235,104 +1054,13 @@ def build_overview():
                 panel_id,
                 title,
                 expr,
-                {"h": 3, "w": 6, "x": 6 * idx, "y": 11},
+                {"h": 6, "w": 6, "x": 6 * idx, "y": 10},
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 links=link_to("atlas-storage"),
             )
         )
-    panels.append(
-        bargauge_panel(
-            40,
-            "One-off Job Pods (age hours)",
-            ONEOFF_JOB_POD_AGE_HOURS,
-            {"h": 6, "w": 6, "x": 0, "y": 14},
-            unit="h",
-            instant=True,
-            legend="{{namespace}}/{{pod}}",
-            thresholds=age_thresholds,
-            limit=8,
-            decimals=2,
-        )
-    )
-    panels.append(
-        {
-            "id": 41,
-            "type": "timeseries",
-            "title": "Ariadne Attempts / Failures",
-            "datasource": PROM_DS,
-            "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
-            "targets": [
-                {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
-                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
-            ],
-            "fieldConfig": {
-                "defaults": {"unit": "none"},
-                "overrides": [
-                    {
-                        "matcher": {"id": "byName", "options": "Attempts"},
-                        "properties": [
-                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
-                        ],
-                    },
-                    {
-                        "matcher": {"id": "byName", "options": "Failures"},
-                        "properties": [
-                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
-                        ],
-                    },
-                ],
-            },
-            "options": {
-                "legend": {"displayMode": "table", "placement": "right"},
-                "tooltip": {"mode": "multi"},
-            },
-        }
-    )
-    panels.append(
-        timeseries_panel(
-            42,
-            "Ariadne Test Success Rate",
-            ARIADNE_TEST_SUCCESS_RATE,
-            {"h": 6, "w": 6, "x": 12, "y": 14},
-            unit="percent",
-            max_value=100,
-            legend=None,
-            legend_display="list",
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            43,
-            "Tests with Failures (24h)",
-            ARIADNE_TEST_FAILURES_24H,
-            {"h": 6, "w": 6, "x": 18, "y": 14},
-            unit="none",
-            instant=True,
-            legend="{{result}}",
-            overrides=[
-                {
-                    "matcher": {"id": "byName", "options": "error"},
-                    "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
-                },
-                {
-                    "matcher": {"id": "byName", "options": "failed"},
-                    "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
-                },
-            ],
-            thresholds={
-                "mode": "absolute",
-                "steps": [
-                    {"color": "green", "value": None},
-                    {"color": "yellow", "value": 1},
-                    {"color": "orange", "value": 5},
-                    {"color": "red", "value": 10},
-                ],
-            },
-        )
-    )

     cpu_scope = "$namespace_scope_cpu"
     gpu_scope = "$namespace_scope_gpu"
     ram_scope = "$namespace_scope_ram"
@@ -1342,9 +1070,9 @@ def build_overview():
             11,
             "Namespace CPU Share",
             namespace_cpu_share_expr(cpu_scope),
-            {"h": 9, "w": 8, "x": 0, "y": 20},
+            {"h": 9, "w": 8, "x": 0, "y": 16},
             links=namespace_scope_links("namespace_scope_cpu"),
-            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
+            description="Values are normalized within the selected scope; use panel links to switch scope.",
         )
     )
     panels.append(
@@ -1352,9 +1080,9 @@ def build_overview():
             12,
             "Namespace GPU Share",
             namespace_gpu_share_expr(gpu_scope),
-            {"h": 9, "w": 8, "x": 8, "y": 20},
+            {"h": 9, "w": 8, "x": 8, "y": 16},
             links=namespace_scope_links("namespace_scope_gpu"),
-            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
+            description="Values are normalized within the selected scope; use panel links to switch scope.",
        )
    )
    panels.append(
@@ -1362,9 +1090,9 @@ def build_overview():
             13,
             "Namespace RAM Share",
             namespace_ram_share_expr(ram_scope),
-            {"h": 9, "w": 8, "x": 16, "y": 20},
+            {"h": 9, "w": 8, "x": 16, "y": 16},
             links=namespace_scope_links("namespace_scope_ram"),
-            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
+            description="Values are normalized within the selected scope; use panel links to switch scope.",
        )
    )
@ -1374,7 +1102,7 @@ def build_overview():
|
|||||||
14,
|
14,
|
||||||
"Worker Node CPU",
|
"Worker Node CPU",
|
||||||
node_cpu_expr(worker_filter),
|
node_cpu_expr(worker_filter),
|
||||||
{"h": 12, "w": 12, "x": 0, "y": 36},
|
{"h": 12, "w": 12, "x": 0, "y": 32},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_calcs=["last"],
|
legend_calcs=["last"],
|
||||||
@ -1388,7 +1116,7 @@ def build_overview():
|
|||||||
15,
|
15,
|
||||||
"Worker Node RAM",
|
"Worker Node RAM",
|
||||||
node_mem_expr(worker_filter),
|
node_mem_expr(worker_filter),
|
||||||
{"h": 12, "w": 12, "x": 12, "y": 36},
|
{"h": 12, "w": 12, "x": 12, "y": 32},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_calcs=["last"],
|
legend_calcs=["last"],
|
||||||
@ -1403,7 +1131,7 @@ def build_overview():
|
|||||||
16,
|
16,
|
||||||
"Control plane CPU",
|
"Control plane CPU",
|
||||||
node_cpu_expr(CONTROL_ALL_REGEX),
|
node_cpu_expr(CONTROL_ALL_REGEX),
|
||||||
{"h": 10, "w": 12, "x": 0, "y": 48},
|
{"h": 10, "w": 12, "x": 0, "y": 44},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_display="table",
|
legend_display="table",
|
||||||
@ -1415,7 +1143,7 @@ def build_overview():
|
|||||||
17,
|
17,
|
||||||
"Control plane RAM",
|
"Control plane RAM",
|
||||||
node_mem_expr(CONTROL_ALL_REGEX),
|
node_mem_expr(CONTROL_ALL_REGEX),
|
||||||
{"h": 10, "w": 12, "x": 12, "y": 48},
|
{"h": 10, "w": 12, "x": 12, "y": 44},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
legend_display="table",
|
legend_display="table",
|
||||||
@ -1428,7 +1156,7 @@ def build_overview():
|
|||||||
28,
|
28,
|
||||||
"Node Pod Share",
|
"Node Pod Share",
|
||||||
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
|
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
|
||||||
{"h": 10, "w": 12, "x": 0, "y": 58},
|
{"h": 10, "w": 12, "x": 0, "y": 54},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -1436,7 +1164,7 @@ def build_overview():
|
|||||||
29,
|
29,
|
||||||
"Top Nodes by Pod Count",
|
"Top Nodes by Pod Count",
|
||||||
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
|
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
|
||||||
{"h": 10, "w": 12, "x": 12, "y": 58},
|
{"h": 10, "w": 12, "x": 12, "y": 54},
|
||||||
unit="none",
|
unit="none",
|
||||||
limit=12,
|
limit=12,
|
||||||
decimals=0,
|
decimals=0,
|
||||||
@@ -1458,7 +1186,7 @@ def build_overview():
             18,
             "Cluster Ingress Throughput",
             NET_INGRESS_EXPR,
-            {"h": 7, "w": 8, "x": 0, "y": 29},
+            {"h": 7, "w": 8, "x": 0, "y": 25},
             unit="Bps",
             legend="Ingress (Traefik)",
             legend_display="list",
@@ -1471,7 +1199,7 @@ def build_overview():
             19,
             "Cluster Egress Throughput",
             NET_EGRESS_EXPR,
-            {"h": 7, "w": 8, "x": 8, "y": 29},
+            {"h": 7, "w": 8, "x": 8, "y": 25},
             unit="Bps",
             legend="Egress (Traefik)",
             legend_display="list",
@@ -1484,7 +1212,7 @@ def build_overview():
             20,
             "Intra-Cluster Throughput",
             NET_INTERNAL_EXPR,
-            {"h": 7, "w": 8, "x": 16, "y": 29},
+            {"h": 7, "w": 8, "x": 16, "y": 25},
             unit="Bps",
             legend="Internal traffic",
             legend_display="list",
@@ -1498,7 +1226,7 @@ def build_overview():
             21,
             "Root Filesystem Usage",
             root_usage_expr(),
-            {"h": 16, "w": 12, "x": 0, "y": 68},
+            {"h": 16, "w": 12, "x": 0, "y": 64},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -1513,7 +1241,7 @@ def build_overview():
             22,
             "Nodes Closest to Full Root Disks",
             f"topk(12, {root_usage_expr()})",
-            {"h": 16, "w": 12, "x": 12, "y": 68},
+            {"h": 16, "w": 12, "x": 12, "y": 64},
             unit="percent",
             thresholds=PERCENT_THRESHOLDS,
             links=link_to("atlas-storage"),
@@ -1999,7 +1727,7 @@ def build_storage_dashboard():
         stat_panel(
             31,
             "Maintenance Cron Freshness (s)",
-            'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})',
+            'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
             {"h": 4, "w": 12, "x": 12, "y": 44},
             unit="s",
             thresholds={
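For reference (not part of the diff): the freshness stat reports seconds since the newest successful run per cronjob, and switching the matcher to the regex form `=~` lets one panel track several jobs at once. A small sketch of the same pattern, with an illustrative helper name:

def cron_freshness_expr(namespace: str, cronjobs: list[str]) -> str:
    # time() minus the most recent success timestamp, one series per cronjob.
    selector = f'namespace="{namespace}",cronjob=~"{"|".join(cronjobs)}"'
    return (
        "time() - max by (cronjob) "
        f"(kube_cronjob_status_last_successful_time{{{selector}}})"
    )

# Reproduces the updated expression from the panel above.
print(cron_freshness_expr("maintenance", ["image-sweeper", "grafana-smtp-sync"]))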
@@ -2408,285 +2136,6 @@ def build_mail_dashboard():
     }
 
 
-def build_jobs_dashboard():
-    panels = []
-    age_thresholds = {
-        "mode": "absolute",
-        "steps": [
-            {"color": "green", "value": None},
-            {"color": "yellow", "value": 6},
-            {"color": "orange", "value": 24},
-            {"color": "red", "value": 48},
-        ],
-    }
-    recent_error_thresholds = {
-        "mode": "absolute",
-        "steps": [
-            {"color": "red", "value": None},
-            {"color": "orange", "value": 1},
-            {"color": "yellow", "value": 6},
-            {"color": "green", "value": 24},
-        ],
-    }
-
-    task_error_thresholds = {
-        "mode": "absolute",
-        "steps": [
-            {"color": "green", "value": None},
-            {"color": "yellow", "value": 1},
-            {"color": "orange", "value": 3},
-            {"color": "red", "value": 5},
-        ],
-    }
-
-    panels.append(
-        bargauge_panel(
-            1,
-            "Ariadne Task Errors (range)",
-            ARIADNE_TASK_ERRORS_RANGE,
-            {"h": 7, "w": 8, "x": 0, "y": 0},
-            unit="none",
-            instant=True,
-            legend="{{task}}",
-            thresholds=task_error_thresholds,
-        )
-    )
-    panels.append(
-        {
-            "id": 2,
-            "type": "timeseries",
-            "title": "Ariadne Attempts / Failures",
-            "datasource": PROM_DS,
-            "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
-            "targets": [
-                {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
-                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
-            ],
-            "fieldConfig": {
-                "defaults": {"unit": "none"},
-                "overrides": [
-                    {
-                        "matcher": {"id": "byName", "options": "Attempts"},
-                        "properties": [
-                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
-                        ],
-                    },
-                    {
-                        "matcher": {"id": "byName", "options": "Failures"},
-                        "properties": [
-                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
-                        ],
-                    },
-                ],
-            },
-            "options": {
-                "legend": {"displayMode": "table", "placement": "right"},
-                "tooltip": {"mode": "multi"},
-            },
-        }
-    )
-    panels.append(
-        bargauge_panel(
-            3,
-            "One-off Job Pods (age hours)",
-            ONEOFF_JOB_POD_AGE_HOURS,
-            {"h": 7, "w": 8, "x": 16, "y": 0},
-            unit="h",
-            instant=True,
-            legend="{{namespace}}/{{pod}}",
-            thresholds=age_thresholds,
-            limit=12,
-            decimals=2,
-        )
-    )
-    panels.append(
-        stat_panel(
-            4,
-            "Glue Jobs Stale (>36h)",
-            GLUE_STALE_COUNT,
-            {"h": 4, "w": 4, "x": 0, "y": 7},
-            unit="none",
-            thresholds={
-                "mode": "absolute",
-                "steps": [
-                    {"color": "green", "value": None},
-                    {"color": "yellow", "value": 1},
-                    {"color": "orange", "value": 2},
-                    {"color": "red", "value": 3},
-                ],
-            },
-        )
-    )
-    panels.append(
-        stat_panel(
-            5,
-            "Glue Jobs Missing Success",
-            GLUE_MISSING_COUNT,
-            {"h": 4, "w": 4, "x": 4, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            6,
-            "Glue Jobs Suspended",
-            GLUE_SUSPENDED_COUNT,
-            {"h": 4, "w": 4, "x": 8, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            7,
-            "Ariadne Task Errors (1h)",
-            ARIADNE_TASK_ERRORS_1H_TOTAL,
-            {"h": 4, "w": 4, "x": 12, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            8,
-            "Ariadne Task Errors (24h)",
-            ARIADNE_TASK_ERRORS_24H_TOTAL,
-            {"h": 4, "w": 4, "x": 16, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            9,
-            "Ariadne Task Runs (1h)",
-            ARIADNE_TASK_RUNS_1H_TOTAL,
-            {"h": 4, "w": 4, "x": 20, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            10,
-            "Ariadne Schedule Last Error (hours ago)",
-            ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
-            {"h": 6, "w": 12, "x": 0, "y": 17},
-            unit="h",
-            instant=True,
-            legend="{{task}}",
-            thresholds=recent_error_thresholds,
-            decimals=2,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            11,
-            "Ariadne Schedule Last Success (hours ago)",
-            ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
-            {"h": 6, "w": 12, "x": 12, "y": 17},
-            unit="h",
-            instant=True,
-            legend="{{task}}",
-            thresholds=age_thresholds,
-            decimals=2,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            12,
-            "Glue Jobs Last Success (hours ago)",
-            GLUE_LAST_SUCCESS_RANGE_HOURS,
-            {"h": 6, "w": 12, "x": 0, "y": 23},
-            unit="h",
-            instant=True,
-            legend="{{namespace}}/{{cronjob}}",
-            thresholds=age_thresholds,
-            decimals=2,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            13,
-            "Glue Jobs Last Schedule (hours ago)",
-            GLUE_LAST_SCHEDULE_RANGE_HOURS,
-            {"h": 6, "w": 12, "x": 12, "y": 23},
-            unit="h",
-            instant=True,
-            legend="{{namespace}}/{{cronjob}}",
-            thresholds=age_thresholds,
-            decimals=2,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            14,
-            "Ariadne Task Errors (1h)",
-            ARIADNE_TASK_ERRORS_1H,
-            {"h": 6, "w": 12, "x": 0, "y": 29},
-            unit="none",
-            instant=True,
-            legend="{{task}}",
-            thresholds=task_error_thresholds,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            15,
-            "Ariadne Task Errors (30d)",
-            ARIADNE_TASK_ERRORS_30D,
-            {"h": 6, "w": 12, "x": 12, "y": 29},
-            unit="none",
-            instant=True,
-            legend="{{task}}",
-            thresholds=task_error_thresholds,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            16,
-            "Ariadne Access Requests",
-            ARIADNE_ACCESS_REQUESTS,
-            {"h": 6, "w": 8, "x": 0, "y": 11},
-            unit="none",
-            instant=True,
-            legend="{{status}}",
-        )
-    )
-    panels.append(
-        stat_panel(
-            17,
-            "Ariadne CI Coverage (%)",
-            ARIADNE_CI_COVERAGE,
-            {"h": 6, "w": 4, "x": 8, "y": 11},
-            unit="percent",
-            decimals=1,
-            instant=True,
-            legend="{{branch}}",
-        )
-    )
-    panels.append(
-        table_panel(
-            18,
-            "Ariadne CI Tests (latest)",
-            ARIADNE_CI_TESTS,
-            {"h": 6, "w": 12, "x": 12, "y": 11},
-            unit="none",
-            transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
-            instant=True,
-        )
-    )
-
-    return {
-        "uid": "atlas-jobs",
-        "title": "Atlas Jobs",
-        "folderUid": PRIVATE_FOLDER,
-        "editable": True,
-        "panels": panels,
-        "time": {"from": "now-7d", "to": "now"},
-        "annotations": {"list": []},
-        "schemaVersion": 39,
-        "style": "dark",
-        "tags": ["atlas", "jobs", "glue"],
-    }
-
-
 def build_gpu_dashboard():
     panels = []
     gpu_scope = "$namespace_scope_gpu"
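The panel helpers called throughout (stat_panel, bargauge_panel, table_panel, timeseries_panel) are defined outside the hunks shown here. Based only on the call sites above, a plausible minimal shape follows; every name and default in this sketch is an assumption, not the repo's actual implementation:

PROM_DS = {"type": "prometheus", "uid": "prometheus"}  # assumed datasource constant

def stat_panel(panel_id, title, expr, grid_pos, *, unit="none", thresholds=None,
               decimals=None, instant=False, legend=None):
    # Builds the JSON dict Grafana expects for a stat panel; the keyword
    # arguments map straight onto fieldConfig defaults, mirroring how the
    # call sites in the deleted function read.
    target = {"expr": expr, "refId": "A", "instant": instant}
    if legend:
        target["legendFormat"] = legend
    defaults = {"unit": unit}
    if thresholds is not None:
        defaults["thresholds"] = thresholds
    if decimals is not None:
        defaults["decimals"] = decimals
    return {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid_pos,
        "targets": [target],
        "fieldConfig": {"defaults": defaults},
    }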
@@ -2697,7 +2146,7 @@ def build_gpu_dashboard():
             namespace_gpu_share_expr(gpu_scope),
             {"h": 8, "w": 12, "x": 0, "y": 0},
             links=namespace_scope_links("namespace_scope_gpu"),
-            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
+            description="Values are normalized within the selected scope; use panel links to switch scope.",
         )
     )
     panels.append(
@@ -2716,7 +2165,7 @@ def build_gpu_dashboard():
         timeseries_panel(
             3,
             "GPU Util by Node",
-            gpu_util_by_hostname(),
+            'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
             {"h": 8, "w": 12, "x": 0, "y": 8},
             unit="percent",
             legend="{{Hostname}}",
@@ -2780,10 +2229,6 @@ DASHBOARDS = {
         "builder": build_mail_dashboard,
         "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
     },
-    "atlas-jobs": {
-        "builder": build_jobs_dashboard,
-        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
-    },
     "atlas-gpu": {
         "builder": build_gpu_dashboard,
         "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",

@@ -20,13 +20,11 @@ import subprocess
 import sys
 from dataclasses import dataclass
 from pathlib import Path
-import shutil
 from typing import Any, Iterable
 
 import yaml
 
 REPO_ROOT = Path(__file__).resolve().parents[1]
-DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards"
 
 CLUSTER_SCOPED_KINDS = {
     "Namespace",
@@ -62,70 +60,6 @@ def _run(cmd: list[str], *, cwd: Path) -> str:
     return res.stdout
 
 
-def _sync_tree(source: Path, dest: Path) -> None:
-    if dest.exists():
-        shutil.rmtree(dest)
-    shutil.copytree(source, dest)
-
-
-def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]:
-    panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else []
-    for panel in panels:
-        if not isinstance(panel, dict):
-            continue
-        if panel.get("type") == "row" and isinstance(panel.get("panels"), list):
-            yield from _iter_dashboard_panels({"panels": panel.get("panels")})
-            continue
-        yield panel
-
-
-def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]:
-    index: list[dict[str, Any]] = []
-    for path in sorted(dashboard_dir.glob("*.json")):
-        try:
-            data = json.loads(path.read_text(encoding="utf-8"))
-        except json.JSONDecodeError:
-            continue
-        if not isinstance(data, dict):
-            continue
-        dash_title = data.get("title") or path.stem
-        dash_tags = data.get("tags") or []
-        for panel in _iter_dashboard_panels(data):
-            targets = panel.get("targets")
-            if not isinstance(targets, list):
-                continue
-            exprs: list[str] = []
-            for target in targets:
-                if not isinstance(target, dict):
-                    continue
-                expr = target.get("expr")
-                if isinstance(expr, str) and expr.strip():
-                    exprs.append(expr.strip())
-            if not exprs:
-                continue
-            datasource = panel.get("datasource") or {}
-            if isinstance(datasource, dict):
-                ds_uid = datasource.get("uid")
-                ds_type = datasource.get("type")
-            else:
-                ds_uid = None
-                ds_type = None
-            index.append(
-                {
-                    "dashboard": dash_title,
-                    "panel_title": panel.get("title") or "",
-                    "panel_id": panel.get("id"),
-                    "panel_type": panel.get("type"),
-                    "description": panel.get("description") or "",
-                    "tags": dash_tags,
-                    "datasource_uid": ds_uid,
-                    "datasource_type": ds_type,
-                    "exprs": exprs,
-                }
-            )
-    return index
-
-
 def kustomize_build(path: Path) -> str:
     rel = path.relative_to(REPO_ROOT)
     try:
@@ -538,11 +472,6 @@ def main() -> int:
         action="store_true",
         help="Write generated files (otherwise just print a summary).",
     )
-    ap.add_argument(
-        "--sync-comms",
-        action="store_true",
-        help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
-    )
     args = ap.parse_args()
 
     out_dir = REPO_ROOT / args.out
@@ -575,11 +504,8 @@ def main() -> int:
     summary_path = out_dir / "catalog" / "atlas-summary.json"
     diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
     runbooks_json_path = out_dir / "catalog" / "runbooks.json"
-    metrics_json_path = out_dir / "catalog" / "metrics.json"
 
-    catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix()
     catalog_path.write_text(
-        f"# {catalog_rel}\n"
         "# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)\n"
         + yaml.safe_dump(catalog, sort_keys=False),
         encoding="utf-8",
@@ -589,14 +515,9 @@ def main() -> int:
     diagram_path.write_text(diagram, encoding="utf-8")
 
     # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
-    runbook_dirs = [
-        out_dir / "runbooks",
-        out_dir / "software",
-    ]
+    runbooks_dir = out_dir / "runbooks"
     runbooks: list[dict[str, Any]] = []
-    for runbooks_dir in runbook_dirs:
-        if not runbooks_dir.exists():
-            continue
+    if runbooks_dir.exists():
         for md_file in sorted(runbooks_dir.glob("*.md")):
             raw = md_file.read_text(encoding="utf-8")
             fm: dict[str, Any] = {}
@@ -620,22 +541,12 @@ def main() -> int:
                 }
             )
     runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")
-    metrics_index = _extract_metrics_index(DASHBOARD_DIR)
-    metrics_json_path.write_text(
-        json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8"
-    )
 
     print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
-    print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
 
-    if args.sync_comms:
-        comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
-        _sync_tree(out_dir, comms_dir)
-        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
     return 0
 
 
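The runbook loop above reads each Markdown file into `raw` and then fills `fm` from front matter; the parsing itself sits outside this hunk. A self-contained sketch of the usual split, assuming the conventional `---`-delimited YAML header (function name and sample data are illustrative):

from typing import Any

import yaml

def split_front_matter(raw: str) -> tuple[dict[str, Any], str]:
    # "---\nkey: value\n---\nbody" -> ({"key": "value"}, "body")
    if raw.startswith("---\n"):
        head, sep, body = raw[4:].partition("\n---\n")
        if sep:
            return (yaml.safe_load(head) or {}, body)
    return ({}, raw)

fm, body = split_front_matter("---\ntitle: Restore Postgres\n---\nSteps...")
assert fm["title"] == "Restore Postgres"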

@@ -7,8 +7,6 @@ test accounts created via the bstein-dev-home onboarding portal.
 Targets (best-effort):
 - Keycloak users in realm "atlas"
 - Atlas portal Postgres rows (access_requests + dependent tables)
-- Mailu mailboxes created for test users
-- Nextcloud Mail accounts created for test users
 - Vaultwarden users/invites created by the portal
 
 Safety:
@@ -58,19 +56,6 @@ class VaultwardenUser:
     status: int
 
 
-@dataclass(frozen=True)
-class MailuUser:
-    email: str
-    localpart: str
-    domain: str
-
-
-@dataclass(frozen=True)
-class NextcloudMailAccount:
-    account_id: str
-    email: str
-
-
 def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
     proc = subprocess.run(
         cmd,
@@ -85,19 +70,6 @@ def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
     return proc.stdout.decode("utf-8", errors="replace")
 
 
-def _run_capture(cmd: list[str], *, input_bytes: bytes | None = None) -> tuple[int, str, str]:
-    proc = subprocess.run(
-        cmd,
-        input=input_bytes,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        check=False,
-    )
-    stdout = proc.stdout.decode("utf-8", errors="replace")
-    stderr = proc.stderr.decode("utf-8", errors="replace")
-    return proc.returncode, stdout, stderr
-
-
 def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str:
     raw_b64 = _run(
         [
@@ -138,21 +110,6 @@ def _kubectl_first_pod(namespace: str) -> str:
     return pod_name
 
 
-def _kubectl_exec(namespace: str, target: str, cmd: list[str]) -> tuple[int, str, str]:
-    return _run_capture(
-        [
-            "kubectl",
-            "-n",
-            namespace,
-            "exec",
-            "-i",
-            target,
-            "--",
-            *cmd,
-        ]
-    )
-
-
 def _validate_prefixes(prefixes: list[str]) -> list[str]:
     cleaned: list[str] = []
     for prefix in prefixes:
@@ -230,62 +187,6 @@ def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) ->
         raise
 
 
-def _sql_quote(value: str) -> str:
-    return "'" + value.replace("'", "''") + "'"
-
-
-def _psql_exec(db_name: str, sql: str, *, user: str = "postgres") -> str:
-    postgres_pod = _kubectl_first_pod("postgres")
-    return _run(
-        [
-            "kubectl",
-            "-n",
-            "postgres",
-            "exec",
-            "-i",
-            postgres_pod,
-            "--",
-            "psql",
-            "-U",
-            user,
-            "-d",
-            db_name,
-            "-c",
-            sql,
-        ]
-    )
-
-
-def _psql_tsv(db_name: str, sql: str, *, user: str = "postgres") -> list[list[str]]:
-    postgres_pod = _kubectl_first_pod("postgres")
-    out = _run(
-        [
-            "kubectl",
-            "-n",
-            "postgres",
-            "exec",
-            "-i",
-            postgres_pod,
-            "--",
-            "psql",
-            "-U",
-            user,
-            "-d",
-            db_name,
-            "-At",
-            "-F",
-            "\t",
-            "-c",
-            sql,
-        ]
-    )
-    rows: list[list[str]] = []
-    for line in out.splitlines():
-        parts = line.split("\t")
-        rows.append(parts)
-    return rows
-
-
 def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]:
     postgres_pod = _kubectl_first_pod("postgres")
     out = _run(
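Worth noting about the removed `_psql_tsv` (not part of the diff): `psql -At -F '\t'` is what makes the output machine-parseable -- `-A` turns off aligned padding, `-t` drops headers and row-count footers, and `-F` sets the field separator, so every row comes back as one tab-delimited line. A standalone sketch of the same idea without the kubectl wrapper (the database and user here are illustrative):

import subprocess

def psql_rows(db_name: str, sql: str) -> list[list[str]]:
    # -At -F '\t': unaligned, tuples-only, tab-separated -- plain TSV on stdout.
    out = subprocess.run(
        ["psql", "-U", "postgres", "-d", db_name, "-At", "-F", "\t", "-c", sql],
        stdout=subprocess.PIPE,
        check=True,
    ).stdout.decode("utf-8")
    return [line.split("\t") for line in out.splitlines()]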
@@ -355,89 +256,6 @@ def _portal_delete_requests(portal_db_url: str, prefixes: list[str]) -> int:
     return int(match.group(1)) if match else 0
 
 
-def _mailu_list_users(prefixes: list[str], domain: str, db_name: str, protected: set[str]) -> list[MailuUser]:
-    if not prefixes or not domain:
-        return []
-    clauses = " OR ".join([f"localpart LIKE '{p}%'" for p in prefixes])
-    sql = (
-        'SELECT email, localpart, domain_name '
-        'FROM "user" '
-        f"WHERE domain_name = {_sql_quote(domain)} AND ({clauses}) "
-        "ORDER BY email;"
-    )
-    rows = _psql_tsv(db_name, sql)
-    users: list[MailuUser] = []
-    for row in rows:
-        if len(row) < 3:
-            continue
-        email = row[0].strip()
-        if not email or email in protected:
-            continue
-        users.append(MailuUser(email=email, localpart=row[1].strip(), domain=row[2].strip()))
-    return users
-
-
-def _mailu_delete_users(db_name: str, emails: list[str]) -> int:
-    if not emails:
-        return 0
-    email_list = ",".join(_sql_quote(e) for e in emails)
-    sql = f'DELETE FROM "user" WHERE email IN ({email_list});'
-    out = _psql_exec(db_name, sql)
-    match = re.search(r"DELETE\s+(\d+)", out)
-    return int(match.group(1)) if match else 0
-
-
-_NEXTCLOUD_ACCOUNT_RE = re.compile(r"^Account\s+(\d+):")
-_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+")
-
-
-def _nextcloud_exec(cmd: list[str]) -> tuple[int, str, str]:
-    namespace = os.getenv("NEXTCLOUD_NAMESPACE", "nextcloud").strip() or "nextcloud"
-    target = os.getenv("NEXTCLOUD_EXEC_TARGET", "deploy/nextcloud").strip() or "deploy/nextcloud"
-    return _kubectl_exec(namespace, target, cmd)
-
-
-def _parse_nextcloud_mail_accounts(export_output: str) -> list[NextcloudMailAccount]:
-    accounts: list[NextcloudMailAccount] = []
-    current_id = ""
-    for line in export_output.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        match = _NEXTCLOUD_ACCOUNT_RE.match(line)
-        if match:
-            current_id = match.group(1)
-            continue
-        if not current_id or "@" not in line:
-            continue
-        email_match = _EMAIL_RE.search(line)
-        if not email_match:
-            continue
-        accounts.append(NextcloudMailAccount(account_id=current_id, email=email_match.group(0)))
-        current_id = ""
-    return accounts
-
-
-def _nextcloud_list_mail_accounts(username: str) -> list[NextcloudMailAccount]:
-    occ_path = os.getenv("NEXTCLOUD_OCC_PATH", "/var/www/html/occ").strip() or "/var/www/html/occ"
-    rc, out, err = _nextcloud_exec(["php", occ_path, "mail:account:export", username])
-    if rc != 0:
-        message = (err or out).strip()
-        lowered = message.lower()
-        if any(token in lowered for token in ("not found", "does not exist", "no such user", "unknown user")):
-            return []
-        raise RuntimeError(f"nextcloud mail export failed for {username}: {message}")
-    return _parse_nextcloud_mail_accounts(out)
-
-
-def _nextcloud_delete_mail_account(account_id: str) -> None:
-    occ_path = os.getenv("NEXTCLOUD_OCC_PATH", "/var/www/html/occ").strip() or "/var/www/html/occ"
-    rc, out, err = _nextcloud_exec(["php", occ_path, "mail:account:delete", "-q", account_id])
-    if rc != 0:
-        message = (err or out).strip()
-        raise RuntimeError(f"nextcloud mail delete failed for account {account_id}: {message}")
-
-
 def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str:
     data = urllib.parse.urlencode({"token": admin_token}).encode("utf-8")
     req = urllib.request.Request(f"{base_url}/admin", data=data, method="POST")
@@ -538,8 +356,6 @@ def main() -> int:
         ),
     )
     parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.")
-    parser.add_argument("--skip-mailu", action="store_true", help="Skip Mailu mailbox cleanup.")
-    parser.add_argument("--skip-nextcloud-mail", action="store_true", help="Skip Nextcloud Mail account cleanup.")
     parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.")
     parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.")
     parser.add_argument(
@@ -548,18 +364,6 @@ def main() -> int:
         default=[],
         help="Keycloak usernames that must never be deleted (repeatable).",
     )
-    parser.add_argument(
-        "--protect-mailu-email",
-        action="append",
-        default=[],
-        help="Mailu emails that must never be deleted (repeatable).",
-    )
-    parser.add_argument(
-        "--protect-nextcloud-username",
-        action="append",
-        default=[],
-        help="Nextcloud usernames that must never be touched (repeatable).",
-    )
     parser.add_argument(
         "--protect-vaultwarden-email",
         action="append",
@@ -572,11 +376,7 @@ def main() -> int:
     apply = bool(args.apply)
     expected_confirm = ",".join(prefixes)
     protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]}
-    protected_mailu = {e.strip() for e in args.protect_mailu_email if e.strip()}
-    protected_nextcloud = {u.strip() for u in args.protect_nextcloud_username if u.strip()}
     protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()}
-    mailu_domain = os.getenv("MAILU_DOMAIN", "bstein.dev").strip() or "bstein.dev"
-    mailu_db_name = os.getenv("MAILU_DB_NAME", "mailu").strip() or "mailu"
 
     if apply and args.confirm != expected_confirm:
         raise SystemExit(
@@ -588,29 +388,23 @@ def main() -> int:
     print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)")
     if protected_keycloak:
         print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak)))
-    if protected_mailu:
-        print("protected mailu emails:", ", ".join(sorted(protected_mailu)))
-    if protected_nextcloud:
-        print("protected nextcloud usernames:", ", ".join(sorted(protected_nextcloud)))
     if protected_vaultwarden:
         print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden)))
     print()
 
-    portal_requests: list[PortalRequestRow] = []
     if not args.skip_portal_db:
         portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL")
-        portal_requests = _portal_list_requests(portal_db_url, prefixes)
-        print(f"Portal DB: {len(portal_requests)} access_requests matched")
-        for row in portal_requests[:50]:
+        requests = _portal_list_requests(portal_db_url, prefixes)
+        print(f"Portal DB: {len(requests)} access_requests matched")
+        for row in requests[:50]:
             print(f" {row.request_code}\t{row.status}\t{row.username}")
-        if len(portal_requests) > 50:
-            print(f" ... and {len(portal_requests) - 50} more")
-        if apply and portal_requests:
+        if len(requests) > 50:
+            print(f" ... and {len(requests) - 50} more")
+        if apply and requests:
             deleted = _portal_delete_requests(portal_db_url, prefixes)
             print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).")
     print()
 
-    keycloak_users: list[KeycloakUser] = []
     if not args.skip_keycloak:
         kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/")
         kc_realm = os.getenv("KEYCLOAK_REALM", "atlas")
@@ -627,63 +421,18 @@ def main() -> int:
             if user.username in protected_keycloak:
                 continue
             found[user.user_id] = user
-        keycloak_users = list(found.values())
-        keycloak_users.sort(key=lambda u: u.username)
-        print(f"Keycloak: {len(keycloak_users)} users matched")
-        for user in keycloak_users[:50]:
+        users = list(found.values())
+        users.sort(key=lambda u: u.username)
+        print(f"Keycloak: {len(users)} users matched")
+        for user in users[:50]:
             email = user.email or "-"
             print(f" {user.username}\t{email}\t{user.user_id}")
-        if len(keycloak_users) > 50:
-            print(f" ... and {len(keycloak_users) - 50} more")
-        if apply and keycloak_users:
-            for user in keycloak_users:
+        if len(users) > 50:
+            print(f" ... and {len(users) - 50} more")
+        if apply and users:
+            for user in users:
                 _keycloak_delete_user(kc_server, kc_realm, token, user.user_id)
-            print(f"Keycloak: deleted {len(keycloak_users)} users.")
-        print()
-
-    if not args.skip_mailu:
-        mailu_users = _mailu_list_users(prefixes, mailu_domain, mailu_db_name, protected_mailu)
-        print(f"Mailu: {len(mailu_users)} mailboxes matched (domain={mailu_domain})")
-        for user in mailu_users[:50]:
-            print(f" {user.email}\t{user.localpart}\t{user.domain}")
-        if len(mailu_users) > 50:
-            print(f" ... and {len(mailu_users) - 50} more")
-        if apply and mailu_users:
-            deleted = _mailu_delete_users(mailu_db_name, [u.email for u in mailu_users])
-            print(f"Mailu: deleted {deleted} mailboxes.")
-        print()
-
-    if not args.skip_nextcloud_mail:
-        nextcloud_usernames = {row.username for row in portal_requests if row.username}
-        nextcloud_usernames.update({u.username for u in keycloak_users if u.username})
-        nextcloud_usernames = {u for u in nextcloud_usernames if _starts_with_any(u, prefixes)}
-        nextcloud_usernames = {u for u in nextcloud_usernames if u not in protected_nextcloud}
-
-        matches: list[tuple[str, NextcloudMailAccount]] = []
-        for username in sorted(nextcloud_usernames):
-            accounts = _nextcloud_list_mail_accounts(username)
-            for account in accounts:
-                email = account.email.strip()
-                if not email:
-                    continue
-                if not email.lower().endswith(f"@{mailu_domain.lower()}"):
-                    continue
-                localpart = email.split("@", 1)[0]
-                if not _starts_with_any(localpart, prefixes):
-                    continue
-                if email in protected_mailu:
-                    continue
-                matches.append((username, account))
-
-        print(f"Nextcloud Mail: {len(matches)} accounts matched")
-        for username, account in matches[:50]:
-            print(f" {username}\t{account.account_id}\t{account.email}")
-        if len(matches) > 50:
-            print(f" ... and {len(matches) - 50} more")
-        if apply and matches:
-            for _, account in matches:
-                _nextcloud_delete_mail_account(account.account_id)
-            print(f"Nextcloud Mail: deleted {len(matches)} accounts.")
+            print(f"Keycloak: deleted {len(users)} users.")
         print()
 
     if not args.skip_vaultwarden:

@@ -55,11 +55,11 @@ class _FakeResponse:
 
 
 class _FakeSession:
-    def __init__(self, put_resp, get_resps):
+    def __init__(self, put_resp, get_resp):
         self.put_resp = put_resp
-        self.get_resps = list(get_resps)
+        self.get_resp = get_resp
         self.put_called = False
-        self.get_calls = 0
+        self.get_called = False
 
     def post(self, *args, **kwargs):
         return _FakeResponse({"access_token": "dummy"})
@@ -69,26 +69,22 @@ class _FakeSession:
         return self.put_resp
 
     def get(self, *args, **kwargs):
-        self.get_calls += 1
-        if self.get_resps:
-            return self.get_resps.pop(0)
-        return _FakeResponse({})
+        self.get_called = True
+        return self.get_resp
 
 
 def test_kc_update_attributes_succeeds(monkeypatch):
     sync = load_sync_module(monkeypatch)
-    current_resp = _FakeResponse({"attributes": {}})
     ok_resp = _FakeResponse({"attributes": {"mailu_app_password": ["abc"]}})
-    sync.SESSION = _FakeSession(_FakeResponse({}), [current_resp, ok_resp])
+    sync.SESSION = _FakeSession(_FakeResponse({}), ok_resp)
     sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
-    assert sync.SESSION.put_called and sync.SESSION.get_calls == 2
+    assert sync.SESSION.put_called and sync.SESSION.get_called
 
 
 def test_kc_update_attributes_raises_without_attribute(monkeypatch):
     sync = load_sync_module(monkeypatch)
-    current_resp = _FakeResponse({"attributes": {}})
     missing_attr_resp = _FakeResponse({"attributes": {}}, status=200)
-    sync.SESSION = _FakeSession(_FakeResponse({}), [current_resp, missing_attr_resp])
+    sync.SESSION = _FakeSession(_FakeResponse({}), missing_attr_resp)
     with pytest.raises(Exception):
         sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
 
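`_FakeResponse` itself is defined above this hunk and not shown; judging by how the tests construct it (a payload plus an optional `status=`), a minimal stand-in would look roughly like the sketch below -- an assumption, not the repo's actual class:

class _FakeResponse:
    def __init__(self, payload, status=200):
        self._payload = payload
        self.status_code = status

    def json(self):
        return self._payload

    def raise_for_status(self):
        # Mimics requests.Response: raise on 4xx/5xx, do nothing otherwise.
        if self.status_code >= 400:
            raise RuntimeError(f"HTTP {self.status_code}")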
@@ -148,25 +144,9 @@ def test_main_generates_password_and_upserts(monkeypatch):
     sync = load_sync_module(monkeypatch)
     monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
     users = [
-        {
-            "id": "u1",
-            "username": "user1",
-            "email": "user1@example.com",
-            "attributes": {"mailu_enabled": ["true"]},
-        },
-        {
-            "id": "u2",
-            "username": "user2",
-            "email": "user2@example.com",
-            "attributes": {"mailu_app_password": ["keepme"], "mailu_enabled": ["true"]},
-        },
-        {
-            "id": "u3",
-            "username": "user3",
-            "email": "user3@example.com",
-            "attributes": {"mailu_email": ["user3@example.com"]},
-        },
-        {"id": "u4", "username": "user4", "email": "user4@other.com", "attributes": {}},
+        {"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
+        {"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
+        {"id": "u3", "username": "user3", "email": "user3@other.com", "attributes": {}},
     ]
     updated = []
 
@@ -205,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch):
 
     sync.main()
 
-    # Only mail-enabled users (or legacy users with a mailbox) are synced and backfilled.
+    # Always backfill mailu_email, even if Keycloak recovery email is external.
     assert len(updated) == 3
     assert conns and len(conns[0]._cursor.executions) == 3

@@ -20,9 +20,8 @@ spec:
     labels:
       app: ollama
     annotations:
-      ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
-      ai.bstein.dev/gpu: GPU pool (titan-22/24)
-      ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
+      ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
+      ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
   spec:
     affinity:
       nodeAffinity:
@@ -32,6 +31,8 @@ spec:
             - key: kubernetes.io/hostname
               operator: In
               values:
+                - titan-20
+                - titan-21
                 - titan-22
                 - titan-24
     runtimeClassName: nvidia
@@ -41,7 +42,7 @@ spec:
           claimName: ollama-models
     initContainers:
       - name: warm-model
-        image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
+        image: ollama/ollama:latest
        env:
          - name: OLLAMA_HOST
            value: 0.0.0.0
@@ -52,7 +53,7 @@ spec:
          - name: OLLAMA_MODELS
            value: /root/.ollama
          - name: OLLAMA_MODEL
-           value: qwen2.5:14b-instruct-q4_0
+           value: qwen2.5-coder:7b-instruct-q4_0
        command:
          - /bin/sh
          - -c
@@ -67,14 +68,14 @@ spec:
            mountPath: /root/.ollama
        resources:
          requests:
-           cpu: 500m
-           memory: 2Gi
+           cpu: 250m
+           memory: 1Gi
            nvidia.com/gpu.shared: 1
          limits:
            nvidia.com/gpu.shared: 1
    containers:
      - name: ollama
-       image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
+       image: ollama/ollama:latest
        imagePullPolicy: IfNotPresent
        ports:
          - name: http
@@ -95,10 +96,10 @@ spec:
            mountPath: /root/.ollama
        resources:
          requests:
-           cpu: "4"
-           memory: 16Gi
+           cpu: "2"
+           memory: 8Gi
            nvidia.com/gpu.shared: 1
          limits:
-           cpu: "8"
-           memory: 24Gi
+           cpu: "4"
+           memory: 12Gi
            nvidia.com/gpu.shared: 1
Some files were not shown because too many files have changed in this diff.