Compare commits
414 Commits
main
...
feature/at
| Author | SHA1 | Date | |
|---|---|---|---|
| 6e4cafa3df | |||
| 41021c472b | |||
| 17afb0bb55 | |||
| 1e0e73a28f | |||
| af01a620c3 | |||
| 0edc513e2e | |||
| 3659c9c07b | |||
| 11d58dccb7 | |||
| 5bcff5f405 | |||
| f5dcea860e | |||
| a1e90f4600 | |||
| f04f032721 | |||
| 083999c84c | |||
| dc62a84e2e | |||
| 31ffaedf2a | |||
| b2d1dc4e3f | |||
|
|
271a941d89 | ||
|
|
fa30a2cade | ||
| f71d0bc3f3 | |||
|
|
19a3207eac | ||
| 2d5107f7e2 | |||
| a091ea75a3 | |||
| 95dabf5df8 | |||
|
|
311cec8adf | ||
|
|
b18e355412 | ||
|
|
80057210fc | ||
|
|
7a1e99a95e | ||
|
|
ace86ad736 | ||
|
|
2a4deb6dd1 | ||
|
|
eee5456921 | ||
| f86d3a4c00 | |||
| a6b77c68f0 | |||
| 9599b4c975 | |||
| df96c06fa2 | |||
| e575e6cb1e | |||
|
|
bca66c5d71 | ||
| b2affe091d | |||
|
|
6c7f2112c2 | ||
| a4874163ec | |||
| 079f8efbb9 | |||
| 95228b75ab | |||
| 9e75bf0b42 | |||
| b2841985ef | |||
| 9553995ba5 | |||
| e840777668 | |||
| 718a1ca312 | |||
| 55f0347b70 | |||
| f77e13b2cb | |||
| fd2b10d00d | |||
| 4209299a40 | |||
| 1804ff06c6 | |||
| 4b5913827d | |||
| 80548a2e82 | |||
|
|
29756b1e62 | ||
| 4bc91c40f6 | |||
| 1260d18cdf | |||
| 47efd0be06 | |||
|
|
fa410c8f1e | ||
| 0ed75718c2 | |||
| 50ff59a33b | |||
|
|
9d9bcd1988 | ||
|
|
c96749bab6 | ||
| 5e239accbd | |||
|
|
c50298c8fe | ||
|
|
3fcab34b7d | ||
| e223ef8e76 | |||
| 7f72683242 | |||
| eeb8475848 | |||
| 839b79696c | |||
| 920f146efb | |||
|
|
c2c5474bc8 | ||
|
|
eab7ed5cff | ||
|
|
22eb1a1159 | ||
| d7c1ecd098 | |||
|
|
96288c9fdd | ||
|
|
a71bf7d9d5 | ||
| 533baa6d0c | |||
|
|
cee353e305 | ||
|
|
436d24ea70 | ||
|
|
6fb80e37e8 | ||
|
|
132e73100f | ||
|
|
fe8cc40903 | ||
|
|
947a43e630 | ||
|
|
31679b59f5 | ||
|
|
77b81e1e9a | ||
|
|
6523e45b3f | ||
|
|
49414c6cca | ||
|
|
6efa280e9d | ||
|
|
ff81cfdb82 | ||
|
|
c4b0250321 | ||
|
|
c1a8aa43d6 | ||
|
|
0275adb5b7 | ||
|
|
663143660b | ||
|
|
cb25cf7571 | ||
|
|
33127dde26 | ||
|
|
dc214cee79 | ||
|
|
4395986b0c | ||
|
|
fba7fe9029 | ||
|
|
8ecc8dd548 | ||
|
|
672a559e52 | ||
|
|
0dedf4083e | ||
|
|
bf8b99e365 | ||
|
|
a33ad1c073 | ||
|
|
be90638fac | ||
|
|
3bc6d29f54 | ||
|
|
4e88c55e57 | ||
|
|
b8c94d5870 | ||
|
|
7f83d2f936 | ||
|
|
d42aa42d8a | ||
|
|
86f512fa1a | ||
|
|
16e2b19ea9 | ||
|
|
a1cb07c6d6 | ||
|
|
558d24ad6b | ||
|
|
160218a4ae | ||
|
|
2e361e620e | ||
|
|
fcd0ea9872 | ||
|
|
75826b0e5e | ||
|
|
71ddd03899 | ||
|
|
2d3a0b0184 | ||
|
|
c7fb848a62 | ||
|
|
c643c965b8 | ||
|
|
618be5ce01 | ||
|
|
ac049e6bb9 | ||
|
|
50108afc57 | ||
|
|
1f74a29445 | ||
|
|
08bc5f4b82 | ||
|
|
c208314506 | ||
|
|
763e5ff9e9 | ||
|
|
5ecb42cfef | ||
|
|
102d8e56ff | ||
|
|
ac96c5482f | ||
|
|
71aa60c696 | ||
|
|
d7582da21b | ||
|
|
4bf3773eb3 | ||
|
|
895ea49dc5 | ||
|
|
f355f6dd6a | ||
| 9f87e61f4a | |||
|
|
9a2890c45c | ||
|
|
ad74a45e76 | ||
| fda4860d67 | |||
| 9f8a0f94d2 | |||
| 51d12791ca | |||
| 9fb36f23cd | |||
|
|
1a2fe05808 | ||
|
|
0c5ec895ee | ||
| 7c87e177e9 | |||
|
|
5e6d2a938f | ||
|
|
09070c2cc6 | ||
|
|
5dd30d8802 | ||
|
|
f302cb2448 | ||
| c0a231fd91 | |||
|
|
87f8a6d2c0 | ||
|
|
78a0867215 | ||
| b0da9080c7 | |||
| 8e3feeeaac | |||
| 6f2ecdb364 | |||
| a5e168e55f | |||
|
|
87dc1209b1 | ||
| f86845053e | |||
|
|
c04c5ab048 | ||
|
|
ec3bdb7225 | ||
|
|
4b68809bb9 | ||
|
|
661bc6ac7d | ||
| a9ee943344 | |||
| 826df7d960 | |||
|
|
8dfe124212 | ||
|
|
a3bef857f9 | ||
|
|
ed766d7a02 | ||
| 4295913056 | |||
|
|
e3dfa2c0ea | ||
|
|
6bf8181677 | ||
| d67f3d6fca | |||
|
|
41a0363fbc | ||
| a609e230f2 | |||
|
|
37342bfe4a | ||
| a509354067 | |||
|
|
fb14516674 | ||
| 60c80cc86f | |||
|
|
7b8ea36554 | ||
| 49224375a0 | |||
| 7d7ddd52dc | |||
| cd7043c7f1 | |||
| fb82a038e9 | |||
| 93bcea5893 | |||
| 0ba8578416 | |||
| 86475b8bdf | |||
| f19eaf3b6b | |||
|
|
e537180f1f | ||
|
|
8298ed5c16 | ||
|
|
152a28bd09 | ||
| 7e02cccbe8 | |||
|
|
e60b1594c0 | ||
|
|
87b2b37918 | ||
|
|
a1249b3e00 | ||
| 5000d1f76b | |||
|
|
584625b893 | ||
| 95f4ecc4e0 | |||
| 240e04f9a2 | |||
| 449b8fed64 | |||
|
|
f6d655bb0c | ||
| 4fa1b6e84c | |||
| 168efd78f7 | |||
| e0bd11fa57 | |||
| 3f43299c92 | |||
| 645790f404 | |||
| f11f6a4e62 | |||
|
|
c559253a31 | ||
|
|
a3619ce215 | ||
|
|
398fb7b797 | ||
| b30e6af95d | |||
|
|
4fd79b4708 | ||
| f23da3aea5 | |||
|
|
d951ae5061 | ||
| dfe9916e91 | |||
|
|
036c758547 | ||
| 382a6e49ee | |||
| 93e7449509 | |||
| 58d1c168ff | |||
|
|
889400cdbf | ||
|
|
e06066a327 | ||
| 138f8c4407 | |||
| 33569aff99 | |||
| 3e2f56da7d | |||
|
|
0914ba3509 | ||
|
|
865a979424 | ||
|
|
5dfc3ed259 | ||
| b479364017 | |||
|
|
00d8f852a3 | ||
|
|
2d7f744284 | ||
| 5f1b1a6cd0 | |||
|
|
e966961dbe | ||
| 7ffb0aba5d | |||
|
|
e80a439725 | ||
|
|
8a22825796 | ||
| 1fabd4ce2f | |||
| 759ac5ef90 | |||
|
|
bc971cce92 | ||
|
|
069f6b4983 | ||
| 64cfd5180d | |||
|
|
8a087fb16d | ||
|
|
652c3a28a3 | ||
|
|
141c54ccf3 | ||
|
|
0f8529c7c5 | ||
|
|
dafba36768 | ||
| 4d5e9552e3 | |||
| ddf1d41fd3 | |||
|
|
49e630f7fd | ||
|
|
b7a81d28d1 | ||
| 109c00bc3c | |||
|
|
c9ad055b4c | ||
| 10498c659b | |||
|
|
978bd8e595 | ||
| 259552ac28 | |||
|
|
7f2ded5244 | ||
| e4c370b983 | |||
|
|
7dfc98b6d6 | ||
| cb60c64bce | |||
|
|
091f095893 | ||
| 5b389d12df | |||
|
|
ae88bc8484 | ||
| 529576e082 | |||
|
|
a7ffaa3213 | ||
|
|
e478f1c74d | ||
| 2480b6cecc | |||
| bbe27f963d | |||
|
|
c5da854cef | ||
| 0319707fff | |||
| 4f8d8f1f25 | |||
| 5448ff3f55 | |||
| b6c2d1416e | |||
|
|
152e1d88f4 | ||
| 86e9dc289f | |||
|
|
c4b7198c46 | ||
| f8a12be2ec | |||
|
|
c9ec5126cd | ||
|
|
c66db7c18f | ||
|
|
de47ab76a5 | ||
| c788512d59 | |||
|
|
ae25ccb6f2 | ||
|
|
e27f4cfc68 | ||
| 50e06b4a13 | |||
| 934d6e7a3b | |||
|
|
25654a731e | ||
| 4aecadb3de | |||
| 3b79a82c71 | |||
|
|
04b263dc2d | ||
| 93841d9de7 | |||
| bb294c6d21 | |||
|
|
64962f8863 | ||
| bcb4c05b14 | |||
|
|
d00a09fb58 | ||
|
|
a22ff047f7 | ||
|
|
fef5d7d26a | ||
| fa60fa124c | |||
| 30c1192978 | |||
| 644be2c575 | |||
| 29d1bf9f4e | |||
| 9bdab331b6 | |||
| 8f49ac2d63 | |||
|
|
43b9cd27ed | ||
| 580ac4950b | |||
|
|
d677e83423 | ||
|
|
bff55a6dc7 | ||
|
|
0465658ba7 | ||
|
|
3e484ba726 | ||
|
|
088bb3b435 | ||
|
|
e81bad9d47 | ||
| 3f11a065a3 | |||
|
|
ec6375f31d | ||
|
|
5a8360ed97 | ||
|
|
9e75f82d43 | ||
|
|
7ac26eb0dd | ||
| 00d2f6a61f | |||
|
|
687ca2c22d | ||
| 52281ca2ec | |||
|
|
8850e9fdf1 | ||
| a253993451 | |||
|
|
aeff2bbe73 | ||
| 39616b2435 | |||
|
|
b3d8674499 | ||
| 3ca0fb352d | |||
| f7ea7d57e9 | |||
|
|
a418844f61 | ||
| 96d914d02c | |||
| e6c031829a | |||
| ebfb19c34e | |||
| 4fedec3999 | |||
| 55f78f2eb7 | |||
| ab5ef933d8 | |||
| 3e23109229 | |||
| d18c06ad31 | |||
| 292a6b7e04 | |||
|
|
d7fd5682f3 | ||
| bedab04b22 | |||
| 6d7a32ce11 | |||
| 87ded58aca | |||
|
|
5f30ab73bf | ||
|
|
3f2d2e5fdb | ||
|
|
f55e9a6043 | ||
|
|
7de15db57a | ||
|
|
265f809f8f | ||
|
|
e4d19fc5b4 | ||
|
|
d10eace338 | ||
| 78afc97db2 | |||
|
|
3c0d4d0f4f | ||
|
|
d73d6d7c01 | ||
|
|
af02ee7abf | ||
| 630a596cb6 | |||
|
|
d2729138b6 | ||
| a6fbcc8669 | |||
|
|
d91d632496 | ||
|
|
3a9949a24d | ||
| b045506516 | |||
|
|
3f24de03d1 | ||
|
|
a3ffcb2ea1 | ||
|
|
314a922109 | ||
|
|
2ed4762fab | ||
| 1c6d572559 | |||
|
|
58cc15a7e0 | ||
|
|
3da28531fd | ||
|
|
58f818cebc | ||
|
|
cff7ec922e | ||
|
|
a49f0580da | ||
|
|
10d4f015b2 | ||
|
|
669849b883 | ||
|
|
9ce9470677 | ||
| c3555d59f7 | |||
| 28af553498 | |||
| d42385de3e | |||
| 6104035474 | |||
| dabf043ce6 | |||
| 9b8ef436c8 | |||
| 8cf24a6c96 | |||
| 2797464b45 | |||
| 320cf901ba | |||
| 5bb0fc126e | |||
| 1b8271ed61 | |||
| fab030e9c0 | |||
| be6b65cedb | |||
| cbed39bd64 | |||
| 445622e936 | |||
| 17e28d2891 | |||
| 8325827c41 | |||
| 7c7ed38ead | |||
| 5d2fb32ff8 | |||
|
|
b62a5ba3fb | ||
| 359445ab43 | |||
| 4d1382cfc9 | |||
| b66c7de5fd | |||
| 3d4e5bdde1 | |||
| f37baf2447 | |||
| ad3d8d75c9 | |||
| 4ecfdcef7c | |||
|
|
63ae3e3f6f | ||
| eab2ce50b1 | |||
|
|
523db13be0 | ||
| 6a3f8cffe1 | |||
| 80a0f424cd | |||
| 8e9d85ccd7 | |||
| 85abd589d4 | |||
|
|
bfbd707293 | ||
|
|
526a895775 | ||
| 38e1eba112 | |||
|
|
f9e6cabe6d | ||
| 36bb695c15 | |||
|
|
b449b65244 | ||
| 1a9651914e | |||
|
|
9e5be20983 | ||
| d55bc98bbe | |||
|
|
46d677f5e7 | ||
| ef63b0f9f3 | |||
| 111ae84255 | |||
| d78a3c2550 | |||
| fb89158622 |
1
.gitignore
vendored
1
.gitignore
vendored
@ -2,6 +2,7 @@
|
||||
!README.md
|
||||
!knowledge/**/*.md
|
||||
!services/comms/knowledge/**/*.md
|
||||
!services/atlasbot/knowledge/**/*.md
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
.pytest_cache
|
||||
|
||||
@ -0,0 +1,26 @@
|
||||
# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImageUpdateAutomation
|
||||
metadata:
|
||||
name: atlasbot
|
||||
namespace: ai
|
||||
spec:
|
||||
interval: 1m0s
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
git:
|
||||
checkout:
|
||||
ref:
|
||||
branch: feature/atlasbot
|
||||
commit:
|
||||
author:
|
||||
email: ops@bstein.dev
|
||||
name: flux-bot
|
||||
messageTemplate: "chore(atlasbot): automated image update"
|
||||
push:
|
||||
branch: feature/atlasbot
|
||||
update:
|
||||
strategy: Setters
|
||||
path: services/atlasbot
|
||||
@ -0,0 +1,17 @@
|
||||
# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: atlasbot
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
path: ./services/atlasbot
|
||||
targetNamespace: ai
|
||||
timeout: 2m
|
||||
dependsOn:
|
||||
- name: ai-llm
|
||||
@ -13,14 +13,14 @@ spec:
|
||||
git:
|
||||
checkout:
|
||||
ref:
|
||||
branch: feature/ariadne
|
||||
branch: feature/atlasbot
|
||||
commit:
|
||||
author:
|
||||
email: ops@bstein.dev
|
||||
name: flux-bot
|
||||
messageTemplate: "chore(bstein-dev-home): automated image update"
|
||||
push:
|
||||
branch: feature/ariadne
|
||||
branch: feature/atlasbot
|
||||
update:
|
||||
strategy: Setters
|
||||
path: services/bstein-dev-home
|
||||
|
||||
@ -0,0 +1,26 @@
|
||||
# clusters/atlas/flux-system/applications/comms/image-automation.yaml
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImageUpdateAutomation
|
||||
metadata:
|
||||
name: comms
|
||||
namespace: comms
|
||||
spec:
|
||||
interval: 1m0s
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
git:
|
||||
checkout:
|
||||
ref:
|
||||
branch: feature/atlasbot
|
||||
commit:
|
||||
author:
|
||||
email: ops@bstein.dev
|
||||
name: flux-bot
|
||||
messageTemplate: "chore(comms): automated image update"
|
||||
push:
|
||||
branch: feature/atlasbot
|
||||
update:
|
||||
strategy: Setters
|
||||
path: services/comms
|
||||
@ -6,6 +6,9 @@ resources:
|
||||
- vault/kustomization.yaml
|
||||
- vaultwarden/kustomization.yaml
|
||||
- comms/kustomization.yaml
|
||||
- comms/image-automation.yaml
|
||||
- atlasbot/kustomization.yaml
|
||||
- atlasbot/image-automation.yaml
|
||||
- crypto/kustomization.yaml
|
||||
- monerod/kustomization.yaml
|
||||
- pegasus/kustomization.yaml
|
||||
|
||||
@ -9,7 +9,7 @@ metadata:
|
||||
spec:
|
||||
interval: 1m0s
|
||||
ref:
|
||||
branch: feature/ariadne
|
||||
branch: feature/atlasbot
|
||||
secretRef:
|
||||
name: flux-system-gitea
|
||||
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
|
||||
@ -16,5 +16,6 @@ resources:
|
||||
- longhorn/kustomization.yaml
|
||||
- longhorn-ui/kustomization.yaml
|
||||
- postgres/kustomization.yaml
|
||||
- nats/kustomization.yaml
|
||||
- ../platform/vault-csi/kustomization.yaml
|
||||
- ../platform/vault-injector/kustomization.yaml
|
||||
|
||||
@ -13,14 +13,14 @@ spec:
|
||||
git:
|
||||
checkout:
|
||||
ref:
|
||||
branch: feature/ariadne
|
||||
branch: feature/atlasbot
|
||||
commit:
|
||||
author:
|
||||
email: ops@bstein.dev
|
||||
name: flux-bot
|
||||
messageTemplate: "chore(maintenance): automated image update"
|
||||
push:
|
||||
branch: feature/ariadne
|
||||
branch: feature/atlasbot
|
||||
update:
|
||||
strategy: Setters
|
||||
path: services/maintenance
|
||||
|
||||
21
clusters/atlas/flux-system/platform/nats/kustomization.yaml
Normal file
21
clusters/atlas/flux-system/platform/nats/kustomization.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
# clusters/atlas/flux-system/platform/nats/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: nats
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./infrastructure/nats
|
||||
prune: true
|
||||
force: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
targetNamespace: nats
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
name: nats
|
||||
namespace: nats
|
||||
wait: true
|
||||
3
dockerfiles/Dockerfile.synapse-admin-ensure
Normal file
3
dockerfiles/Dockerfile.synapse-admin-ensure
Normal file
@ -0,0 +1,3 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
RUN pip install --no-cache-dir psycopg2-binary bcrypt
|
||||
@ -6,6 +6,7 @@ resources:
|
||||
- ../modules/profiles/atlas-ha
|
||||
- coredns-custom.yaml
|
||||
- coredns-deployment.yaml
|
||||
- longhorn-node-taints.yaml
|
||||
- ntp-sync-daemonset.yaml
|
||||
- ../sources/cert-manager/letsencrypt.yaml
|
||||
- ../sources/cert-manager/letsencrypt-prod.yaml
|
||||
|
||||
40
infrastructure/core/longhorn-node-taints.yaml
Normal file
40
infrastructure/core/longhorn-node-taints.yaml
Normal file
@ -0,0 +1,40 @@
|
||||
# infrastructure/core/longhorn-node-taints.yaml
|
||||
apiVersion: v1
|
||||
kind: Node
|
||||
metadata:
|
||||
name: titan-13
|
||||
spec:
|
||||
taints:
|
||||
- key: longhorn
|
||||
value: "true"
|
||||
effect: PreferNoSchedule
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Node
|
||||
metadata:
|
||||
name: titan-15
|
||||
spec:
|
||||
taints:
|
||||
- key: longhorn
|
||||
value: "true"
|
||||
effect: PreferNoSchedule
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Node
|
||||
metadata:
|
||||
name: titan-17
|
||||
spec:
|
||||
taints:
|
||||
- key: longhorn
|
||||
value: "true"
|
||||
effect: PreferNoSchedule
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Node
|
||||
metadata:
|
||||
name: titan-19
|
||||
spec:
|
||||
taints:
|
||||
- key: longhorn
|
||||
value: "true"
|
||||
effect: PreferNoSchedule
|
||||
10
infrastructure/longhorn/core/backup-target.yaml
Normal file
10
infrastructure/longhorn/core/backup-target.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
# infrastructure/longhorn/core/backup-target.yaml
|
||||
apiVersion: longhorn.io/v1beta2
|
||||
kind: BackupTarget
|
||||
metadata:
|
||||
name: default
|
||||
namespace: longhorn-system
|
||||
spec:
|
||||
backupTargetURL: "s3://atlas-soteria@us-west-004/"
|
||||
credentialSecret: longhorn-backup-b2
|
||||
pollInterval: 5m0s
|
||||
@ -6,6 +6,39 @@ metadata:
|
||||
namespace: longhorn-system
|
||||
spec:
|
||||
interval: 30m
|
||||
postRenderers:
|
||||
- kustomize:
|
||||
patches:
|
||||
- target:
|
||||
kind: Service
|
||||
name: longhorn-conversion-webhook
|
||||
namespace: longhorn-system
|
||||
patch: |
|
||||
- op: add
|
||||
path: /spec/publishNotReadyAddresses
|
||||
value: true
|
||||
- target:
|
||||
kind: Service
|
||||
name: longhorn-admission-webhook
|
||||
namespace: longhorn-system
|
||||
patch: |
|
||||
- op: add
|
||||
path: /spec/publishNotReadyAddresses
|
||||
value: true
|
||||
- target:
|
||||
kind: DaemonSet
|
||||
name: longhorn-manager
|
||||
namespace: longhorn-system
|
||||
patch: |
|
||||
- op: replace
|
||||
path: /spec/template/spec/containers/0/readinessProbe/httpGet/path
|
||||
value: /v1/healthz
|
||||
- op: replace
|
||||
path: /spec/template/spec/containers/0/readinessProbe/httpGet/port
|
||||
value: 9500
|
||||
- op: replace
|
||||
path: /spec/template/spec/containers/0/readinessProbe/httpGet/scheme
|
||||
value: HTTP
|
||||
chart:
|
||||
spec:
|
||||
chart: longhorn
|
||||
@ -34,7 +67,7 @@ spec:
|
||||
createSecret: false
|
||||
registrySecret: longhorn-registry
|
||||
image:
|
||||
pullPolicy: Always
|
||||
pullPolicy: IfNotPresent
|
||||
longhorn:
|
||||
engine:
|
||||
repository: registry.bstein.dev/infra/longhorn-engine
|
||||
@ -77,4 +110,4 @@ spec:
|
||||
repository: registry.bstein.dev/infra/longhorn-livenessprobe
|
||||
tag: v2.16.0
|
||||
defaultSettings:
|
||||
systemManagedPodsImagePullPolicy: Always
|
||||
systemManagedPodsImagePullPolicy: IfNotPresent
|
||||
|
||||
@ -6,6 +6,7 @@ resources:
|
||||
- vault-serviceaccount.yaml
|
||||
- secretproviderclass.yaml
|
||||
- vault-sync-deployment.yaml
|
||||
- backup-target.yaml
|
||||
- helmrelease.yaml
|
||||
- longhorn-settings-ensure-job.yaml
|
||||
|
||||
|
||||
@ -13,9 +13,27 @@ spec:
|
||||
- objectName: "harbor-pull__dockerconfigjson"
|
||||
secretPath: "kv/data/atlas/shared/harbor-pull"
|
||||
secretKey: "dockerconfigjson"
|
||||
- objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
|
||||
secretPath: "kv/data/atlas/longhorn/backup-b2"
|
||||
secretKey: "AWS_ACCESS_KEY_ID"
|
||||
- objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
|
||||
secretPath: "kv/data/atlas/longhorn/backup-b2"
|
||||
secretKey: "AWS_SECRET_ACCESS_KEY"
|
||||
- objectName: "longhorn_backup__AWS_ENDPOINTS"
|
||||
secretPath: "kv/data/atlas/longhorn/backup-b2"
|
||||
secretKey: "AWS_ENDPOINTS"
|
||||
secretObjects:
|
||||
- secretName: longhorn-registry
|
||||
type: kubernetes.io/dockerconfigjson
|
||||
data:
|
||||
- objectName: harbor-pull__dockerconfigjson
|
||||
key: .dockerconfigjson
|
||||
- secretName: longhorn-backup-b2
|
||||
type: Opaque
|
||||
data:
|
||||
- objectName: longhorn_backup__AWS_ACCESS_KEY_ID
|
||||
key: AWS_ACCESS_KEY_ID
|
||||
- objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
|
||||
key: AWS_SECRET_ACCESS_KEY
|
||||
- objectName: longhorn_backup__AWS_ENDPOINTS
|
||||
key: AWS_ENDPOINTS
|
||||
|
||||
17
infrastructure/nats/configmap.yaml
Normal file
17
infrastructure/nats/configmap.yaml
Normal file
@ -0,0 +1,17 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: nats-config
|
||||
namespace: nats
|
||||
labels:
|
||||
app: nats
|
||||
component: config
|
||||
annotations:
|
||||
description: "NATS JetStream configuration"
|
||||
data:
|
||||
nats.conf: |
|
||||
jetstream {
|
||||
store_dir: /data
|
||||
max_mem_store: 128MB
|
||||
max_file_store: 1GB
|
||||
}
|
||||
7
infrastructure/nats/kustomization.yaml
Normal file
7
infrastructure/nats/kustomization.yaml
Normal file
@ -0,0 +1,7 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- configmap.yaml
|
||||
- service.yaml
|
||||
- statefulset.yaml
|
||||
4
infrastructure/nats/namespace.yaml
Normal file
4
infrastructure/nats/namespace.yaml
Normal file
@ -0,0 +1,4 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: nats
|
||||
17
infrastructure/nats/service.yaml
Normal file
17
infrastructure/nats/service.yaml
Normal file
@ -0,0 +1,17 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: nats
|
||||
namespace: nats
|
||||
labels:
|
||||
app: nats
|
||||
spec:
|
||||
selector:
|
||||
app: nats
|
||||
ports:
|
||||
- name: client
|
||||
port: 4222
|
||||
targetPort: 4222
|
||||
- name: monitoring
|
||||
port: 8222
|
||||
targetPort: 8222
|
||||
54
infrastructure/nats/statefulset.yaml
Normal file
54
infrastructure/nats/statefulset.yaml
Normal file
@ -0,0 +1,54 @@
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: nats
|
||||
namespace: nats
|
||||
labels:
|
||||
app: nats
|
||||
spec:
|
||||
serviceName: nats
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: nats
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: nats
|
||||
spec:
|
||||
containers:
|
||||
- name: nats
|
||||
image: nats:2.10.18
|
||||
args:
|
||||
- "-c"
|
||||
- "/etc/nats/nats.conf"
|
||||
ports:
|
||||
- name: client
|
||||
containerPort: 4222
|
||||
- name: monitoring
|
||||
containerPort: 8222
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/nats
|
||||
- name: data
|
||||
mountPath: /data
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: nats-config
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: data
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 2Gi
|
||||
@ -47,6 +47,7 @@ PERCENT_THRESHOLDS = {
|
||||
}
|
||||
|
||||
NAMESPACE_CPU_WINDOW = "1m"
|
||||
GPU_RESOURCE_REGEX = r"nvidia[.]com/gpu.*|nvidia_com_gpu.*"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cluster metadata
|
||||
@ -235,13 +236,16 @@ def gpu_util_by_hostname():
|
||||
|
||||
|
||||
def gpu_node_labels():
|
||||
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
|
||||
return (
|
||||
f'(max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0))'
|
||||
' or kube_node_labels{label_jetson="true"}'
|
||||
)
|
||||
|
||||
|
||||
def gpu_requests_by_namespace_node(scope_var):
|
||||
return (
|
||||
"sum by (namespace,node) ("
|
||||
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
|
||||
f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
|
||||
"* on(namespace,pod) group_left(node) kube_pod_info "
|
||||
f"* on(node) group_left() ({gpu_node_labels()})"
|
||||
")"
|
||||
@ -253,7 +257,7 @@ def gpu_usage_by_namespace(scope_var):
|
||||
total_by_node = f"sum by (node) ({requests_by_ns})"
|
||||
return (
|
||||
"sum by (namespace) ("
|
||||
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
|
||||
f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
|
||||
f"* on(node) group_left() ({gpu_util_by_node()})"
|
||||
")"
|
||||
)
|
||||
@ -419,16 +423,17 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
|
||||
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
|
||||
)
|
||||
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
|
||||
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
|
||||
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
|
||||
ARIADNE_TEST_SUCCESS_RATE = (
|
||||
TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
|
||||
TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
|
||||
TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
|
||||
TEST_SUCCESS_RATE = (
|
||||
"100 * "
|
||||
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
|
||||
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
|
||||
"/ clamp_min("
|
||||
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
|
||||
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
|
||||
)
|
||||
ARIADNE_TEST_FAILURES_24H = (
|
||||
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
|
||||
TEST_FAILURES_24H = (
|
||||
f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
|
||||
)
|
||||
POSTGRES_CONN_USED = (
|
||||
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
|
||||
@ -1290,23 +1295,25 @@ def build_overview():
|
||||
},
|
||||
}
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
test_success = timeseries_panel(
|
||||
42,
|
||||
"Ariadne Test Success Rate",
|
||||
ARIADNE_TEST_SUCCESS_RATE,
|
||||
"Platform Test Success Rate",
|
||||
TEST_SUCCESS_RATE,
|
||||
{"h": 6, "w": 6, "x": 12, "y": 14},
|
||||
unit="percent",
|
||||
max_value=100,
|
||||
legend=None,
|
||||
legend_display="list",
|
||||
)
|
||||
test_success["description"] = (
|
||||
"Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. "
|
||||
"Add new test series there first so they roll up here."
|
||||
)
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
panels.append(test_success)
|
||||
test_failures = bargauge_panel(
|
||||
43,
|
||||
"Tests with Failures (24h)",
|
||||
ARIADNE_TEST_FAILURES_24H,
|
||||
"Platform Tests with Failures (24h)",
|
||||
TEST_FAILURES_24H,
|
||||
{"h": 6, "w": 6, "x": 18, "y": 14},
|
||||
unit="none",
|
||||
instant=True,
|
||||
@ -1331,7 +1338,10 @@ def build_overview():
|
||||
],
|
||||
},
|
||||
)
|
||||
test_failures["description"] = (
|
||||
"This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
|
||||
)
|
||||
panels.append(test_failures)
|
||||
|
||||
cpu_scope = "$namespace_scope_cpu"
|
||||
gpu_scope = "$namespace_scope_gpu"
|
||||
@ -2649,29 +2659,31 @@ def build_jobs_dashboard():
|
||||
legend="{{status}}",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
coverage_panel = stat_panel(
|
||||
17,
|
||||
"Ariadne CI Coverage (%)",
|
||||
ARIADNE_CI_COVERAGE,
|
||||
"Platform CI Coverage (%)",
|
||||
TEST_CI_COVERAGE,
|
||||
{"h": 6, "w": 4, "x": 8, "y": 11},
|
||||
unit="percent",
|
||||
decimals=1,
|
||||
instant=True,
|
||||
legend="{{branch}}",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
table_panel(
|
||||
coverage_panel["description"] = "Internal source panel for Atlas Overview automation test rollups."
|
||||
panels.append(coverage_panel)
|
||||
tests_panel = table_panel(
|
||||
18,
|
||||
"Ariadne CI Tests (latest)",
|
||||
ARIADNE_CI_TESTS,
|
||||
"Platform CI Tests (latest)",
|
||||
TEST_CI_TESTS,
|
||||
{"h": 6, "w": 12, "x": 12, "y": 11},
|
||||
unit="none",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
|
||||
instant=True,
|
||||
)
|
||||
tests_panel["description"] = (
|
||||
"Atlas Overview test panels depend on these internal repo-tagged CI series."
|
||||
)
|
||||
panels.append(tests_panel)
|
||||
|
||||
return {
|
||||
"uid": "atlas-jobs",
|
||||
|
||||
@ -539,9 +539,9 @@ def main() -> int:
|
||||
help="Write generated files (otherwise just print a summary).",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--sync-comms",
|
||||
"--sync-atlasbot",
|
||||
action="store_true",
|
||||
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
|
||||
help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
|
||||
)
|
||||
args = ap.parse_args()
|
||||
|
||||
@ -632,10 +632,10 @@ def main() -> int:
|
||||
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
|
||||
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
|
||||
|
||||
if args.sync_comms:
|
||||
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
|
||||
_sync_tree(out_dir, comms_dir)
|
||||
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
|
||||
if args.sync_atlasbot:
|
||||
atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
|
||||
_sync_tree(out_dir, atlasbot_dir)
|
||||
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@ apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: atlasbot
|
||||
namespace: comms
|
||||
namespace: ai
|
||||
labels:
|
||||
app: atlasbot
|
||||
spec:
|
||||
@ -18,7 +18,7 @@ spec:
|
||||
annotations:
|
||||
checksum/atlasbot-configmap: manual-atlasbot-101
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/role: "comms"
|
||||
vault.hashicorp.com/role: "ai"
|
||||
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
||||
vault.hashicorp.com/agent-inject-template-turn-secret: |
|
||||
{{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
|
||||
@ -28,6 +28,15 @@ spec:
|
||||
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-bot-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
|
||||
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
|
||||
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
|
||||
vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
|
||||
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-seeder-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
|
||||
@ -58,17 +67,17 @@ spec:
|
||||
hardware: rpi5
|
||||
containers:
|
||||
- name: atlasbot
|
||||
image: python:3.11-slim
|
||||
image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
|
||||
command: ["/bin/sh","-c"]
|
||||
args:
|
||||
- |
|
||||
. /vault/scripts/comms_vault_env.sh
|
||||
exec python /app/bot.py
|
||||
. /vault/scripts/atlasbot_vault_env.sh
|
||||
exec python -m atlasbot.main
|
||||
env:
|
||||
- name: MATRIX_BASE
|
||||
value: http://othrys-synapse-matrix-synapse:8008
|
||||
value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
|
||||
- name: AUTH_BASE
|
||||
value: http://matrix-authentication-service:8080
|
||||
value: http://matrix-authentication-service.comms.svc.cluster.local:8080
|
||||
- name: KB_DIR
|
||||
value: /kb
|
||||
- name: VM_URL
|
||||
@ -76,27 +85,69 @@ spec:
|
||||
- name: ARIADNE_STATE_URL
|
||||
value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
|
||||
- name: BOT_USER
|
||||
value: atlasbot
|
||||
value: atlas-smart
|
||||
- name: BOT_USER_QUICK
|
||||
value: atlas-quick
|
||||
- name: BOT_USER_SMART
|
||||
value: atlas-smart
|
||||
- name: BOT_USER_GENIUS
|
||||
value: atlas-genius
|
||||
- name: BOT_MENTIONS
|
||||
value: atlasbot,aatlasbot,atlas_quick,atlas_smart
|
||||
value: atlas-quick,atlas-smart,atlas-genius
|
||||
- name: OLLAMA_URL
|
||||
value: http://ollama.ai.svc.cluster.local:11434
|
||||
- name: OLLAMA_MODEL
|
||||
value: qwen2.5:14b-instruct
|
||||
- name: ATLASBOT_MODEL_FAST
|
||||
value: qwen2.5:14b-instruct-q4_0
|
||||
- name: ATLASBOT_MODEL_DEEP
|
||||
value: qwen2.5:14b-instruct
|
||||
- name: ATLASBOT_MODEL_FAST
|
||||
value: qwen2.5-coder:7b-instruct-q4_0
|
||||
- name: ATLASBOT_MODEL_SMART
|
||||
value: qwen2.5:14b-instruct-q4_0
|
||||
- name: ATLASBOT_MODEL_GENIUS
|
||||
value: qwen2.5:14b-instruct-q4_0
|
||||
- name: OLLAMA_FALLBACK_MODEL
|
||||
value: qwen2.5:14b-instruct-q4_0
|
||||
- name: OLLAMA_TIMEOUT_SEC
|
||||
value: "600"
|
||||
- name: OLLAMA_RETRIES
|
||||
value: "0"
|
||||
- name: ATLASBOT_THINKING_INTERVAL_SEC
|
||||
value: "120"
|
||||
value: "30"
|
||||
- name: ATLASBOT_QUICK_TIME_BUDGET_SEC
|
||||
value: "15"
|
||||
- name: ATLASBOT_SMART_TIME_BUDGET_SEC
|
||||
value: "45"
|
||||
- name: ATLASBOT_GENIUS_TIME_BUDGET_SEC
|
||||
value: "180"
|
||||
- name: ATLASBOT_SNAPSHOT_TTL_SEC
|
||||
value: "30"
|
||||
- name: ATLASBOT_HTTP_PORT
|
||||
value: "8090"
|
||||
- name: ATLASBOT_STATE_DB
|
||||
value: /data/atlasbot_state.db
|
||||
- name: ATLASBOT_QUEUE_ENABLED
|
||||
value: "false"
|
||||
- name: ATLASBOT_DEBUG_PIPELINE
|
||||
value: "true"
|
||||
- name: ATLASBOT_NATS_URL
|
||||
value: nats://nats.nats.svc.cluster.local:4222
|
||||
- name: ATLASBOT_NATS_STREAM
|
||||
value: atlasbot
|
||||
- name: ATLASBOT_NATS_SUBJECT
|
||||
value: atlasbot.requests
|
||||
- name: ATLASBOT_FAST_MAX_ANGLES
|
||||
value: "2"
|
||||
- name: ATLASBOT_SMART_MAX_ANGLES
|
||||
value: "5"
|
||||
- name: ATLASBOT_FAST_MAX_CANDIDATES
|
||||
value: "2"
|
||||
- name: ATLASBOT_SMART_MAX_CANDIDATES
|
||||
value: "6"
|
||||
- name: ATLASBOT_FAST_LLM_CALLS_MAX
|
||||
value: "8"
|
||||
- name: ATLASBOT_SMART_LLM_CALLS_MAX
|
||||
value: "24"
|
||||
- name: ATLASBOT_GENIUS_LLM_CALLS_MAX
|
||||
value: "72"
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8090
|
||||
@ -108,19 +159,15 @@ spec:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
volumeMounts:
|
||||
- name: code
|
||||
mountPath: /app/bot.py
|
||||
subPath: bot.py
|
||||
- name: kb
|
||||
mountPath: /kb
|
||||
readOnly: true
|
||||
- name: vault-scripts
|
||||
mountPath: /vault/scripts
|
||||
readOnly: true
|
||||
- name: atlasbot-state
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: code
|
||||
configMap:
|
||||
name: atlasbot
|
||||
- name: kb
|
||||
configMap:
|
||||
name: atlas-kb
|
||||
@ -139,5 +186,7 @@ spec:
|
||||
path: diagrams/atlas-http.mmd
|
||||
- name: vault-scripts
|
||||
configMap:
|
||||
name: comms-vault-env
|
||||
name: atlasbot-vault-env
|
||||
defaultMode: 0555
|
||||
- name: atlasbot-state
|
||||
emptyDir: {}
|
||||
@ -3,7 +3,9 @@ apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: atlasbot
|
||||
namespace: comms
|
||||
namespace: ai
|
||||
imagePullSecrets:
|
||||
- name: harbor-regcred
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
@ -43,5 +45,4 @@ roleRef:
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: atlasbot
|
||||
namespace: comms
|
||||
|
||||
namespace: ai
|
||||
@ -2,7 +2,7 @@ apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: atlasbot
|
||||
namespace: comms
|
||||
namespace: ai
|
||||
labels:
|
||||
app: atlasbot
|
||||
spec:
|
||||
26
services/atlasbot/image-automation.yaml
Normal file
26
services/atlasbot/image-automation.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
# services/atlasbot/image-automation.yaml
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImageUpdateAutomation
|
||||
metadata:
|
||||
name: atlasbot
|
||||
namespace: ai
|
||||
spec:
|
||||
interval: 1m0s
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
git:
|
||||
checkout:
|
||||
ref:
|
||||
branch: feature/atlasbot
|
||||
commit:
|
||||
author:
|
||||
name: flux-bot
|
||||
email: ops@bstein.dev
|
||||
messageTemplate: "chore(atlasbot): automated image update"
|
||||
push:
|
||||
branch: feature/atlasbot
|
||||
update:
|
||||
path: services/atlasbot
|
||||
strategy: Setters
|
||||
23
services/atlasbot/image.yaml
Normal file
23
services/atlasbot/image.yaml
Normal file
@ -0,0 +1,23 @@
|
||||
# services/atlasbot/image.yaml
|
||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
||||
kind: ImageRepository
|
||||
metadata:
|
||||
name: atlasbot
|
||||
namespace: ai
|
||||
spec:
|
||||
image: registry.bstein.dev/bstein/atlasbot
|
||||
interval: 1m0s
|
||||
secretRef:
|
||||
name: harbor-regcred
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
||||
kind: ImagePolicy
|
||||
metadata:
|
||||
name: atlasbot
|
||||
namespace: ai
|
||||
spec:
|
||||
imageRepositoryRef:
|
||||
name: atlasbot
|
||||
policy:
|
||||
semver:
|
||||
range: ">=0.1.0-0"
|
||||
22
services/atlasbot/knowledge/INDEX.md
Normal file
22
services/atlasbot/knowledge/INDEX.md
Normal file
@ -0,0 +1,22 @@
|
||||
Atlas Knowledge Base (KB)
|
||||
|
||||
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
|
||||
- Accurate (grounded in GitOps + read-only cluster tools)
|
||||
- Maintainable (small docs + deterministic generators)
|
||||
- Safe (no secrets; refer to Secret/Vault paths by name only)
|
||||
|
||||
Layout
|
||||
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
|
||||
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
|
||||
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
|
||||
|
||||
Regeneration
|
||||
- Update manifests/docs, then regenerate generated artifacts:
|
||||
- `python scripts/knowledge_render_atlas.py --write`
|
||||
|
||||
Authoring rules
|
||||
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
|
||||
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
|
||||
- Keep each runbook small; one topic per file; use headings.
|
||||
- When in doubt, link to the exact file path in this repo that configures the behavior.
|
||||
|
||||
8
services/atlasbot/knowledge/catalog/atlas-summary.json
Normal file
8
services/atlasbot/knowledge/catalog/atlas-summary.json
Normal file
@ -0,0 +1,8 @@
|
||||
{
|
||||
"counts": {
|
||||
"helmrelease_host_hints": 19,
|
||||
"http_endpoints": 45,
|
||||
"services": 47,
|
||||
"workloads": 74
|
||||
}
|
||||
}
|
||||
3445
services/atlasbot/knowledge/catalog/atlas.json
Normal file
3445
services/atlasbot/knowledge/catalog/atlas.json
Normal file
File diff suppressed because it is too large
Load Diff
1880
services/atlasbot/knowledge/catalog/metrics.json
Normal file
1880
services/atlasbot/knowledge/catalog/metrics.json
Normal file
File diff suppressed because it is too large
Load Diff
97
services/atlasbot/knowledge/catalog/runbooks.json
Normal file
97
services/atlasbot/knowledge/catalog/runbooks.json
Normal file
File diff suppressed because one or more lines are too long
234
services/atlasbot/knowledge/diagrams/atlas-http.mmd
Normal file
234
services/atlasbot/knowledge/diagrams/atlas-http.mmd
Normal file
@ -0,0 +1,234 @@
|
||||
flowchart LR
|
||||
host_auth_bstein_dev["auth.bstein.dev"]
|
||||
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
|
||||
host_auth_bstein_dev --> svc_sso_oauth2_proxy
|
||||
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
|
||||
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
|
||||
host_bstein_dev["bstein.dev"]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
|
||||
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
|
||||
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
|
||||
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
|
||||
host_bstein_dev --> svc_comms_matrix_wellknown
|
||||
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
|
||||
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
|
||||
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
|
||||
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
|
||||
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
|
||||
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
|
||||
host_budget_bstein_dev["budget.bstein.dev"]
|
||||
svc_finance_actual_budget["finance/actual-budget (Service)"]
|
||||
host_budget_bstein_dev --> svc_finance_actual_budget
|
||||
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
|
||||
svc_finance_actual_budget --> wl_finance_actual_budget
|
||||
host_call_live_bstein_dev["call.live.bstein.dev"]
|
||||
svc_comms_element_call["comms/element-call (Service)"]
|
||||
host_call_live_bstein_dev --> svc_comms_element_call
|
||||
wl_comms_element_call["comms/element-call (Deployment)"]
|
||||
svc_comms_element_call --> wl_comms_element_call
|
||||
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
|
||||
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
|
||||
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
|
||||
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
|
||||
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
|
||||
host_ci_bstein_dev["ci.bstein.dev"]
|
||||
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
|
||||
host_ci_bstein_dev --> svc_jenkins_jenkins
|
||||
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
|
||||
svc_jenkins_jenkins --> wl_jenkins_jenkins
|
||||
host_cloud_bstein_dev["cloud.bstein.dev"]
|
||||
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
|
||||
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
|
||||
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
|
||||
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
|
||||
host_health_bstein_dev["health.bstein.dev"]
|
||||
svc_health_wger["health/wger (Service)"]
|
||||
host_health_bstein_dev --> svc_health_wger
|
||||
wl_health_wger["health/wger (Deployment)"]
|
||||
svc_health_wger --> wl_health_wger
|
||||
host_kit_live_bstein_dev["kit.live.bstein.dev"]
|
||||
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
|
||||
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
|
||||
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
|
||||
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
|
||||
svc_comms_livekit["comms/livekit (Service)"]
|
||||
host_kit_live_bstein_dev --> svc_comms_livekit
|
||||
wl_comms_livekit["comms/livekit (Deployment)"]
|
||||
svc_comms_livekit --> wl_comms_livekit
|
||||
host_live_bstein_dev["live.bstein.dev"]
|
||||
host_live_bstein_dev --> svc_comms_matrix_wellknown
|
||||
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
|
||||
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
|
||||
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
|
||||
host_live_bstein_dev --> svc_comms_matrix_guest_register
|
||||
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
|
||||
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
|
||||
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
|
||||
host_live_bstein_dev --> svc_comms_matrix_authentication_service
|
||||
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
|
||||
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
|
||||
host_logs_bstein_dev["logs.bstein.dev"]
|
||||
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
|
||||
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
|
||||
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
|
||||
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
|
||||
host_longhorn_bstein_dev["longhorn.bstein.dev"]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
|
||||
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
|
||||
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
|
||||
host_mail_bstein_dev["mail.bstein.dev"]
|
||||
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
|
||||
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
|
||||
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
|
||||
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
|
||||
host_monero_bstein_dev["monero.bstein.dev"]
|
||||
svc_crypto_monerod["crypto/monerod (Service)"]
|
||||
host_monero_bstein_dev --> svc_crypto_monerod
|
||||
wl_crypto_monerod["crypto/monerod (Deployment)"]
|
||||
svc_crypto_monerod --> wl_crypto_monerod
|
||||
host_money_bstein_dev["money.bstein.dev"]
|
||||
svc_finance_firefly["finance/firefly (Service)"]
|
||||
host_money_bstein_dev --> svc_finance_firefly
|
||||
wl_finance_firefly["finance/firefly (Deployment)"]
|
||||
svc_finance_firefly --> wl_finance_firefly
|
||||
host_notes_bstein_dev["notes.bstein.dev"]
|
||||
svc_outline_outline["outline/outline (Service)"]
|
||||
host_notes_bstein_dev --> svc_outline_outline
|
||||
wl_outline_outline["outline/outline (Deployment)"]
|
||||
svc_outline_outline --> wl_outline_outline
|
||||
host_office_bstein_dev["office.bstein.dev"]
|
||||
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
|
||||
host_office_bstein_dev --> svc_nextcloud_collabora
|
||||
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
|
||||
svc_nextcloud_collabora --> wl_nextcloud_collabora
|
||||
host_pegasus_bstein_dev["pegasus.bstein.dev"]
|
||||
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
|
||||
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
|
||||
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
|
||||
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
|
||||
host_scm_bstein_dev["scm.bstein.dev"]
|
||||
svc_gitea_gitea["gitea/gitea (Service)"]
|
||||
host_scm_bstein_dev --> svc_gitea_gitea
|
||||
wl_gitea_gitea["gitea/gitea (Deployment)"]
|
||||
svc_gitea_gitea --> wl_gitea_gitea
|
||||
host_secret_bstein_dev["secret.bstein.dev"]
|
||||
svc_vault_vault["vault/vault (Service)"]
|
||||
host_secret_bstein_dev --> svc_vault_vault
|
||||
wl_vault_vault["vault/vault (StatefulSet)"]
|
||||
svc_vault_vault --> wl_vault_vault
|
||||
host_sso_bstein_dev["sso.bstein.dev"]
|
||||
svc_sso_keycloak["sso/keycloak (Service)"]
|
||||
host_sso_bstein_dev --> svc_sso_keycloak
|
||||
wl_sso_keycloak["sso/keycloak (Deployment)"]
|
||||
svc_sso_keycloak --> wl_sso_keycloak
|
||||
host_stream_bstein_dev["stream.bstein.dev"]
|
||||
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
|
||||
host_stream_bstein_dev --> svc_jellyfin_jellyfin
|
||||
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
|
||||
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
|
||||
host_tasks_bstein_dev["tasks.bstein.dev"]
|
||||
svc_planka_planka["planka/planka (Service)"]
|
||||
host_tasks_bstein_dev --> svc_planka_planka
|
||||
wl_planka_planka["planka/planka (Deployment)"]
|
||||
svc_planka_planka --> wl_planka_planka
|
||||
host_vault_bstein_dev["vault.bstein.dev"]
|
||||
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
|
||||
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
|
||||
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
|
||||
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
|
||||
|
||||
subgraph bstein_dev_home[bstein-dev-home]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend
|
||||
wl_bstein_dev_home_bstein_dev_home_frontend
|
||||
svc_bstein_dev_home_bstein_dev_home_backend
|
||||
wl_bstein_dev_home_bstein_dev_home_backend
|
||||
svc_bstein_dev_home_chat_ai_gateway
|
||||
wl_bstein_dev_home_chat_ai_gateway
|
||||
end
|
||||
subgraph comms[comms]
|
||||
svc_comms_matrix_wellknown
|
||||
wl_comms_matrix_wellknown
|
||||
svc_comms_element_call
|
||||
wl_comms_element_call
|
||||
svc_comms_livekit_token_service
|
||||
wl_comms_livekit_token_service
|
||||
svc_comms_livekit
|
||||
wl_comms_livekit
|
||||
svc_comms_othrys_synapse_matrix_synapse
|
||||
svc_comms_matrix_guest_register
|
||||
wl_comms_matrix_guest_register
|
||||
svc_comms_matrix_authentication_service
|
||||
wl_comms_matrix_authentication_service
|
||||
end
|
||||
subgraph crypto[crypto]
|
||||
svc_crypto_monerod
|
||||
wl_crypto_monerod
|
||||
end
|
||||
subgraph finance[finance]
|
||||
svc_finance_actual_budget
|
||||
wl_finance_actual_budget
|
||||
svc_finance_firefly
|
||||
wl_finance_firefly
|
||||
end
|
||||
subgraph gitea[gitea]
|
||||
svc_gitea_gitea
|
||||
wl_gitea_gitea
|
||||
end
|
||||
subgraph health[health]
|
||||
svc_health_wger
|
||||
wl_health_wger
|
||||
end
|
||||
subgraph jellyfin[jellyfin]
|
||||
svc_jellyfin_pegasus
|
||||
wl_jellyfin_pegasus
|
||||
svc_jellyfin_jellyfin
|
||||
wl_jellyfin_jellyfin
|
||||
end
|
||||
subgraph jenkins[jenkins]
|
||||
svc_jenkins_jenkins
|
||||
wl_jenkins_jenkins
|
||||
end
|
||||
subgraph logging[logging]
|
||||
svc_logging_oauth2_proxy_logs
|
||||
wl_logging_oauth2_proxy_logs
|
||||
end
|
||||
subgraph longhorn_system[longhorn-system]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn
|
||||
wl_longhorn_system_oauth2_proxy_longhorn
|
||||
end
|
||||
subgraph mailu_mailserver[mailu-mailserver]
|
||||
svc_mailu_mailserver_mailu_front
|
||||
end
|
||||
subgraph nextcloud[nextcloud]
|
||||
svc_nextcloud_nextcloud
|
||||
wl_nextcloud_nextcloud
|
||||
svc_nextcloud_collabora
|
||||
wl_nextcloud_collabora
|
||||
end
|
||||
subgraph outline[outline]
|
||||
svc_outline_outline
|
||||
wl_outline_outline
|
||||
end
|
||||
subgraph planka[planka]
|
||||
svc_planka_planka
|
||||
wl_planka_planka
|
||||
end
|
||||
subgraph sso[sso]
|
||||
svc_sso_oauth2_proxy
|
||||
wl_sso_oauth2_proxy
|
||||
svc_sso_keycloak
|
||||
wl_sso_keycloak
|
||||
end
|
||||
subgraph vault[vault]
|
||||
svc_vault_vault
|
||||
wl_vault_vault
|
||||
end
|
||||
subgraph vaultwarden[vaultwarden]
|
||||
svc_vaultwarden_vaultwarden_service
|
||||
wl_vaultwarden_vaultwarden
|
||||
end
|
||||
29
services/atlasbot/kustomization.yaml
Normal file
29
services/atlasbot/kustomization.yaml
Normal file
@ -0,0 +1,29 @@
|
||||
# services/atlasbot/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: ai
|
||||
resources:
|
||||
- atlasbot-deployment.yaml
|
||||
- atlasbot-service.yaml
|
||||
- atlasbot-rbac.yaml
|
||||
- secretproviderclass.yaml
|
||||
- vault-sync-deployment.yaml
|
||||
- image.yaml
|
||||
- image-automation.yaml
|
||||
images:
|
||||
- name: registry.bstein.dev/bstein/atlasbot
|
||||
newTag: 0.1.2-106 # {"$imagepolicy": "ai:atlasbot:tag"}
|
||||
configMapGenerator:
|
||||
- name: atlasbot-vault-env
|
||||
files:
|
||||
- atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: atlas-kb
|
||||
files:
|
||||
- INDEX.md=knowledge/INDEX.md
|
||||
- atlas.json=knowledge/catalog/atlas.json
|
||||
- atlas-summary.json=knowledge/catalog/atlas-summary.json
|
||||
- metrics.json=knowledge/catalog/metrics.json
|
||||
- runbooks.json=knowledge/catalog/runbooks.json
|
||||
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd
|
||||
44
services/atlasbot/scripts/atlasbot_vault_env.sh
Normal file
44
services/atlasbot/scripts/atlasbot_vault_env.sh
Normal file
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env sh
|
||||
set -eu
|
||||
|
||||
vault_dir="/vault/secrets"
|
||||
|
||||
read_secret() {
|
||||
tr -d '\r\n' < "${vault_dir}/$1"
|
||||
}
|
||||
|
||||
read_optional() {
|
||||
if [ -f "${vault_dir}/$1" ]; then
|
||||
tr -d '\r\n' < "${vault_dir}/$1"
|
||||
else
|
||||
printf ''
|
||||
fi
|
||||
}
|
||||
|
||||
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
|
||||
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
|
||||
|
||||
export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
|
||||
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
|
||||
|
||||
export BOT_PASS="$(read_secret bot-pass)"
|
||||
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
|
||||
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
|
||||
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
|
||||
if [ -z "${BOT_PASS_SMART}" ]; then
|
||||
export BOT_PASS_SMART="${BOT_PASS}"
|
||||
fi
|
||||
if [ -z "${BOT_PASS_GENIUS}" ]; then
|
||||
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
|
||||
fi
|
||||
export SEEDER_PASS="$(read_secret seeder-pass)"
|
||||
|
||||
export CHAT_API_KEY="$(read_secret chat-matrix)"
|
||||
export CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"
|
||||
|
||||
export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"
|
||||
export PGPASSWORD="$(read_secret synapse-db-pass)"
|
||||
|
||||
export MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
|
||||
export MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
|
||||
export KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"
|
||||
21
services/atlasbot/secretproviderclass.yaml
Normal file
21
services/atlasbot/secretproviderclass.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
# services/atlasbot/secretproviderclass.yaml
|
||||
apiVersion: secrets-store.csi.x-k8s.io/v1
|
||||
kind: SecretProviderClass
|
||||
metadata:
|
||||
name: atlasbot-vault
|
||||
namespace: ai
|
||||
spec:
|
||||
provider: vault
|
||||
parameters:
|
||||
vaultAddress: "http://vault.vault.svc.cluster.local:8200"
|
||||
roleName: "ai"
|
||||
objects: |
|
||||
- objectName: "harbor-pull__dockerconfigjson"
|
||||
secretPath: "kv/data/atlas/shared/harbor-pull"
|
||||
secretKey: "dockerconfigjson"
|
||||
secretObjects:
|
||||
- secretName: harbor-regcred
|
||||
type: kubernetes.io/dockerconfigjson
|
||||
data:
|
||||
- objectName: harbor-pull__dockerconfigjson
|
||||
key: .dockerconfigjson
|
||||
34
services/atlasbot/vault-sync-deployment.yaml
Normal file
34
services/atlasbot/vault-sync-deployment.yaml
Normal file
@ -0,0 +1,34 @@
|
||||
# services/atlasbot/vault-sync-deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: atlasbot-vault-sync
|
||||
namespace: ai
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: atlasbot-vault-sync
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: atlasbot-vault-sync
|
||||
spec:
|
||||
serviceAccountName: atlasbot
|
||||
containers:
|
||||
- name: sync
|
||||
image: alpine:3.20
|
||||
command: ["/bin/sh", "-c"]
|
||||
args:
|
||||
- "sleep infinity"
|
||||
volumeMounts:
|
||||
- name: vault-secrets
|
||||
mountPath: /vault/secrets
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: vault-secrets
|
||||
csi:
|
||||
driver: secrets-store.csi.k8s.io
|
||||
readOnly: true
|
||||
volumeAttributes:
|
||||
secretProviderClass: atlasbot-vault
|
||||
@ -68,7 +68,13 @@ spec:
|
||||
- name: AI_CHAT_TIMEOUT_SEC
|
||||
value: "480"
|
||||
- name: AI_ATLASBOT_ENDPOINT
|
||||
value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
|
||||
value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer
|
||||
- name: AI_ATLASBOT_MODEL_FAST
|
||||
value: qwen2.5-coder:7b-instruct-q4_0
|
||||
- name: AI_ATLASBOT_MODEL_SMART
|
||||
value: qwen2.5:14b-instruct
|
||||
- name: AI_ATLASBOT_MODEL_GENIUS
|
||||
value: qwen2.5:14b-instruct
|
||||
- name: AI_ATLASBOT_TIMEOUT_SEC
|
||||
value: "30"
|
||||
- name: AI_NODE_NAME
|
||||
|
||||
@ -20,9 +20,9 @@ resources:
|
||||
- ingress.yaml
|
||||
images:
|
||||
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
|
||||
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
|
||||
newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
|
||||
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
|
||||
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
|
||||
newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
|
||||
configMapGenerator:
|
||||
- name: chat-ai-gateway
|
||||
namespace: bstein-dev-home
|
||||
|
||||
@ -13,10 +13,7 @@ resources:
|
||||
- element-call-deployment.yaml
|
||||
- guest-register-deployment.yaml
|
||||
- guest-register-service.yaml
|
||||
- atlasbot-deployment.yaml
|
||||
- atlasbot-service.yaml
|
||||
- wellknown.yaml
|
||||
- atlasbot-rbac.yaml
|
||||
- mas-secrets-ensure-rbac.yaml
|
||||
- comms-secrets-ensure-rbac.yaml
|
||||
- mas-db-ensure-rbac.yaml
|
||||
@ -43,7 +40,6 @@ resources:
|
||||
- livekit-ingress.yaml
|
||||
- livekit-middlewares.yaml
|
||||
- matrix-ingress.yaml
|
||||
|
||||
configMapGenerator:
|
||||
- name: comms-vault-env
|
||||
files:
|
||||
@ -60,21 +56,8 @@ configMapGenerator:
|
||||
- server.py=scripts/guest-register/server.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: atlasbot
|
||||
files:
|
||||
- bot.py=scripts/atlasbot/bot.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: othrys-element-host-config
|
||||
files:
|
||||
- 20-host-config.sh=scripts/element-host-config.sh
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: atlas-kb
|
||||
files:
|
||||
- INDEX.md=knowledge/INDEX.md
|
||||
- atlas.json=knowledge/catalog/atlas.json
|
||||
- atlas-summary.json=knowledge/catalog/atlas-summary.json
|
||||
- metrics.json=knowledge/catalog/metrics.json
|
||||
- runbooks.json=knowledge/catalog/runbooks.json
|
||||
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd
|
||||
|
||||
@ -7,6 +7,7 @@ metadata:
|
||||
kubernetes.io/ingress.class: traefik
|
||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
||||
traefik.ingress.kubernetes.io/router.priority: "120"
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
spec:
|
||||
ingressClassName: traefik
|
||||
@ -43,6 +44,13 @@ spec:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /_matrix/client/r0/login
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /_matrix/client/v3/logout
|
||||
pathType: Exact
|
||||
backend:
|
||||
@ -57,6 +65,41 @@ spec:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /account
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /authorize
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /oauth2
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /.well-known/openid-configuration
|
||||
pathType: Exact
|
||||
backend:
|
||||
service:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /.well-known/oauth-authorization-server
|
||||
pathType: Exact
|
||||
backend:
|
||||
service:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /_matrix
|
||||
pathType: Prefix
|
||||
backend:
|
||||
@ -102,6 +145,13 @@ spec:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /_matrix/client/r0/login
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: matrix-authentication-service
|
||||
port:
|
||||
number: 8080
|
||||
- path: /_matrix/client/v3/logout
|
||||
pathType: Exact
|
||||
backend:
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
# services/comms/oneoffs/comms-secrets-ensure-job.yaml
|
||||
# One-off job for comms/comms-secrets-ensure-7.
|
||||
# Purpose: comms secrets ensure 7 (see container args/env in this file).
|
||||
# One-off job for comms/comms-secrets-ensure-8.
|
||||
# Purpose: comms secrets ensure 8 (see container args/env in this file).
|
||||
# Run by setting spec.suspend to false, reconcile, then set it back to true.
|
||||
# Safe to delete the finished Job/pod; it should not run continuously.
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: comms-secrets-ensure-7
|
||||
name: comms-secrets-ensure-8
|
||||
namespace: comms
|
||||
spec:
|
||||
suspend: true
|
||||
@ -87,6 +87,9 @@ spec:
|
||||
ensure_key "comms/synapse-redis" "redis-password" >/dev/null
|
||||
ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null
|
||||
ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null
|
||||
ensure_key "comms/atlasbot-credentials-runtime" "bot-quick-password" >/dev/null
|
||||
ensure_key "comms/atlasbot-credentials-runtime" "bot-smart-password" >/dev/null
|
||||
ensure_key "comms/atlasbot-credentials-runtime" "bot-genius-password" >/dev/null
|
||||
ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null
|
||||
|
||||
SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")"
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
# services/comms/oneoffs/mas-local-users-ensure-job.yaml
|
||||
# One-off job for comms/mas-local-users-ensure-18.
|
||||
# One-off job for comms/mas-local-users-ensure-19.
|
||||
# Purpose: mas local users ensure 19 (see container args/env in this file).
|
||||
# Run by setting spec.suspend to false, reconcile, then set it back to true.
|
||||
# Safe to delete the finished Job/pod; it should not run continuously.
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: mas-local-users-ensure-18
|
||||
name: mas-local-users-ensure-19
|
||||
namespace: comms
|
||||
spec:
|
||||
suspend: true
|
||||
@ -27,6 +27,12 @@ spec:
|
||||
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-bot-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
|
||||
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
|
||||
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
|
||||
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
|
||||
vault.hashicorp.com/agent-inject-template-seeder-pass: |
|
||||
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
|
||||
@ -92,7 +98,13 @@ spec:
|
||||
- name: SEEDER_USER
|
||||
value: othrys-seeder
|
||||
- name: BOT_USER
|
||||
value: atlasbot
|
||||
value: atlas-smart
|
||||
- name: BOT_USER_QUICK
|
||||
value: atlas-quick
|
||||
- name: BOT_USER_SMART
|
||||
value: atlas-smart
|
||||
- name: BOT_USER_GENIUS
|
||||
value: atlas-genius
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
@ -225,11 +237,27 @@ spec:
|
||||
},
|
||||
timeout=30,
|
||||
)
|
||||
if r.status_code == 429:
|
||||
return False
|
||||
if r.status_code != 200:
|
||||
raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
|
||||
return True
|
||||
|
||||
wait_for_service(MAS_ADMIN_API_BASE)
|
||||
token = admin_token()
|
||||
bot_quick = os.environ.get("BOT_USER_QUICK", "")
|
||||
bot_smart = os.environ.get("BOT_USER_SMART", "")
|
||||
bot_genius = os.environ.get("BOT_USER_GENIUS", "")
|
||||
bot_quick_pass = os.environ.get("BOT_PASS_QUICK", "")
|
||||
bot_smart_pass = os.environ.get("BOT_PASS_SMART", "")
|
||||
bot_genius_pass = os.environ.get("BOT_PASS_GENIUS", "") or bot_smart_pass
|
||||
|
||||
ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
|
||||
ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
|
||||
if bot_quick and bot_quick_pass:
|
||||
ensure_user(token, bot_quick, bot_quick_pass)
|
||||
if bot_smart and bot_smart_pass:
|
||||
ensure_user(token, bot_smart, bot_smart_pass)
|
||||
if bot_genius and bot_genius_pass:
|
||||
ensure_user(token, bot_genius, bot_genius_pass)
|
||||
PY
|
||||
|
||||
@ -1,15 +1,15 @@
|
||||
# services/comms/oneoffs/synapse-admin-ensure-job.yaml
|
||||
# One-off job for comms/synapse-admin-ensure-3.
|
||||
# Purpose: synapse admin ensure 3 (see container args/env in this file).
|
||||
# One-off job for comms/synapse-admin-ensure-15.
|
||||
# Purpose: synapse admin ensure 15 (see container args/env in this file).
|
||||
# Run by setting spec.suspend to false, reconcile, then set it back to true.
|
||||
# Safe to delete the finished Job/pod; it should not run continuously.
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: synapse-admin-ensure-3
|
||||
name: synapse-admin-ensure-15
|
||||
namespace: comms
|
||||
spec:
|
||||
suspend: true
|
||||
suspend: false
|
||||
backoffLimit: 0
|
||||
ttlSecondsAfterFinished: 3600
|
||||
template:
|
||||
@ -32,7 +32,8 @@ spec:
|
||||
values: ["arm64"]
|
||||
containers:
|
||||
- name: ensure
|
||||
image: python:3.11-slim
|
||||
image: python:3.12-slim
|
||||
imagePullPolicy: Always
|
||||
env:
|
||||
- name: VAULT_ADDR
|
||||
value: http://vault.vault.svc.cluster.local:8200
|
||||
@ -45,22 +46,20 @@ spec:
|
||||
- -c
|
||||
- |
|
||||
set -euo pipefail
|
||||
pip install --no-cache-dir psycopg2-binary bcrypt
|
||||
python -m pip install --no-cache-dir psycopg2-binary
|
||||
python - <<'PY'
|
||||
import json
|
||||
import os
|
||||
import secrets
|
||||
import string
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
import bcrypt
|
||||
import psycopg2
|
||||
|
||||
VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
|
||||
VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
|
||||
SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||
SYNAPSE_ADMIN_URL = os.environ.get("SYNAPSE_ADMIN_URL", "").rstrip("/")
|
||||
PGHOST = "postgres-service.postgres.svc.cluster.local"
|
||||
PGPORT = 5432
|
||||
PGDATABASE = "synapse"
|
||||
@ -113,48 +112,15 @@ spec:
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
resp.read()
|
||||
|
||||
def random_password(length: int = 32) -> str:
|
||||
alphabet = string.ascii_letters + string.digits
|
||||
return "".join(secrets.choice(alphabet) for _ in range(length))
|
||||
|
||||
def ensure_admin_creds(token: str) -> dict:
|
||||
data = vault_get(token, "comms/synapse-admin")
|
||||
username = (data.get("username") or "").strip() or "synapse-admin"
|
||||
password = (data.get("password") or "").strip()
|
||||
if not password:
|
||||
password = random_password()
|
||||
username = "othrys-seeder"
|
||||
if data.get("username") != username:
|
||||
data["username"] = username
|
||||
data["password"] = password
|
||||
data.pop("access_token", None)
|
||||
vault_put(token, "comms/synapse-admin", data)
|
||||
return data
|
||||
|
||||
def ensure_user(cur, cols, user_id, password, admin):
|
||||
now_ms = int(time.time() * 1000)
|
||||
values = {
|
||||
"name": user_id,
|
||||
"password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
|
||||
"creation_ts": now_ms,
|
||||
}
|
||||
|
||||
def add_flag(name, flag):
|
||||
if name not in cols:
|
||||
return
|
||||
if cols[name]["type"] in ("smallint", "integer"):
|
||||
values[name] = int(flag)
|
||||
else:
|
||||
values[name] = bool(flag)
|
||||
|
||||
add_flag("admin", admin)
|
||||
add_flag("deactivated", False)
|
||||
add_flag("shadow_banned", False)
|
||||
add_flag("is_guest", False)
|
||||
|
||||
columns = list(values.keys())
|
||||
placeholders = ", ".join(["%s"] * len(columns))
|
||||
updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
|
||||
query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
|
||||
cur.execute(query, [values[c] for c in columns])
|
||||
|
||||
def get_cols(cur):
|
||||
cur.execute(
|
||||
"""
|
||||
@ -172,30 +138,40 @@ spec:
|
||||
}
|
||||
return cols
|
||||
|
||||
def ensure_access_token(cur, user_id, token_value):
|
||||
cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens")
|
||||
token_id = cur.fetchone()[0]
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms)
|
||||
VALUES (%s, %s, %s, %s, NULL)
|
||||
ON CONFLICT (token) DO NOTHING
|
||||
""",
|
||||
(token_id, user_id, token_value, "ariadne-admin"),
|
||||
)
|
||||
def admin_token_valid(token: str, user_id: str) -> bool:
    """Probe the Synapse admin API to see whether `token` is still accepted.

    Sends a GET for `user_id` to the v2 users endpoint under SYNAPSE_ADMIN_URL
    with the token as a Bearer credential.

    Returns:
        False when the token or SYNAPSE_ADMIN_URL is empty, or when the server
        answers 401/403. True when the request succeeds or answers 404 (the
        request was not rejected as unauthorized; presumably the user simply
        does not exist yet - confirm against the Synapse admin API docs).

    Raises:
        urllib.error.HTTPError: for any other HTTP status.
    """
    if not (token and SYNAPSE_ADMIN_URL):
        return False
    encoded = urllib.parse.quote(user_id, safe="")
    probe = urllib.request.Request(
        f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v2/users/{encoded}",
        headers={"Authorization": f"Bearer {token}"},
    )
    try:
        with urllib.request.urlopen(probe, timeout=30) as resp:
            resp.read()
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return True
        if exc.code in (401, 403):
            return False
        raise
    return True
|
||||
|
||||
vault_token = vault_login()
|
||||
admin_data = ensure_admin_creds(vault_token)
|
||||
if admin_data.get("access_token"):
|
||||
log("synapse admin token already present")
|
||||
user_id = f"@{admin_data['username']}:live.bstein.dev"
|
||||
existing_token = admin_data.get("access_token")
|
||||
if existing_token and admin_token_valid(existing_token, user_id):
|
||||
log("synapse admin token already present and valid")
|
||||
raise SystemExit(0)
|
||||
if existing_token:
|
||||
log("synapse admin token invalid; rotating")
|
||||
admin_data.pop("access_token", None)
|
||||
vault_put(vault_token, "comms/synapse-admin", admin_data)
|
||||
|
||||
synapse_db = vault_get(vault_token, "comms/synapse-db")
|
||||
pg_password = synapse_db.get("POSTGRES_PASSWORD")
|
||||
if not pg_password:
|
||||
raise RuntimeError("synapse db password missing")
|
||||
|
||||
user_id = f"@{admin_data['username']}:live.bstein.dev"
|
||||
conn = psycopg2.connect(
|
||||
host=PGHOST,
|
||||
port=PGPORT,
|
||||
@ -203,17 +179,34 @@ spec:
|
||||
user=PGUSER,
|
||||
password=pg_password,
|
||||
)
|
||||
token_value = secrets.token_urlsafe(32)
|
||||
try:
|
||||
with conn:
|
||||
with conn.cursor() as cur:
|
||||
cols = get_cols(cur)
|
||||
ensure_user(cur, cols, user_id, admin_data["password"], True)
|
||||
ensure_access_token(cur, user_id, token_value)
|
||||
if "admin" not in cols:
|
||||
raise RuntimeError("users.admin column missing")
|
||||
cur.execute(
|
||||
"UPDATE users SET admin = TRUE WHERE name = %s",
|
||||
(user_id,),
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT token FROM access_tokens
|
||||
WHERE user_id = %s AND valid_until_ms IS NULL
|
||||
ORDER BY id DESC LIMIT 1
|
||||
""",
|
||||
(user_id,),
|
||||
)
|
||||
row = cur.fetchone()
|
||||
if not row:
|
||||
raise RuntimeError(f"no access token found for {user_id}")
|
||||
token_value = row[0]
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
admin_data["access_token"] = token_value
|
||||
vault_put(vault_token, "comms/synapse-admin", admin_data)
|
||||
if not admin_token_valid(token_value, user_id):
|
||||
raise RuntimeError("synapse admin token validation failed")
|
||||
log("synapse admin token stored")
|
||||
PY
|
||||
|
||||
@ -82,8 +82,6 @@ spec:
|
||||
value: synapse
|
||||
- name: SEEDER_USER
|
||||
value: othrys-seeder
|
||||
- name: BOT_USER
|
||||
value: atlasbot
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
@ -141,10 +139,8 @@ spec:
|
||||
cur.execute(query, [values[c] for c in columns])
|
||||
|
||||
seeder_user = os.environ["SEEDER_USER"]
|
||||
bot_user = os.environ["BOT_USER"]
|
||||
server = "live.bstein.dev"
|
||||
seeder_id = f"@{seeder_user}:{server}"
|
||||
bot_id = f"@{bot_user}:{server}"
|
||||
|
||||
conn = psycopg2.connect(
|
||||
host=os.environ["PGHOST"],
|
||||
@ -158,7 +154,6 @@ spec:
|
||||
with conn.cursor() as cur:
|
||||
cols = get_cols(cur)
|
||||
upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True)
|
||||
upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False)
|
||||
finally:
|
||||
conn.close()
|
||||
PY
|
||||
|
||||
@ -76,7 +76,7 @@ spec:
|
||||
- name: SEEDER_USER
|
||||
value: othrys-seeder
|
||||
- name: BOT_USER
|
||||
value: atlasbot
|
||||
value: atlas-smart
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
|
||||
@ -11,14 +11,21 @@ from urllib import error, parse, request
|
||||
|
||||
BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
|
||||
AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
|
||||
USER = os.environ["BOT_USER"]
|
||||
PASSWORD = os.environ["BOT_PASS"]
|
||||
BOT_USER = os.environ["BOT_USER"]
|
||||
BOT_PASS = os.environ["BOT_PASS"]
|
||||
BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
|
||||
BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
|
||||
BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
|
||||
BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
|
||||
BOT_USER_GENIUS = os.environ.get("BOT_USER_GENIUS", "").strip()
|
||||
BOT_PASS_GENIUS = os.environ.get("BOT_PASS_GENIUS", "").strip()
|
||||
ROOM_ALIAS = "#othrys:live.bstein.dev"
|
||||
|
||||
OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
|
||||
MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct")
|
||||
MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "")
|
||||
MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "")
|
||||
MODEL_SMART = os.environ.get("ATLASBOT_MODEL_SMART", os.environ.get("ATLASBOT_MODEL_DEEP", "")).strip()
|
||||
MODEL_GENIUS = os.environ.get("ATLASBOT_MODEL_GENIUS", MODEL_SMART).strip()
|
||||
FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "")
|
||||
API_KEY = os.environ.get("CHAT_API_KEY", "")
|
||||
OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
|
||||
@ -31,7 +38,7 @@ VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitor
|
||||
ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
|
||||
ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")
|
||||
|
||||
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
|
||||
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
|
||||
SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
|
||||
|
||||
MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
|
||||
@ -39,6 +46,9 @@ MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
|
||||
MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
|
||||
MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000"))
|
||||
THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
|
||||
QUICK_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_QUICK_TIME_BUDGET_SEC", "15"))
|
||||
SMART_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_SMART_TIME_BUDGET_SEC", "45"))
|
||||
GENIUS_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_GENIUS_TIME_BUDGET_SEC", "180"))
|
||||
OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
|
||||
OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"
|
||||
|
||||
@ -380,27 +390,104 @@ def _strip_bot_mention(text: str) -> str:
|
||||
return cleaned or text.strip()
|
||||
|
||||
|
||||
def _detect_mode_from_body(body: str, *, default: str = "smart") -> str:
    """Infer the reply mode from explicit hints in a message body.

    The body is passed through normalize_query() first. An account-style name
    ("atlas_quick"/"atlas-quick", "atlas_smart"/"atlas-smart",
    "atlas_genius"/"atlas-genius") anywhere in the text is checked before the
    leading keywords ("quick "/"fast ", "smart ", "genius "/"deep ").

    Args:
        body: Raw message text; None or empty is treated as "".
        default: Mode returned when no hint is found.

    Returns:
        "fast", "smart", "genius", or `default`.
    """
    lower = normalize_query(body or "")
    named_accounts = (
        ("fast", ("atlas_quick", "atlas-quick")),
        ("smart", ("atlas_smart", "atlas-smart")),
        ("genius", ("atlas_genius", "atlas-genius")),
    )
    for mode, names in named_accounts:
        if any(name in lower for name in names):
            return mode
    if lower.startswith(("quick ", "fast ")):
        return "fast"
    if lower.startswith("smart "):
        return "smart"
    # The legacy "deep" prefix is served by the genius tier.
    if lower.startswith(("genius ", "deep ")):
        return "genius"
    return default
|
||||
|
||||
|
||||
def _detect_mode(
    content: dict[str, Any],
    body: str,
    *,
    default: str = "smart",
    account_user: str = "",
) -> str:
    """Pick the reply mode for an event from its mentions, receiving account, and body.

    Precedence, first match wins:
      1. A bot account listed in content["m.mentions"]["user_ids"]
         (quick -> "fast", smart -> "smart", genius -> "genius",
         BOT_USER -> "smart").
      2. The account that received the event (`account_user`), when it is one
         of the quick/smart/genius accounts.
      3. Hints in the body, via _detect_mode_from_body(), else `default`.

    Args:
        content: Event content mapping.
        body: Message text.
        default: Mode used when nothing else matches.
        account_user: User name of the account whose sync loop saw the event.

    Returns:
        "fast", "smart", "genius", or `default`.
    """

    def _key(user: str) -> str:
        # Same normalisation for mentions and accounts so they compare equal.
        return normalize_user_id(user).lower()

    mode = _detect_mode_from_body(body, default=default)
    tiers = (
        (BOT_USER_QUICK, "fast"),
        (BOT_USER_SMART, "smart"),
        (BOT_USER_GENIUS, "genius"),
    )

    # Event content comes from other users; a non-dict "m.mentions" must not
    # crash the sync loop.
    mentions = content.get("m.mentions", {})
    user_ids = mentions.get("user_ids", []) if isinstance(mentions, dict) else []
    if isinstance(user_ids, list):
        mentioned = {_key(uid) for uid in user_ids if isinstance(uid, str)}
        for bot, bot_mode in tiers + ((BOT_USER, "smart"),):
            if bot and _key(bot) in mentioned:
                return bot_mode

    if account_user:
        own = _key(account_user)
        for bot, bot_mode in tiers:
            if bot and own == _key(bot):
                return bot_mode
    return mode
|
||||
|
||||
|
||||
def _model_for_mode(mode: str) -> str:
    """Return the model name configured for `mode`.

    "fast" uses MODEL_FAST, "smart" uses MODEL_SMART, "genius" uses
    MODEL_GENIUS, and the legacy "deep" name uses MODEL_SMART. When the
    selected setting is empty, or the mode is not recognised, MODEL is
    returned instead.
    """
    configured = {
        "fast": MODEL_FAST,
        "smart": MODEL_SMART,
        "genius": MODEL_GENIUS,
        "deep": MODEL_SMART,
    }
    return configured.get(mode) or MODEL
|
||||
|
||||
|
||||
def _normalize_mode(mode: str) -> str:
|
||||
normalized = (mode or "").strip().lower()
|
||||
if normalized in {"quick", "fast"}:
|
||||
return "fast"
|
||||
if normalized in {"smart"}:
|
||||
return "smart"
|
||||
if normalized in {"genius", "deep"}:
|
||||
return "genius"
|
||||
return "smart"
|
||||
|
||||
|
||||
def _mode_time_budget_sec(mode: str) -> float:
    """Return the time budget in seconds for `mode`, never below 1.0.

    The mode is normalised first; "fast" reads QUICK_TIME_BUDGET_SEC, "genius"
    reads GENIUS_TIME_BUDGET_SEC, and everything else reads
    SMART_TIME_BUDGET_SEC.
    """
    normalized = _normalize_mode(mode)
    if normalized == "fast":
        budget = QUICK_TIME_BUDGET_SEC
    elif normalized == "genius":
        budget = GENIUS_TIME_BUDGET_SEC
    else:
        budget = SMART_TIME_BUDGET_SEC
    return max(1.0, budget)
|
||||
|
||||
|
||||
def _mode_ollama_timeout_sec(mode: str) -> float:
    """Return the per-request model timeout in seconds for `mode`.

    The timeout is the mode's time budget minus a margin, capped at
    OLLAMA_TIMEOUT_SEC and then raised to a per-mode floor:

        fast:   floor 6s,  margin 2s
        smart:  floor 12s, margin 5s (also used for any other mode)
        genius: floor 20s, margin 10s

    Because the floor is applied last, the result can exceed both the budget
    and OLLAMA_TIMEOUT_SEC when those are configured very low.
    """
    normalized = _normalize_mode(mode)
    budget = _mode_time_budget_sec(normalized)
    limits = {"fast": (6.0, 2.0), "genius": (20.0, 10.0)}
    floor, margin = limits.get(normalized, (12.0, 5.0))
    return max(floor, min(budget - margin, OLLAMA_TIMEOUT_SEC))
|
||||
|
||||
|
||||
def _mode_heartbeat_sec(mode: str) -> int:
    """Return the heartbeat interval in seconds for `mode`.

    The interval is a third of the mode's time budget (at least 5 seconds,
    truncated to an int), capped at THINKING_INTERVAL_SEC and never below 5.
    """
    budget = _mode_time_budget_sec(_normalize_mode(mode))
    third_of_budget = int(max(5.0, budget / 3.0))
    capped = min(THINKING_INTERVAL_SEC, third_of_budget)
    return max(5, capped)
|
||||
|
||||
|
||||
# Matrix HTTP helper.
|
||||
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
|
||||
url = (base or BASE) + path
|
||||
@ -416,12 +503,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
|
||||
raw = resp.read()
|
||||
return json.loads(raw.decode()) if raw else {}
|
||||
|
||||
def login(user: str, password: str) -> str:
    """Log `user` in with a password and return the access token.

    POSTs an "m.login.password" request to /_matrix/client/v3/login on
    AUTH_BASE, identifying the account by normalize_user_id(user).

    Raises:
        KeyError: if the response has no "access_token" field.
        Any exception raised by req() propagates unchanged.
    """
    payload = {
        "type": "m.login.password",
        "identifier": {"type": "m.id.user", "user": normalize_user_id(user)},
        "password": password,
    }
    res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
    return res["access_token"]
|
||||
@ -2628,6 +2715,11 @@ def _append_history_context(context: str, history_lines: list[str]) -> str:
|
||||
return combined
|
||||
|
||||
|
||||
def _merge_context_blocks(*blocks: str) -> str:
|
||||
parts = [block.strip() for block in blocks if isinstance(block, str) and block.strip()]
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
class ThoughtState:
|
||||
def __init__(self, total_steps: int = 0):
|
||||
self._lock = threading.Lock()
|
||||
@ -2985,6 +3077,7 @@ def _ollama_call_safe(
|
||||
fallback: str,
|
||||
system_override: str | None = None,
|
||||
model: str | None = None,
|
||||
timeout: float | None = None,
|
||||
) -> str:
|
||||
try:
|
||||
return _ollama_call(
|
||||
@ -2994,6 +3087,7 @@ def _ollama_call_safe(
|
||||
use_history=False,
|
||||
system_override=system_override,
|
||||
model=model,
|
||||
timeout=timeout,
|
||||
)
|
||||
except Exception:
|
||||
return fallback
|
||||
@ -3813,9 +3907,12 @@ def _open_ended_multi(
|
||||
|
||||
|
||||
def _open_ended_total_steps(mode: str) -> int:
    """Return the number of progress steps reported for an open-ended answer.

    The mode is normalised first: "fast" uses 2 steps, "smart" uses 3, and
    anything else (i.e. "genius") uses 4.
    """
    steps_by_mode = {"fast": 2, "smart": 3}
    return steps_by_mode.get(_normalize_mode(mode), 4)
|
||||
|
||||
|
||||
def _fast_fact_lines(
|
||||
@ -4136,6 +4233,7 @@ def _open_ended_fast_single(
|
||||
prompt: str,
|
||||
*,
|
||||
context: str,
|
||||
fallback_context: str | None = None,
|
||||
history_lines: list[str] | None = None,
|
||||
state: ThoughtState | None = None,
|
||||
model: str,
|
||||
@ -4143,24 +4241,26 @@ def _open_ended_fast_single(
|
||||
if state:
|
||||
state.update("drafting", step=1, note="summarizing")
|
||||
working_context = _append_history_context(context, history_lines or []) if history_lines else context
|
||||
reply = _ollama_call(
|
||||
reply = _ollama_call_safe(
|
||||
("atlasbot_fast", "atlasbot_fast"),
|
||||
prompt,
|
||||
context=working_context,
|
||||
use_history=False,
|
||||
fallback="",
|
||||
system_override=_open_ended_system(),
|
||||
model=model,
|
||||
timeout=_mode_ollama_timeout_sec("fast"),
|
||||
)
|
||||
if not _has_body_lines(reply):
|
||||
reply = _ollama_call(
|
||||
reply = _ollama_call_safe(
|
||||
("atlasbot_fast", "atlasbot_fast"),
|
||||
prompt + " Provide one clear sentence before the score lines.",
|
||||
context=working_context,
|
||||
use_history=False,
|
||||
fallback="",
|
||||
system_override=_open_ended_system(),
|
||||
model=model,
|
||||
timeout=_mode_ollama_timeout_sec("fast"),
|
||||
)
|
||||
fallback = _fallback_fact_answer(prompt, context)
|
||||
fallback = _fallback_fact_answer(prompt, fallback_context or context)
|
||||
if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
|
||||
reply = fallback
|
||||
if not _has_body_lines(reply):
|
||||
@ -4177,6 +4277,7 @@ def _open_ended_fast(
|
||||
fact_lines: list[str],
|
||||
fact_meta: dict[str, dict[str, Any]],
|
||||
history_lines: list[str],
|
||||
extra_context: str = "",
|
||||
state: ThoughtState | None = None,
|
||||
) -> str:
|
||||
model = _model_for_mode("fast")
|
||||
@ -4197,6 +4298,7 @@ def _open_ended_fast(
|
||||
selected_pack = _fact_pack_text(selected_lines, selected_meta)
|
||||
if _needs_full_fact_pack(prompt) or not selected_lines:
|
||||
selected_pack = fact_pack
|
||||
model_context = _merge_context_blocks(selected_pack, extra_context)
|
||||
if not subjective and _needs_full_fact_pack(prompt):
|
||||
fallback = _fallback_fact_answer(prompt, fact_pack)
|
||||
if fallback:
|
||||
@ -4205,7 +4307,8 @@ def _open_ended_fast(
|
||||
state.total_steps = _open_ended_total_steps("fast")
|
||||
return _open_ended_fast_single(
|
||||
prompt,
|
||||
context=selected_pack,
|
||||
context=model_context,
|
||||
fallback_context=selected_pack,
|
||||
history_lines=history_lines,
|
||||
state=state,
|
||||
model=model,
|
||||
@ -4219,16 +4322,55 @@ def _open_ended_deep(
|
||||
fact_lines: list[str],
|
||||
fact_meta: dict[str, dict[str, Any]],
|
||||
history_lines: list[str],
|
||||
mode: str,
|
||||
extra_context: str = "",
|
||||
state: ThoughtState | None = None,
|
||||
) -> str:
|
||||
return _open_ended_multi(
|
||||
prompt,
|
||||
fact_pack=fact_pack,
|
||||
fact_lines=fact_lines,
|
||||
fact_meta=fact_meta,
|
||||
history_lines=history_lines,
|
||||
state=state,
|
||||
normalized = _normalize_mode(mode)
|
||||
model = _model_for_mode(normalized)
|
||||
subjective = _is_subjective_query(prompt)
|
||||
primary_tags = _primary_tags_for_prompt(prompt)
|
||||
focus_tags = _preferred_tags_for_prompt(prompt)
|
||||
if not focus_tags and subjective:
|
||||
focus_tags = set(_ALLOWED_INSIGHT_TAGS)
|
||||
avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set()
|
||||
limit = 12 if normalized == "smart" else 18
|
||||
selected_lines = _fast_fact_lines(
|
||||
fact_lines,
|
||||
fact_meta,
|
||||
focus_tags=focus_tags,
|
||||
avoid_tags=avoid_tags,
|
||||
primary_tags=primary_tags,
|
||||
limit=limit,
|
||||
)
|
||||
selected_meta = _fact_pack_meta(selected_lines)
|
||||
selected_pack = _fact_pack_text(selected_lines, selected_meta)
|
||||
if _needs_full_fact_pack(prompt) or not selected_lines or normalized == "genius":
|
||||
selected_pack = fact_pack
|
||||
fallback = _fallback_fact_answer(prompt, selected_pack)
|
||||
model_context = _merge_context_blocks(selected_pack, extra_context)
|
||||
if not subjective and fallback:
|
||||
if state:
|
||||
state.update("done", step=_open_ended_total_steps(normalized))
|
||||
return _ensure_scores(fallback)
|
||||
if state:
|
||||
state.update("drafting", step=1, note="synthesizing")
|
||||
reply = _ollama_call_safe(
|
||||
("atlasbot_deep", "atlasbot_deep"),
|
||||
prompt,
|
||||
context=_append_history_context(model_context, history_lines),
|
||||
fallback="",
|
||||
system_override=_open_ended_system(),
|
||||
model=model,
|
||||
timeout=_mode_ollama_timeout_sec(normalized),
|
||||
)
|
||||
if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
|
||||
reply = fallback
|
||||
if not _has_body_lines(reply):
|
||||
reply = "I don't have enough data in the current snapshot to answer that."
|
||||
if state:
|
||||
state.update("done", step=_open_ended_total_steps(normalized))
|
||||
return _ensure_scores(reply)
|
||||
|
||||
|
||||
def open_ended_answer(
|
||||
@ -4240,6 +4382,7 @@ def open_ended_answer(
|
||||
history_lines: list[str],
|
||||
mode: str,
|
||||
allow_tools: bool,
|
||||
context: str = "",
|
||||
state: ThoughtState | None = None,
|
||||
) -> str:
|
||||
lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
|
||||
@ -4256,13 +4399,15 @@ def open_ended_answer(
|
||||
return _ensure_scores("I don't have enough data to answer that.")
|
||||
fact_meta = _fact_pack_meta(lines)
|
||||
fact_pack = _fact_pack_text(lines, fact_meta)
|
||||
if mode == "fast":
|
||||
normalized = _normalize_mode(mode)
|
||||
if normalized == "fast":
|
||||
return _open_ended_fast(
|
||||
prompt,
|
||||
fact_pack=fact_pack,
|
||||
fact_lines=lines,
|
||||
fact_meta=fact_meta,
|
||||
history_lines=history_lines,
|
||||
extra_context=context,
|
||||
state=state,
|
||||
)
|
||||
return _open_ended_deep(
|
||||
@ -4271,6 +4416,8 @@ def open_ended_answer(
|
||||
fact_lines=lines,
|
||||
fact_meta=fact_meta,
|
||||
history_lines=history_lines,
|
||||
extra_context=context,
|
||||
mode=normalized,
|
||||
state=state,
|
||||
)
|
||||
|
||||
@ -4292,6 +4439,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s
|
||||
use_history=False,
|
||||
system_override=system,
|
||||
model=model,
|
||||
timeout=_mode_ollama_timeout_sec(mode),
|
||||
)
|
||||
reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip()
|
||||
return _ensure_scores(reply)
|
||||
@ -4343,13 +4491,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
|
||||
self._write_json(400, {"error": "missing_prompt"})
|
||||
return
|
||||
cleaned = _strip_bot_mention(prompt)
|
||||
mode = str(payload.get("mode") or "deep").lower()
|
||||
if mode in ("quick", "fast"):
|
||||
mode = "fast"
|
||||
elif mode in ("smart", "deep"):
|
||||
mode = "deep"
|
||||
else:
|
||||
mode = "deep"
|
||||
mode = _normalize_mode(str(payload.get("mode") or "smart"))
|
||||
snapshot = _snapshot_state()
|
||||
inventory = _snapshot_inventory(snapshot) or node_inventory_live()
|
||||
workloads = _snapshot_workloads(snapshot)
|
||||
@ -4386,6 +4528,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
|
||||
history_lines=history_lines,
|
||||
mode=mode,
|
||||
allow_tools=True,
|
||||
context=context,
|
||||
state=None,
|
||||
)
|
||||
else:
|
||||
@ -4640,6 +4783,7 @@ def _ollama_call(
|
||||
use_history: bool = True,
|
||||
system_override: str | None = None,
|
||||
model: str | None = None,
|
||||
timeout: float | None = None,
|
||||
) -> str:
|
||||
system = system_override or (
|
||||
"System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
|
||||
@ -4673,6 +4817,7 @@ def _ollama_call(
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
model_name = model or MODEL
|
||||
request_timeout = timeout if timeout is not None else OLLAMA_TIMEOUT_SEC
|
||||
payload = {"model": model_name, "messages": messages, "stream": False}
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if API_KEY:
|
||||
@ -4683,13 +4828,13 @@ def _ollama_call(
|
||||
lock.acquire()
|
||||
try:
|
||||
try:
|
||||
with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
|
||||
with request.urlopen(r, timeout=request_timeout) as resp:
|
||||
data = json.loads(resp.read().decode())
|
||||
except error.HTTPError as exc:
|
||||
if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]:
|
||||
payload["model"] = FALLBACK_MODEL
|
||||
r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
|
||||
with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
|
||||
with request.urlopen(r, timeout=request_timeout) as resp:
|
||||
data = json.loads(resp.read().decode())
|
||||
else:
|
||||
raise
|
||||
@ -4714,6 +4859,7 @@ def ollama_reply(
|
||||
fallback: str = "",
|
||||
use_history: bool = True,
|
||||
model: str | None = None,
|
||||
timeout: float | None = None,
|
||||
) -> str:
|
||||
last_error = None
|
||||
for attempt in range(max(1, OLLAMA_RETRIES + 1)):
|
||||
@ -4724,6 +4870,7 @@ def ollama_reply(
|
||||
context=context,
|
||||
use_history=use_history,
|
||||
model=model,
|
||||
timeout=timeout,
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
last_error = exc
|
||||
@ -4744,11 +4891,13 @@ def ollama_reply_with_thinking(
|
||||
fallback: str,
|
||||
use_history: bool = True,
|
||||
model: str | None = None,
|
||||
timeout: float | None = None,
|
||||
) -> str:
|
||||
result: dict[str, str] = {"reply": ""}
|
||||
done = threading.Event()
|
||||
|
||||
def worker():
|
||||
try:
|
||||
result["reply"] = ollama_reply(
|
||||
hist_key,
|
||||
prompt,
|
||||
@ -4756,7 +4905,9 @@ def ollama_reply_with_thinking(
|
||||
fallback=fallback,
|
||||
use_history=use_history,
|
||||
model=model,
|
||||
timeout=timeout,
|
||||
)
|
||||
finally:
|
||||
done.set()
|
||||
|
||||
thread = threading.Thread(target=worker, daemon=True)
|
||||
@ -4789,6 +4940,7 @@ def open_ended_with_thinking(
|
||||
history_lines: list[str],
|
||||
mode: str,
|
||||
allow_tools: bool,
|
||||
context: str = "",
|
||||
) -> str:
|
||||
result: dict[str, str] = {"reply": ""}
|
||||
done = threading.Event()
|
||||
@ -4796,6 +4948,7 @@ def open_ended_with_thinking(
|
||||
state = ThoughtState(total_steps=total_steps)
|
||||
|
||||
def worker():
|
||||
try:
|
||||
result["reply"] = open_ended_answer(
|
||||
prompt,
|
||||
inventory=inventory,
|
||||
@ -4804,15 +4957,17 @@ def open_ended_with_thinking(
|
||||
history_lines=history_lines,
|
||||
mode=mode,
|
||||
allow_tools=allow_tools,
|
||||
context=context,
|
||||
state=state,
|
||||
)
|
||||
finally:
|
||||
done.set()
|
||||
|
||||
thread = threading.Thread(target=worker, daemon=True)
|
||||
thread.start()
|
||||
if not done.wait(2.0):
|
||||
send_msg(token, room, "Thinking…")
|
||||
heartbeat = max(10, THINKING_INTERVAL_SEC)
|
||||
heartbeat = _mode_heartbeat_sec(mode)
|
||||
next_heartbeat = time.monotonic() + heartbeat
|
||||
while not done.wait(max(0, next_heartbeat - time.monotonic())):
|
||||
send_msg(token, room, state.status_line())
|
||||
@ -4820,7 +4975,7 @@ def open_ended_with_thinking(
|
||||
thread.join(timeout=1)
|
||||
return result["reply"] or "Model backend is busy. Try again in a moment."
|
||||
|
||||
def sync_loop(token: str, room_id: str):
|
||||
def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
|
||||
since = None
|
||||
try:
|
||||
res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
|
||||
@ -4861,7 +5016,7 @@ def sync_loop(token: str, room_id: str):
|
||||
if not body:
|
||||
continue
|
||||
sender = ev.get("sender", "")
|
||||
if sender == f"@{USER}:live.bstein.dev":
|
||||
if account_user and sender == normalize_user_id(account_user):
|
||||
continue
|
||||
|
||||
mentioned = is_mentioned(content, body)
|
||||
@ -4874,7 +5029,12 @@ def sync_loop(token: str, room_id: str):
|
||||
|
||||
cleaned_body = _strip_bot_mention(body)
|
||||
lower_body = cleaned_body.lower()
|
||||
mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep")
|
||||
mode = _detect_mode(
|
||||
content,
|
||||
body,
|
||||
default=_normalize_mode(default_mode),
|
||||
account_user=account_user,
|
||||
)
|
||||
|
||||
# Only do live cluster introspection in DMs.
|
||||
allow_tools = is_dm
|
||||
@ -4938,39 +5098,81 @@ def sync_loop(token: str, room_id: str):
|
||||
snapshot=snapshot,
|
||||
workloads=workloads,
|
||||
history_lines=history[hist_key],
|
||||
mode=mode if mode in ("fast", "deep") else "deep",
|
||||
mode=_normalize_mode(mode),
|
||||
allow_tools=allow_tools,
|
||||
context=context,
|
||||
)
|
||||
else:
|
||||
reply = _non_cluster_reply(
|
||||
cleaned_body,
|
||||
history_lines=history[hist_key],
|
||||
mode=mode if mode in ("fast", "deep") else "deep",
|
||||
mode=_normalize_mode(mode),
|
||||
)
|
||||
send_msg(token, rid, reply)
|
||||
history[hist_key].append(f"Atlas: {reply}")
|
||||
history[hist_key] = history[hist_key][-80:]
|
||||
|
||||
def login_with_retry(user: str, password: str):
    """Call login(user, password), retrying with exponential backoff.

    Up to 10 attempts are made. After a failed attempt the function sleeps
    min(30, 2 ** attempt) seconds before trying again; no sleep follows the
    final attempt, so the caller sees the failure immediately.

    Returns:
        The access token returned by login().

    Raises:
        The exception from the last failed attempt.
    """
    attempts = 10
    last_err = None
    for attempt in range(attempts):
        try:
            return login(user, password)
        except Exception as exc:  # noqa: BLE001
            last_err = exc
            if attempt < attempts - 1:
                time.sleep(min(30, 2 ** attempt))
    raise last_err
|
||||
|
||||
def _bot_accounts() -> list[dict[str, str]]:
    """Build the list of bot accounts to log in, one entry per distinct user.

    Candidates are considered in this order: the smart account (falling back
    to BOT_USER / BOT_PASS for whichever of its two settings is empty), the
    quick account, the genius account, and finally BOT_USER itself. A
    candidate with an empty user or password is skipped, and so is one whose
    normalised, lower-cased user id was already taken by an earlier candidate.

    Returns:
        Dicts with "user", "password" and "mode" keys, where mode is "smart",
        "fast" or "genius".
    """
    candidates = (
        (BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "smart"),
        (BOT_USER_QUICK, BOT_PASS_QUICK, "fast"),
        (BOT_USER_GENIUS, BOT_PASS_GENIUS, "genius"),
        (BOT_USER, BOT_PASS, "smart"),
    )
    taken: set[str] = set()
    result: list[dict[str, str]] = []
    for user, password, mode in candidates:
        if not user or not password:
            continue
        uid = normalize_user_id(user).lower()
        if uid in taken:
            continue
        taken.add(uid)
        result.append({"user": user, "password": password, "mode": mode})
    return result
|
||||
|
||||
def main():
    """Start the bot: load the KB, start the HTTP server, run one sync loop per account.

    Each account from _bot_accounts() is logged in (with retries) in turn,
    then an attempt is made to resolve and join ROOM_ALIAS. If that fails the
    sync loop is still started, with room_id=None. Every sync loop runs in its
    own daemon thread and is given the account's user name and default mode.
    main() then blocks until all of those threads finish.

    A login that still fails after its retries raises out of main().
    """
    load_kb()
    _start_http_server()
    threads: list[threading.Thread] = []
    for acc in _bot_accounts():
        token = login_with_retry(acc["user"], acc["password"])
        try:
            room_id = resolve_alias(token, ROOM_ALIAS)
            join_room(token, room_id)
        except Exception:
            # Best effort: the account still syncs without the shared room.
            room_id = None
        thread = threading.Thread(
            target=sync_loop,
            args=(token, room_id),
            kwargs={
                "account_user": acc["user"],
                "default_mode": acc["mode"],
            },
            daemon=True,
        )
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -7,6 +7,14 @@ read_secret() {
|
||||
tr -d '\r\n' < "${vault_dir}/$1"
|
||||
}
|
||||
|
||||
# Print the contents of ${vault_dir}/$1 with CR and LF characters removed.
# Prints nothing (and succeeds) when that file does not exist.
read_optional() {
  [ -f "${vault_dir}/$1" ] || { printf ''; return 0; }
  tr -d '\r\n' < "${vault_dir}/$1"
}
|
||||
|
||||
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
|
||||
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
|
||||
|
||||
@ -14,6 +22,15 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
|
||||
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
|
||||
|
||||
export BOT_PASS="$(read_secret bot-pass)"
|
||||
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
|
||||
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
|
||||
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
|
||||
if [ -z "${BOT_PASS_SMART}" ]; then
|
||||
export BOT_PASS_SMART="${BOT_PASS}"
|
||||
fi
|
||||
if [ -z "${BOT_PASS_GENIUS}" ]; then
|
||||
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
|
||||
fi
|
||||
export SEEDER_PASS="$(read_secret seeder-pass)"
|
||||
|
||||
export CHAT_API_KEY="$(read_secret chat-matrix)"
|
||||
|
||||
164
services/comms/scripts/tests/test_atlasbot_modes.py
Normal file
164
services/comms/scripts/tests/test_atlasbot_modes.py
Normal file
@ -0,0 +1,164 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import os
|
||||
from pathlib import Path
|
||||
from unittest import TestCase, mock
|
||||
|
||||
|
||||
BOT_PATH = Path(__file__).resolve().parents[1] / "atlasbot" / "bot.py"
|
||||
|
||||
|
||||
def load_bot_module():
    """Import the bot source at ``BOT_PATH`` as a fresh module and return it.

    The environment variables below are patched into ``os.environ`` (existing
    variables are kept, ``clear=False``) only while the module body executes,
    so whatever the module reads from the environment at import time sees
    these fixed test values.

    Returns:
        The executed module object, registered under the spec name
        ``atlasbot_bot`` (it is not inserted into ``sys.modules`` here).

    Raises:
        AssertionError: if no import spec or loader can be built for BOT_PATH.
    """
    env = {
        "BOT_USER": "atlas-smart",
        "BOT_PASS": "smart-pass",
        "BOT_USER_QUICK": "atlas-quick",
        "BOT_PASS_QUICK": "quick-pass",
        "BOT_USER_SMART": "atlas-smart",
        "BOT_PASS_SMART": "smart-pass",
        "BOT_USER_GENIUS": "atlas-genius",
        "BOT_PASS_GENIUS": "genius-pass",
        "OLLAMA_URL": "http://ollama.invalid",
        "OLLAMA_MODEL": "base-model",
        "ATLASBOT_MODEL_FAST": "fast-model",
        "ATLASBOT_MODEL_SMART": "smart-model",
        "ATLASBOT_MODEL_GENIUS": "genius-model",
        "ATLASBOT_QUICK_TIME_BUDGET_SEC": "15",
        "ATLASBOT_SMART_TIME_BUDGET_SEC": "45",
        "ATLASBOT_GENIUS_TIME_BUDGET_SEC": "180",
        "KB_DIR": "",
        "VM_URL": "http://vm.invalid",
        "ARIADNE_STATE_URL": "",
        "ARIADNE_STATE_TOKEN": "",
    }
    with mock.patch.dict(os.environ, env, clear=False):
        spec = importlib.util.spec_from_file_location("atlasbot_bot", BOT_PATH)
        # Validate before first use: spec_from_file_location may return None,
        # and module_from_spec(None) would fail with an unrelated AttributeError.
        assert spec is not None and spec.loader is not None
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
    return module
|
||||
|
||||
|
||||
class AtlasbotModeTests(TestCase):
    """Tests for the bot's per-mode accounts, model selection and LLM timeouts.

    Every test works on a module freshly imported by ``load_bot_module()`` and
    replaces ``_fact_pack_lines`` and ``_ollama_call`` on it with mocks.
    """

    def setUp(self):
        self.bot = load_bot_module()

    def _patches(self, fact_lines, ollama_side_effect):
        """Build the (fact pack, LLM call) patchers shared by the answer tests."""
        facts_patch = mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines)
        llm_patch = mock.patch.object(self.bot, "_ollama_call", side_effect=ollama_side_effect)
        return facts_patch, llm_patch

    def _ask(self, question, mode, **extra):
        """Call ``open_ended_answer`` with the empty inventory/history used by all tests."""
        return self.bot.open_ended_answer(
            question,
            inventory=[],
            snapshot=None,
            workloads=[],
            history_lines=[],
            mode=mode,
            allow_tools=True,
            **extra,
        )

    def test_bot_accounts_include_genius_mode(self):
        mode_for = {entry["user"]: entry["mode"] for entry in self.bot._bot_accounts()}

        expected = (
            ("atlas-quick", "fast"),
            ("atlas-smart", "smart"),
            ("atlas-genius", "genius"),
        )
        for user, mode in expected:
            self.assertEqual(mode_for[user], mode)

    def test_objective_cluster_question_uses_fact_pack_without_llm(self):
        facts = [
            "hottest_cpu: longhorn-system (6.69)",
            "hottest_ram: longhorn-system (36.05 GB)",
        ]

        # Any LLM call raises, so the reply can only come from the fact pack.
        facts_patch, llm_patch = self._patches(facts, AssertionError("LLM should not be called"))
        with facts_patch, llm_patch:
            answer = self._ask("what is the hottest cpu node in titan lab currently?", "smart")

        self.assertIn("longhorn-system", answer)
        self.assertIn("Confidence:", answer)

    def test_subjective_genius_answer_uses_genius_model(self):
        facts = [
            "hottest_cpu: longhorn-system (6.69)",
            "worker_nodes: titan-01, titan-02, titan-03",
        ]
        recorded: dict[str, object] = {}

        def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
            # Remember what the bot passed to the LLM layer for the assertions below.
            recorded.update(model=model, timeout=timeout, context=context)
            return "The worker spread stands out because Titan keeps meaningful capacity on the same cluster. Confidence: high"

        facts_patch, llm_patch = self._patches(facts, fake_ollama_call)
        with facts_patch, llm_patch:
            answer = self._ask(
                "what stands out about titan lab?",
                "genius",
                context='Cluster snapshot (JSON): {"injected":true}',
            )

        self.assertIn("The worker spread stands out", answer)
        self.assertEqual(recorded["model"], "genius-model")
        self.assertLessEqual(float(recorded["timeout"]), 180.0)
        self.assertIn('Cluster snapshot (JSON): {"injected":true}', str(recorded["context"]))

    def test_mode_timeouts_stay_within_budgets(self):
        facts = [
            "hottest_cpu: longhorn-system (6.69)",
            "worker_nodes: titan-01, titan-02, titan-03",
        ]
        calls: list[tuple[str, float]] = []

        def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
            # NOTE(review): a missing timeout is recorded as 0.0, which satisfies
            # every budget check below — confirm that is intended.
            calls.append((str(model), float(timeout or 0)))
            return "Atlas has a clear standout because the worker spread is healthy. Confidence: high"

        facts_patch, llm_patch = self._patches(facts, fake_ollama_call)
        with facts_patch, llm_patch:
            for mode in ("fast", "smart", "genius"):
                answer = self._ask("what stands out about titan lab?", mode)
                self.assertIn("Confidence:", answer)

        self.assertEqual([name for name, _ in calls], ["fast-model", "smart-model", "genius-model"])
        # Budgets match the *_TIME_BUDGET_SEC values set by load_bot_module().
        for (_, used), budget in zip(calls, (15.0, 45.0, 180.0)):
            self.assertLessEqual(used, budget)

    def test_llm_timeout_still_returns_a_conclusion(self):
        facts = [
            "worker_nodes: titan-01, titan-02, titan-03",
            "hottest_cpu: longhorn-system (6.69)",
        ]

        facts_patch, llm_patch = self._patches(facts, TimeoutError("simulated timeout"))
        with facts_patch, llm_patch:
            answer = self._ask("what stands out about the worker nodes?", "genius")

        self.assertIn("worker nodes", answer.lower())
        self.assertIn("Confidence:", answer)
|
||||
@ -66,7 +66,7 @@ spec:
|
||||
- name: SEEDER_USER
|
||||
value: othrys-seeder
|
||||
- name: BOT_USER
|
||||
value: atlasbot
|
||||
value: atlas-smart
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
|
||||
@ -29,12 +29,18 @@ spec:
|
||||
operator: In
|
||||
values: ["rpi4","rpi5"]
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 50
|
||||
- weight: 80
|
||||
preference:
|
||||
matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values: ["rpi4"]
|
||||
values: ["rpi5"]
|
||||
- weight: 60
|
||||
preference:
|
||||
matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: NotIn
|
||||
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
|
||||
containers:
|
||||
- name: monerod
|
||||
image: registry.bstein.dev/crypto/monerod:0.18.4.1
|
||||
|
||||
@ -23,7 +23,7 @@ spec:
|
||||
- matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values: ["rpi4","rpi5"]
|
||||
values: ["rpi5"]
|
||||
containers:
|
||||
- name: xmrig
|
||||
image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9
|
||||
|
||||
@ -123,13 +123,22 @@ spec:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values: ["rpi4","rpi5"]
|
||||
- key: longhorn
|
||||
operator: NotIn
|
||||
values: ["true"]
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
preference:
|
||||
matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: NotIn
|
||||
values: ["titan-13","titan-15","titan-17","titan-19"]
|
||||
- weight: 50
|
||||
preference:
|
||||
matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values: ["rpi4"]
|
||||
values: ["rpi5"]
|
||||
containers:
|
||||
- name: gitea
|
||||
image: gitea/gitea:1.23
|
||||
|
||||
@ -245,6 +245,17 @@ spec:
|
||||
image:
|
||||
repository: registry.bstein.dev/infra/harbor-registry
|
||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
|
||||
extraEnvVars:
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
|
||||
value: harbor-core
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
|
||||
value: http://harbor-registry:8080/service/notifications
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
|
||||
value: 5s
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
|
||||
value: "5"
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
|
||||
value: 1s
|
||||
controller:
|
||||
image:
|
||||
repository: registry.bstein.dev/infra/harbor-registryctl
|
||||
@ -263,6 +274,10 @@ spec:
|
||||
export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
|
||||
export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
|
||||
{{ end }}
|
||||
{{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
|
||||
export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
|
||||
export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
|
||||
{{ end }}
|
||||
vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
|
||||
vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
|
||||
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
|
||||
@ -397,10 +412,10 @@ spec:
|
||||
patch: |-
|
||||
- op: replace
|
||||
path: /spec/rules/0/http/paths/2/backend/service/name
|
||||
value: harbor-registry
|
||||
value: harbor-core
|
||||
- op: replace
|
||||
path: /spec/rules/0/http/paths/2/backend/service/port/number
|
||||
value: 5000
|
||||
value: 80
|
||||
- target:
|
||||
kind: Deployment
|
||||
name: harbor-jobservice
|
||||
@ -422,8 +437,7 @@ spec:
|
||||
- $patch: replace
|
||||
- name: VAULT_ENV_FILE
|
||||
value: /vault/secrets/harbor-jobservice-env.sh
|
||||
envFrom:
|
||||
- $patch: replace
|
||||
envFrom: []
|
||||
- configMapRef:
|
||||
name: harbor-jobservice-env
|
||||
volumeMounts:
|
||||
@ -464,6 +478,16 @@ spec:
|
||||
value: /vault/secrets/harbor-registry-env.sh
|
||||
- name: VAULT_COPY_FILES
|
||||
value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
|
||||
value: harbor-core
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
|
||||
value: http://harbor-registry:8080/service/notifications
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
|
||||
value: 5s
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
|
||||
value: "5"
|
||||
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
|
||||
value: 1s
|
||||
envFrom:
|
||||
- $patch: replace
|
||||
volumeMounts:
|
||||
|
||||
@ -67,7 +67,7 @@ data:
|
||||
url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
|
||||
credentials('gitea-pat')
|
||||
}
|
||||
branches('*/master')
|
||||
branches('*/main')
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -108,7 +108,7 @@ data:
|
||||
url('https://scm.bstein.dev/bstein/ci-demo.git')
|
||||
credentials('gitea-pat')
|
||||
}
|
||||
branches('*/master')
|
||||
branches('*/main')
|
||||
}
|
||||
}
|
||||
scriptPath('Jenkinsfile')
|
||||
@ -167,6 +167,110 @@ data:
|
||||
}
|
||||
}
|
||||
}
|
||||
// Pipeline job for the metis repository, built from the repo's Jenkinsfile.
// A job name must be declared only once per seed script; a second
// pipelineJob('metis') block would compete with this one for the same job.
// SCM is polled every 5 minutes (hash-spread by the leading 'H').
pipelineJob('metis') {
  properties {
    pipelineTriggers {
      triggers {
        scmTrigger {
          scmpoll_spec('H/5 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/metis.git')
            credentials('gitea-pat')
          }
          // NOTE(review): the neighbouring atlasbot and Soteria jobs track '*/main';
          // confirm that metis really builds from 'master'.
          branches('*/master')
        }
      }
      scriptPath('Jenkinsfile')
    }
  }
}
|
||||
// Pipeline job for the atlasbot repository: polls SCM on an 'H/2' schedule and
// runs the Jenkinsfile from the 'main' branch, checked out with the 'gitea-pat' credentials.
pipelineJob('atlasbot') {
  properties {
    pipelineTriggers {
      triggers {
        scmTrigger {
          scmpoll_spec('H/2 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/atlasbot.git')
            credentials('gitea-pat')
          }
          branches('*/main')
        }
      }
      scriptPath('Jenkinsfile')
    }
  }
}
|
||||
// Pipeline job for the soteria repository: polls SCM on an 'H/5' schedule and
// runs the Jenkinsfile from the 'main' branch, checked out with the 'gitea-pat' credentials.
pipelineJob('Soteria') {
  properties {
    pipelineTriggers {
      triggers {
        scmTrigger {
          scmpoll_spec('H/5 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/soteria.git')
            credentials('gitea-pat')
          }
          branches('*/main')
        }
      }
      scriptPath('Jenkinsfile')
    }
  }
}
|
||||
pipelineJob('data-prepper') {
|
||||
properties {
|
||||
pipelineTriggers {
|
||||
|
||||
@ -48,7 +48,7 @@ spec:
|
||||
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
|
||||
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
|
||||
{{ end }}
|
||||
bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
|
||||
bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
|
||||
spec:
|
||||
serviceAccountName: jenkins
|
||||
nodeSelector:
|
||||
|
||||
13
services/jenkins/dind-pvc.yaml
Normal file
13
services/jenkins/dind-pvc.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
# services/jenkins/dind-pvc.yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: jenkins-dind-cache
|
||||
namespace: jenkins
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 30Gi
|
||||
storageClassName: astreae
|
||||
@ -8,6 +8,7 @@ resources:
|
||||
- vault-serviceaccount.yaml
|
||||
- pvc.yaml
|
||||
- cache-pvc.yaml
|
||||
- dind-pvc.yaml
|
||||
- plugins-pvc.yaml
|
||||
- configmap-jcasc.yaml
|
||||
- configmap-plugins.yaml
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
|
||||
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14.
|
||||
# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file).
|
||||
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-18.
|
||||
# Purpose: keycloak portal e2e execute actions email 18 (see container args/env in this file).
|
||||
# Run by setting spec.suspend to false, reconcile, then set it back to true.
|
||||
# Safe to delete the finished Job/pod; it should not run continuously.
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: keycloak-portal-e2e-execute-actions-email-14
|
||||
name: keycloak-portal-e2e-execute-actions-email-18
|
||||
namespace: sso
|
||||
spec:
|
||||
suspend: true
|
||||
@ -70,7 +70,7 @@ spec:
|
||||
- name: E2E_PROBE_USERNAME
|
||||
value: robotuser
|
||||
- name: E2E_PROBE_EMAIL
|
||||
value: robotuser@bstein.dev
|
||||
value: brad.stein+robot@gmail.com
|
||||
- name: EXECUTE_ACTIONS_CLIENT_ID
|
||||
value: bstein-dev-home
|
||||
- name: EXECUTE_ACTIONS_REDIRECT_URI
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
# services/keycloak/oneoffs/realm-settings-job.yaml
|
||||
# One-off job for sso/keycloak-realm-settings-36.
|
||||
# Purpose: keycloak realm settings 36 (see container args/env in this file).
|
||||
# One-off job for sso/keycloak-realm-settings-38.
|
||||
# Purpose: keycloak realm settings 38 (see container args/env in this file).
|
||||
# Run by setting spec.suspend to false, reconcile, then set it back to true.
|
||||
# Safe to delete the finished Job/pod; it should not run continuously.
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: keycloak-realm-settings-36
|
||||
name: keycloak-realm-settings-38
|
||||
namespace: sso
|
||||
spec:
|
||||
suspend: true
|
||||
@ -64,7 +64,7 @@ spec:
|
||||
- name: KEYCLOAK_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_SMTP_HOST
|
||||
value: mail.bstein.dev
|
||||
value: smtp.postmarkapp.com
|
||||
- name: KEYCLOAK_SMTP_PORT
|
||||
value: "587"
|
||||
- name: KEYCLOAK_SMTP_FROM
|
||||
|
||||
@ -18,6 +18,7 @@ spec:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
maintenance.bstein.dev/restart-rev: "20260207-2"
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/role: "maintenance"
|
||||
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
|
||||
@ -105,7 +106,7 @@ spec:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
containers:
|
||||
- name: ariadne
|
||||
image: registry.bstein.dev/bstein/ariadne:0.1.0-0
|
||||
image: registry.bstein.dev/bstein/ariadne:latest
|
||||
imagePullPolicy: Always
|
||||
command: ["/bin/sh", "-c"]
|
||||
args:
|
||||
@ -285,7 +286,7 @@ spec:
|
||||
- name: ARIADNE_SCHEDULE_MAILU_SYNC
|
||||
value: "30 4 * * *"
|
||||
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
|
||||
value: "0 5 * * *"
|
||||
value: "*/15 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
|
||||
value: "*/5 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
|
||||
@ -293,23 +294,23 @@ spec:
|
||||
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
|
||||
value: "0 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC
|
||||
value: "0 5 * * *"
|
||||
value: "*/15 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_WGER_ADMIN
|
||||
value: "15 3 * * *"
|
||||
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
|
||||
value: "0 6 * * *"
|
||||
value: "*/15 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_FIREFLY_CRON
|
||||
value: "0 3 * * *"
|
||||
- name: ARIADNE_SCHEDULE_POD_CLEANER
|
||||
value: "0 * * * *"
|
||||
value: "*/30 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
|
||||
value: "23 3 * * *"
|
||||
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
|
||||
value: "30 4 * * 0"
|
||||
value: "0 */4 * * *"
|
||||
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
|
||||
value: "0 * * * *"
|
||||
value: "*/15 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_VAULT_OIDC
|
||||
value: "0 * * * *"
|
||||
value: "*/15 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
|
||||
value: "*/5 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
|
||||
@ -319,9 +320,9 @@ spec:
|
||||
- name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
|
||||
value: "*/10 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_CLUSTER_STATE
|
||||
value: "*/15 * * * *"
|
||||
value: "*/10 * * * *"
|
||||
- name: ARIADNE_CLUSTER_STATE_KEEP
|
||||
value: "168"
|
||||
value: "720"
|
||||
- name: WELCOME_EMAIL_ENABLED
|
||||
value: "true"
|
||||
- name: K8S_API_TIMEOUT_SEC
|
||||
@ -330,12 +331,20 @@ spec:
|
||||
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
|
||||
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
|
||||
value: "5"
|
||||
- name: ARIADNE_ALERTMANAGER_URL
|
||||
value: http://alertmanager.monitoring.svc.cluster.local
|
||||
- name: OPENSEARCH_URL
|
||||
value: http://opensearch-master.logging.svc.cluster.local:9200
|
||||
- name: OPENSEARCH_LIMIT_BYTES
|
||||
value: "1099511627776"
|
||||
- name: OPENSEARCH_INDEX_PATTERNS
|
||||
value: kube-*,journald-*,trace-analytics-*
|
||||
- name: METIS_BASE_URL
|
||||
value: http://metis.maintenance.svc.cluster.local
|
||||
- name: METIS_TIMEOUT_SEC
|
||||
value: "15"
|
||||
- name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH
|
||||
value: "*/30 * * * *"
|
||||
- name: METRICS_PATH
|
||||
value: "/metrics"
|
||||
resources:
|
||||
|
||||
@ -29,6 +29,29 @@ rules:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups: ["apps"]
|
||||
resources:
|
||||
- deployments
|
||||
- statefulsets
|
||||
- daemonsets
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups: ["longhorn.io"]
|
||||
resources:
|
||||
- volumes
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- events
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- pods/exec
|
||||
@ -56,3 +79,17 @@ roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: ariadne-job-spawner
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: ariadne-auth-delegator
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: ariadne
|
||||
namespace: maintenance
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: system:auth-delegator
|
||||
|
||||
@ -21,3 +21,72 @@ spec:
|
||||
policy:
|
||||
semver:
|
||||
range: ">=0.1.0-0"
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
||||
kind: ImageRepository
|
||||
metadata:
|
||||
name: metis
|
||||
namespace: maintenance
|
||||
spec:
|
||||
image: registry.bstein.dev/bstein/metis
|
||||
interval: 1m0s
|
||||
secretRef:
|
||||
name: harbor-regcred
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
||||
kind: ImagePolicy
|
||||
metadata:
|
||||
name: metis
|
||||
namespace: maintenance
|
||||
spec:
|
||||
imageRepositoryRef:
|
||||
name: metis
|
||||
policy:
|
||||
semver:
|
||||
range: ">=0.1.0-0"
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
||||
kind: ImageRepository
|
||||
metadata:
|
||||
name: metis-sentinel
|
||||
namespace: maintenance
|
||||
spec:
|
||||
image: registry.bstein.dev/bstein/metis-sentinel
|
||||
interval: 1m0s
|
||||
secretRef:
|
||||
name: harbor-regcred
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
||||
kind: ImagePolicy
|
||||
metadata:
|
||||
name: metis-sentinel
|
||||
namespace: maintenance
|
||||
spec:
|
||||
imageRepositoryRef:
|
||||
name: metis-sentinel
|
||||
policy:
|
||||
semver:
|
||||
range: ">=0.1.0-0"
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
||||
kind: ImageRepository
|
||||
metadata:
|
||||
name: soteria
|
||||
namespace: maintenance
|
||||
spec:
|
||||
image: registry.bstein.dev/bstein/soteria
|
||||
interval: 1m0s
|
||||
secretRef:
|
||||
name: harbor-regcred
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
||||
kind: ImagePolicy
|
||||
metadata:
|
||||
name: soteria
|
||||
namespace: maintenance
|
||||
spec:
|
||||
imageRepositoryRef:
|
||||
name: soteria
|
||||
policy:
|
||||
semver:
|
||||
range: ">=0.1.0-0"
|
||||
|
||||
@ -5,28 +5,50 @@ resources:
|
||||
- namespace.yaml
|
||||
- image.yaml
|
||||
- secretproviderclass.yaml
|
||||
- soteria-configmap.yaml
|
||||
- metis-configmap.yaml
|
||||
- metis-data-pvc.yaml
|
||||
- vault-serviceaccount.yaml
|
||||
- vault-sync-deployment.yaml
|
||||
- ariadne-serviceaccount.yaml
|
||||
- ariadne-rbac.yaml
|
||||
- disable-k3s-traefik-serviceaccount.yaml
|
||||
- k3s-traefik-cleanup-rbac.yaml
|
||||
- metis-serviceaccount.yaml
|
||||
- metis-rbac.yaml
|
||||
- metis-token-sync-serviceaccount.yaml
|
||||
- metis-token-sync-rbac.yaml
|
||||
- node-nofile-serviceaccount.yaml
|
||||
- pod-cleaner-rbac.yaml
|
||||
- soteria-serviceaccount.yaml
|
||||
- soteria-rbac.yaml
|
||||
- ariadne-deployment.yaml
|
||||
- metis-deployment.yaml
|
||||
- oneoffs/ariadne-migrate-job.yaml
|
||||
- ariadne-service.yaml
|
||||
- soteria-deployment.yaml
|
||||
- disable-k3s-traefik-daemonset.yaml
|
||||
- oneoffs/k3s-traefik-cleanup-job.yaml
|
||||
- node-nofile-daemonset.yaml
|
||||
- metis-sentinel-daemonset.yaml
|
||||
- metis-k3s-token-sync-cronjob.yaml
|
||||
- k3s-agent-restart-daemonset.yaml
|
||||
- pod-cleaner-cronjob.yaml
|
||||
- node-image-sweeper-serviceaccount.yaml
|
||||
- node-image-sweeper-daemonset.yaml
|
||||
- image-sweeper-cronjob.yaml
|
||||
- metis-service.yaml
|
||||
- metis-ingress.yaml
|
||||
- soteria-service.yaml
|
||||
images:
|
||||
- name: registry.bstein.dev/bstein/ariadne
|
||||
newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
||||
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
||||
- name: registry.bstein.dev/bstein/metis
|
||||
newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis:tag"}
|
||||
- name: registry.bstein.dev/bstein/metis-sentinel
|
||||
newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis-sentinel:tag"}
|
||||
- name: registry.bstein.dev/bstein/soteria
|
||||
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
|
||||
configMapGenerator:
|
||||
- name: disable-k3s-traefik-script
|
||||
namespace: maintenance
|
||||
|
||||
20
services/maintenance/metis-configmap.yaml
Normal file
20
services/maintenance/metis-configmap.yaml
Normal file
@ -0,0 +1,20 @@
|
||||
# services/maintenance/metis-configmap.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: metis
|
||||
namespace: maintenance
|
||||
data:
|
||||
METIS_BIND_ADDR: :8080
|
||||
METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml
|
||||
METIS_DATA_DIR: /var/lib/metis
|
||||
METIS_DEFAULT_FLASH_HOST: titan-22
|
||||
METIS_FLASH_HOSTS: titan-22
|
||||
METIS_LOCAL_HOST: titan-22
|
||||
METIS_ALLOWED_GROUPS: admin,maintainer
|
||||
METIS_MAX_DEVICE_BYTES: "300000000000"
|
||||
METIS_SENTINEL_PUSH_URL: http://metis.maintenance.svc.cluster.local/internal/sentinel/snapshot
|
||||
METIS_SENTINEL_INTERVAL_SEC: "1800"
|
||||
METIS_SENTINEL_NSENTER: "1"
|
||||
METIS_IMAGE_RPI4_ARMBIAN_LONGHORN: https://armbian.chi.auroradev.org/dl/rpi4b/archive/Armbian_26.2.1_Rpi4b_noble_current_6.18.9_minimal.img.xz
|
||||
METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256: sha256:c450687adf4cc6a59725c43aefd58baf42ec71bdd379227d403cdde281768e46
|
||||
13
services/maintenance/metis-data-pvc.yaml
Normal file
13
services/maintenance/metis-data-pvc.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
# services/maintenance/metis-data-pvc.yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: metis-data
|
||||
namespace: maintenance
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 40Gi
|
||||
storageClassName: local-path
|
||||
47
services/maintenance/metis-deployment.yaml
Normal file
47
services/maintenance/metis-deployment.yaml
Normal file
@ -0,0 +1,47 @@
|
||||
# services/maintenance/metis-deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: metis
|
||||
namespace: maintenance
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: metis
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: metis
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
serviceAccountName: metis
|
||||
nodeSelector:
|
||||
kubernetes.io/hostname: titan-22
|
||||
kubernetes.io/arch: amd64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
containers:
|
||||
- name: metis
|
||||
image: registry.bstein.dev/bstein/metis:latest
|
||||
imagePullPolicy: Always
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: metis
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
27
services/maintenance/metis-ingress.yaml
Normal file
27
services/maintenance/metis-ingress.yaml
Normal file
@ -0,0 +1,27 @@
|
||||
# services/maintenance/metis-ingress.yaml
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: metis
|
||||
namespace: maintenance
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: traefik
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
||||
traefik.ingress.kubernetes.io/router.middlewares: sso-oauth2-proxy-forward-auth@kubernetescrd
|
||||
spec:
|
||||
tls:
|
||||
- hosts: ["metis.bstein.dev"]
|
||||
secretName: metis-tls
|
||||
rules:
|
||||
- host: metis.bstein.dev
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: metis
|
||||
port:
|
||||
number: 80
|
||||
51
services/maintenance/metis-k3s-token-sync-cronjob.yaml
Normal file
51
services/maintenance/metis-k3s-token-sync-cronjob.yaml
Normal file
@ -0,0 +1,51 @@
|
||||
# services/maintenance/metis-k3s-token-sync-cronjob.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: metis-k3s-token-sync
|
||||
namespace: maintenance
|
||||
spec:
|
||||
schedule: "11 */6 * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 2
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
serviceAccountName: metis-token-sync
|
||||
restartPolicy: OnFailure
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/control-plane: "true"
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
- key: node-role.kubernetes.io/master
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- name: sync
|
||||
image: registry.bstein.dev/bstein/kubectl:1.35.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
args:
|
||||
- |
|
||||
set -euo pipefail
|
||||
token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/node-token)"
|
||||
kubectl -n maintenance create secret generic metis-runtime \
|
||||
--from-literal=k3s_token="${token}" \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
volumeMounts:
|
||||
- name: k3s-server
|
||||
mountPath: /host/var/lib/rancher/k3s/server
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: k3s-server
|
||||
hostPath:
|
||||
path: /var/lib/rancher/k3s/server
|
||||
27
services/maintenance/metis-rbac.yaml
Normal file
27
services/maintenance/metis-rbac.yaml
Normal file
@ -0,0 +1,27 @@
|
||||
# services/maintenance/metis-rbac.yaml
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: metis-node-manager
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- nodes
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- delete
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: metis-node-manager
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: metis
|
||||
namespace: maintenance
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: metis-node-manager
|
||||
133
services/maintenance/metis-sentinel-daemonset.yaml
Normal file
133
services/maintenance/metis-sentinel-daemonset.yaml
Normal file
@ -0,0 +1,133 @@
|
||||
# services/maintenance/metis-sentinel-daemonset.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: metis-sentinel
|
||||
namespace: maintenance
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: metis-sentinel
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: metis-sentinel
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
serviceAccountName: metis
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
containers:
|
||||
- name: metis-sentinel
|
||||
image: registry.bstein.dev/bstein/metis-sentinel:latest
|
||||
imagePullPolicy: Always
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
args:
|
||||
- |
|
||||
set -eu
|
||||
out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
|
||||
interval="${METIS_SENTINEL_INTERVAL_SEC:-120}"
|
||||
mkdir -p "${out_dir}"
|
||||
while true; do
|
||||
ts="$(date -u +%Y%m%dT%H%M%SZ)"
|
||||
node="${METIS_SENTINEL_NODE:-unknown}"
|
||||
tmp="${out_dir}/${node}-${ts}.json.tmp"
|
||||
out="${out_dir}/${node}-${ts}.json"
|
||||
if metis-sentinel > "${tmp}"; then
|
||||
mv "${tmp}" "${out}"
|
||||
else
|
||||
rm -f "${tmp}" || true
|
||||
fi
|
||||
sleep "${interval}"
|
||||
done
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: metis
|
||||
env:
|
||||
- name: METIS_SENTINEL_NODE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
volumeMounts:
|
||||
- name: sentinel-output
|
||||
mountPath: /var/run/metis-sentinel
|
||||
resources:
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 250m
|
||||
memory: 256Mi
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
runAsUser: 0
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
- name: sentinel-pusher
|
||||
image: curlimages/curl:8.12.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
args:
|
||||
- |
|
||||
set -eu
|
||||
out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
|
||||
push_url="${METIS_SENTINEL_PUSH_URL:-}"
|
||||
interval="${METIS_SENTINEL_PUSH_INTERVAL_SEC:-120}"
|
||||
timeout="${METIS_SENTINEL_PUSH_TIMEOUT_SEC:-10}"
|
||||
mkdir -p "${out_dir}"
|
||||
while true; do
|
||||
for snapshot in "${out_dir}"/*.json; do
|
||||
[ -f "${snapshot}" ] || continue
|
||||
if [ -z "${push_url}" ]; then
|
||||
break
|
||||
fi
|
||||
if curl -fsS --connect-timeout "${timeout}" --max-time "${timeout}" \
|
||||
-X POST \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Metis-Node: ${METIS_SENTINEL_NODE:-unknown}" \
|
||||
--data-binary "@${snapshot}" \
|
||||
"${push_url}"; then
|
||||
rm -f "${snapshot}"
|
||||
fi
|
||||
done
|
||||
sleep "${interval}"
|
||||
done
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: metis
|
||||
env:
|
||||
- name: METIS_SENTINEL_NODE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- name: sentinel-output
|
||||
mountPath: /var/run/metis-sentinel
|
||||
resources:
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 32Mi
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
runAsUser: 0
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
volumes:
|
||||
- name: sentinel-output
|
||||
emptyDir: {}
|
||||
18
services/maintenance/metis-service.yaml
Normal file
18
services/maintenance/metis-service.yaml
Normal file
@ -0,0 +1,18 @@
|
||||
# services/maintenance/metis-service.yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: metis
|
||||
namespace: maintenance
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "80"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: metis
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: http
|
||||
6
services/maintenance/metis-serviceaccount.yaml
Normal file
6
services/maintenance/metis-serviceaccount.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
# services/maintenance/metis-serviceaccount.yaml
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: metis
|
||||
namespace: maintenance
|
||||
30
services/maintenance/metis-token-sync-rbac.yaml
Normal file
30
services/maintenance/metis-token-sync-rbac.yaml
Normal file
@ -0,0 +1,30 @@
|
||||
# services/maintenance/metis-token-sync-rbac.yaml
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: metis-token-sync
|
||||
namespace: maintenance
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- secrets
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- create
|
||||
- update
|
||||
- patch
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: metis-token-sync
|
||||
namespace: maintenance
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: metis-token-sync
|
||||
namespace: maintenance
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: metis-token-sync
|
||||
@ -0,0 +1,6 @@
|
||||
# services/maintenance/metis-token-sync-serviceaccount.yaml
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: metis-token-sync
|
||||
namespace: maintenance
|
||||
@ -10,6 +10,8 @@ spec:
|
||||
app: node-image-sweeper
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 100%
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
@ -29,6 +31,21 @@ spec:
|
||||
- name: node-image-sweeper
|
||||
image: python:3.12.9-alpine3.20
|
||||
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
|
||||
env:
|
||||
- name: SWEEP_INTERVAL_SEC
|
||||
value: "21600"
|
||||
- name: HIGH_USAGE_PERCENT
|
||||
value: "70"
|
||||
- name: EMERGENCY_USAGE_PERCENT
|
||||
value: "80"
|
||||
- name: BASE_THRESHOLD_DAYS
|
||||
value: "14"
|
||||
- name: HIGH_USAGE_THRESHOLD_DAYS
|
||||
value: "3"
|
||||
- name: LOG_RETENTION_DAYS
|
||||
value: "7"
|
||||
- name: JOURNAL_MAX_SIZE
|
||||
value: "200M"
|
||||
securityContext:
|
||||
privileged: true
|
||||
runAsUser: 0
|
||||
|
||||
@ -2,26 +2,39 @@
|
||||
set -eu
|
||||
|
||||
ONE_SHOT=${ONE_SHOT:-false}
|
||||
THRESHOLD_DAYS=14
|
||||
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
|
||||
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
|
||||
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
|
||||
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
|
||||
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
|
||||
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
|
||||
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
|
||||
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
|
||||
|
||||
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
|
||||
THRESHOLD_DAYS=3
|
||||
fi
|
||||
sweep_once() {
|
||||
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
||||
threshold_days="${BASE_THRESHOLD_DAYS}"
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
|
||||
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
|
||||
fi
|
||||
|
||||
cutoff=$(python3 - <<'PY'
|
||||
import time, os
|
||||
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
|
||||
cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
|
||||
import os
|
||||
import time
|
||||
|
||||
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
|
||||
print(int(time.time()) - days * 86400)
|
||||
PY
|
||||
)
|
||||
|
||||
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
|
||||
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
|
||||
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
|
||||
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
|
||||
|
||||
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
|
||||
|
||||
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
|
||||
import json, os, sys, time
|
||||
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
try:
|
||||
data = json.load(sys.stdin)
|
||||
@ -74,19 +87,33 @@ for p in prune:
|
||||
PY
|
||||
)
|
||||
|
||||
if [ -n "${prune_list}" ]; then
|
||||
if [ -n "${prune_list}" ]; then
|
||||
printf "%s" "${prune_list}" | while read -r image_id; do
|
||||
if [ -n "${image_id}" ]; then
|
||||
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
|
||||
fi
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
|
||||
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
|
||||
# Emergency pass for rootfs pressure on SD-backed nodes.
|
||||
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
|
||||
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
|
||||
fi
|
||||
}
|
||||
|
||||
sweep_once
|
||||
|
||||
if [ "${ONE_SHOT}" = "true" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
sleep infinity
|
||||
while true; do
|
||||
sleep "${SWEEP_INTERVAL_SEC}"
|
||||
sweep_once
|
||||
done
|
||||
|
||||
10
services/maintenance/soteria-configmap.yaml
Normal file
10
services/maintenance/soteria-configmap.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
# services/maintenance/soteria-configmap.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: soteria
|
||||
namespace: maintenance
|
||||
data:
|
||||
SOTERIA_BACKUP_DRIVER: "longhorn"
|
||||
SOTERIA_LONGHORN_URL: "http://longhorn-backend.longhorn-system.svc:9500"
|
||||
SOTERIA_LONGHORN_BACKUP_MODE: "incremental"
|
||||
73
services/maintenance/soteria-deployment.yaml
Normal file
73
services/maintenance/soteria-deployment.yaml
Normal file
@ -0,0 +1,73 @@
|
||||
# services/maintenance/soteria-deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: soteria
|
||||
namespace: maintenance
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: soteria
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: soteria
|
||||
spec:
|
||||
serviceAccountName: soteria
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 90
|
||||
preference:
|
||||
matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values: ["rpi5"]
|
||||
- weight: 50
|
||||
preference:
|
||||
matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values: ["rpi4"]
|
||||
containers:
|
||||
- name: soteria
|
||||
image: registry.bstein.dev/bstein/soteria:latest
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: soteria
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 2
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /readyz
|
||||
port: http
|
||||
initialDelaySeconds: 2
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 2
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65532
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
22
services/maintenance/soteria-rbac.yaml
Normal file
22
services/maintenance/soteria-rbac.yaml
Normal file
@ -0,0 +1,22 @@
|
||||
# services/maintenance/soteria-rbac.yaml
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: soteria
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["persistentvolumeclaims", "persistentvolumes"]
|
||||
verbs: ["get", "list"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: soteria
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: soteria
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: soteria
|
||||
namespace: maintenance
|
||||
14
services/maintenance/soteria-service.yaml
Normal file
14
services/maintenance/soteria-service.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
# services/maintenance/soteria-service.yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: soteria
|
||||
namespace: maintenance
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: soteria
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: http
|
||||
8
services/maintenance/soteria-serviceaccount.yaml
Normal file
8
services/maintenance/soteria-serviceaccount.yaml
Normal file
@ -0,0 +1,8 @@
|
||||
# services/maintenance/soteria-serviceaccount.yaml
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: soteria
|
||||
namespace: maintenance
|
||||
imagePullSecrets:
|
||||
- name: harbor-regcred
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -89,7 +89,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
|
||||
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
|
||||
@ -1125,7 +1125,7 @@
|
||||
{
|
||||
"id": 17,
|
||||
"type": "stat",
|
||||
"title": "Ariadne CI Coverage (%)",
|
||||
"title": "Platform CI Coverage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1138,7 +1138,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
|
||||
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{branch}}",
|
||||
"instant": true
|
||||
@ -1183,12 +1183,13 @@
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
"description": "Internal source panel for Atlas Overview automation test rollups."
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Ariadne CI Tests (latest)",
|
||||
"title": "Platform CI Tests (latest)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1201,7 +1202,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
|
||||
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -1233,7 +1234,8 @@
|
||||
"order": "desc"
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
"description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
|
||||
@ -1677,7 +1677,7 @@
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Test Success Rate",
|
||||
"title": "Platform Test Success Rate",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1690,7 +1690,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -1709,12 +1709,13 @@
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
|
||||
},
|
||||
{
|
||||
"id": 43,
|
||||
"type": "bargauge",
|
||||
"title": "Tests with Failures (24h)",
|
||||
"title": "Platform Tests with Failures (24h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1727,7 +1728,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
|
||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{result}}",
|
||||
"instant": true
|
||||
@ -1814,7 +1815,8 @@
|
||||
"order": "desc"
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
"description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
@ -1901,7 +1903,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
|
||||
@ -22,7 +22,24 @@ data:
|
||||
- orgId: 1
|
||||
receiver: email-admins
|
||||
group_by:
|
||||
- grafana_folder
|
||||
- alertname
|
||||
group_wait: 1m
|
||||
group_interval: 30m
|
||||
repeat_interval: 12h
|
||||
routes:
|
||||
- receiver: email-admins
|
||||
object_matchers:
|
||||
- [severity, "=", "critical"]
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 2h
|
||||
- receiver: email-admins
|
||||
object_matchers:
|
||||
- [severity, "=", "warning"]
|
||||
group_wait: 5m
|
||||
group_interval: 2h
|
||||
repeat_interval: 24h
|
||||
rules.yaml: |
|
||||
apiVersion: 1
|
||||
groups:
|
||||
@ -32,7 +49,7 @@ data:
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: disk-pressure-root
|
||||
title: "Node rootfs high (>80%)"
|
||||
title: "Node rootfs high (>85%)"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
@ -66,7 +83,7 @@ data:
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [80]
|
||||
params: [85]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
@ -76,7 +93,7 @@ data:
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "{{ $labels.node }} rootfs >80% for 10m"
|
||||
summary: "{{ $labels.node }} rootfs >85% for 10m"
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: disk-growth-1h
|
||||
@ -145,7 +162,7 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
|
||||
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
|
||||
legendFormat: '{{instance}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
@ -286,8 +303,8 @@ data:
|
||||
summary: "node-image-sweeper not fully ready"
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: maint-cron-stale
|
||||
title: "Maintenance CronJobs stale (>3h since success)"
|
||||
- uid: maint-ariadne-image-sweeper-stale
|
||||
title: "Ariadne image sweeper stale (schedule >8d)"
|
||||
condition: C
|
||||
for: "5m"
|
||||
data:
|
||||
@ -297,10 +314,10 @@ data:
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
|
||||
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: '{{cronjob}}'
|
||||
legendFormat: '{{task}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
@ -321,17 +338,166 @@ data:
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [10800]
|
||||
params: [691200]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: NoData
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Maintenance cronjob stale >3h since last success"
|
||||
summary: "Ariadne image sweeper stale >8d since last success"
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: maint-cron-stale
|
||||
title: "Maintenance CronJobs stale (legacy disabled)"
|
||||
condition: C
|
||||
for: "5m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: vector(0)
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: legacy
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
- refId: B
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
type: reduce
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: B
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [1]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: "Legacy cronjob alert disabled"
|
||||
labels:
|
||||
severity: info
|
||||
- orgId: 1
|
||||
name: ariadne
|
||||
folder: Alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: ariadne-schedule-error
|
||||
title: "Ariadne schedule task failed"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: '{{task}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
- refId: B
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
type: reduce
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: B
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [1]
|
||||
type: lt
|
||||
operator:
|
||||
type: and
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Ariadne schedule failed ({{ $labels.task }})"
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: ariadne-scheduler-stalled
|
||||
title: "Ariadne scheduler behind (>15m)"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: '{{task}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
- refId: B
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
type: reduce
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: B
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [900]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Ariadne scheduler behind for {{ $labels.task }}"
|
||||
labels:
|
||||
severity: warning
|
||||
- orgId: 1
|
||||
@ -352,7 +518,7 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
|
||||
expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
|
||||
legendFormat: bounce 1d
|
||||
datasource:
|
||||
type: prometheus
|
||||
@ -381,7 +547,7 @@ data:
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: NoData
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Postmark 1d bounce rate >5%"
|
||||
@ -400,7 +566,7 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: POSTMARK_API_UP
|
||||
expr: max(postmark_api_up) or on() vector(0)
|
||||
legendFormat: api up
|
||||
datasource:
|
||||
type: prometheus
|
||||
@ -429,7 +595,7 @@ data:
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: NoData
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Postmark exporter reports API down"
|
||||
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -98,7 +98,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
|
||||
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
|
||||
@ -1134,7 +1134,7 @@ data:
|
||||
{
|
||||
"id": 17,
|
||||
"type": "stat",
|
||||
"title": "Ariadne CI Coverage (%)",
|
||||
"title": "Platform CI Coverage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1147,7 +1147,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
|
||||
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{branch}}",
|
||||
"instant": true
|
||||
@ -1192,12 +1192,13 @@ data:
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
"description": "Internal source panel for Atlas Overview automation test rollups."
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Ariadne CI Tests (latest)",
|
||||
"title": "Platform CI Tests (latest)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1210,7 +1211,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
|
||||
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -1242,7 +1243,8 @@ data:
|
||||
"order": "desc"
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
"description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
|
||||
@ -1686,7 +1686,7 @@ data:
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Test Success Rate",
|
||||
"title": "Platform Test Success Rate",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1699,7 +1699,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -1718,12 +1718,13 @@ data:
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
|
||||
},
|
||||
{
|
||||
"id": 43,
|
||||
"type": "bargauge",
|
||||
"title": "Tests with Failures (24h)",
|
||||
"title": "Platform Tests with Failures (24h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1736,7 +1737,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
|
||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{result}}",
|
||||
"instant": true
|
||||
@ -1823,7 +1824,8 @@ data:
|
||||
"order": "desc"
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
"description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
@ -1910,7 +1912,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
|
||||
@ -286,7 +286,7 @@ spec:
|
||||
podAnnotations:
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/role: "monitoring"
|
||||
monitoring.bstein.dev/restart-rev: "1"
|
||||
monitoring.bstein.dev/restart-rev: "6"
|
||||
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
|
||||
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
|
||||
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
|
||||
|
||||
@ -43,6 +43,12 @@ spec:
|
||||
value: /var/run/secrets/vault-token-reviewer/token
|
||||
- name: VAULT_K8S_ROLE_TTL
|
||||
value: 1h
|
||||
- name: VAULT_K8S_BOUND_AUDIENCES
|
||||
value: "https://kubernetes.default.svc,https://kubernetes.default.svc.cluster.local,k3s"
|
||||
- name: VAULT_K8S_ISSUER
|
||||
value: https://kubernetes.default.svc.cluster.local
|
||||
- name: VAULT_K8S_DISABLE_ISS_VALIDATION
|
||||
value: "false"
|
||||
volumeMounts:
|
||||
- name: k8s-auth-config-script
|
||||
mountPath: /scripts
|
||||
|
||||
@ -53,6 +53,8 @@ ensure_token
|
||||
k8s_host="https://${KUBERNETES_SERVICE_HOST}:443"
|
||||
k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)"
|
||||
k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
|
||||
k8s_issuer="${VAULT_K8S_ISSUER:-}"
|
||||
disable_iss_validation="${VAULT_K8S_DISABLE_ISS_VALIDATION:-true}"
|
||||
role_ttl="${VAULT_K8S_ROLE_TTL:-1h}"
|
||||
token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}"
|
||||
|
||||
@ -68,11 +70,36 @@ if ! vault_cmd auth list -format=json | grep -q '"kubernetes/"'; then
|
||||
vault_cmd auth enable kubernetes
|
||||
fi
|
||||
|
||||
ensure_default_policy_login() {
|
||||
default_policy="$(vault_cmd policy read default)"
|
||||
if printf '%s' "${default_policy}" | grep -q 'auth/kubernetes/login'; then
|
||||
return
|
||||
fi
|
||||
log "updating default policy to allow kubernetes login"
|
||||
default_policy="${default_policy}
|
||||
path \"auth/kubernetes/login\" {
|
||||
capabilities = [\"create\", \"update\"]
|
||||
}
|
||||
"
|
||||
printf '%s\n' "${default_policy}" | vault_cmd policy write default -
|
||||
}
|
||||
|
||||
log "configuring kubernetes auth"
|
||||
vault_cmd write auth/kubernetes/config \
|
||||
if [ -n "${k8s_issuer}" ]; then
|
||||
vault_cmd write auth/kubernetes/config \
|
||||
token_reviewer_jwt="${token_reviewer_jwt}" \
|
||||
kubernetes_host="${k8s_host}" \
|
||||
kubernetes_ca_cert="${k8s_ca}" \
|
||||
issuer="${k8s_issuer}" \
|
||||
disable_iss_validation="${disable_iss_validation}"
|
||||
else
|
||||
vault_cmd write auth/kubernetes/config \
|
||||
token_reviewer_jwt="${token_reviewer_jwt}" \
|
||||
kubernetes_host="${k8s_host}" \
|
||||
kubernetes_ca_cert="${k8s_ca}"
|
||||
fi
|
||||
|
||||
ensure_default_policy_login
|
||||
|
||||
write_raw_policy() {
|
||||
name="$1"
|
||||
@ -87,6 +114,7 @@ write_policy_and_role() {
|
||||
service_accounts="$3"
|
||||
read_paths="$4"
|
||||
write_paths="$5"
|
||||
audiences="${VAULT_K8S_BOUND_AUDIENCES:-}"
|
||||
|
||||
policy_body=""
|
||||
for path in ${read_paths}; do
|
||||
@ -109,11 +137,42 @@ path \"kv/metadata/atlas/${path}\" {
|
||||
}
|
||||
"
|
||||
done
|
||||
if [ "${role}" = "maintenance" ]; then
|
||||
policy_body="${policy_body}
|
||||
path \"sys/auth\" {
|
||||
capabilities = [\"read\"]
|
||||
}
|
||||
path \"sys/auth/*\" {
|
||||
capabilities = [\"create\", \"update\", \"read\", \"sudo\"]
|
||||
}
|
||||
path \"auth/kubernetes/*\" {
|
||||
capabilities = [\"create\", \"update\", \"read\"]
|
||||
}
|
||||
path \"auth/oidc/*\" {
|
||||
capabilities = [\"create\", \"update\", \"read\"]
|
||||
}
|
||||
path \"sys/policies/acl\" {
|
||||
capabilities = [\"list\"]
|
||||
}
|
||||
path \"sys/policies/acl/*\" {
|
||||
capabilities = [\"create\", \"update\", \"read\"]
|
||||
}
|
||||
"
|
||||
fi
|
||||
|
||||
log "writing policy ${role}"
|
||||
printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" -
|
||||
|
||||
log "writing role ${role}"
|
||||
if [ -n "${audiences}" ]; then
|
||||
vault_cmd write "auth/kubernetes/role/${role}" \
|
||||
bound_service_account_audiences="${audiences}" \
|
||||
bound_service_account_names="${service_accounts}" \
|
||||
bound_service_account_namespaces="${namespace}" \
|
||||
policies="${role}" \
|
||||
ttl="${role_ttl}"
|
||||
return
|
||||
fi
|
||||
vault_cmd write "auth/kubernetes/role/${role}" \
|
||||
bound_service_account_names="${service_accounts}" \
|
||||
bound_service_account_namespaces="${namespace}" \
|
||||
@ -218,6 +277,8 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
|
||||
"nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
|
||||
write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
|
||||
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
|
||||
write_policy_and_role "ai" "ai" "atlasbot" \
|
||||
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
|
||||
write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
|
||||
"jenkins/* shared/harbor-pull" ""
|
||||
write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
|
||||
@ -231,7 +292,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
|
||||
write_policy_and_role "health" "health" "health-vault-sync" \
|
||||
"health/*" ""
|
||||
write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
|
||||
"maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
|
||||
"maintenance/ariadne-db maintenance/soteria-restic portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
|
||||
write_policy_and_role "finance" "finance" "finance-vault" \
|
||||
"finance/* shared/postmark-relay" ""
|
||||
write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user