Compare commits: main...feature/atlasbot

383 commits (SHA1 only; the compare table's author and date columns were empty):

fb5064fa17 e29b31ff42 ab33af3401 f275764b15 d082bee3bc b1ecc3eef8
6e6beb071b c9a9c801ec 4edc888246 917ee077ad 8c931f6a58 7ec1b812d6
62376138dd 0bab0deedf 411ad0e4ba 7c419748b7 c901a0a0cb 6df05c9adc
578ccd97e5 5e0c5b200c 28278d6c67 e3ab256336 dd02a49626 d430a480f0
0d09492984 45b2c79c72 aad8e11b37 cfebff5f08 e463674ca9 a6ceaa4cf1
64b70bf391 543880d06f 873f392b88 286925857a 36311b877b cd2e2dff17
b5a357d477 bda5871035 89490d5aa5 97de9b6d18 e5ceb234c3 a714c9994a
ced6d511ff e0600baa4b 557ccb7bbd c16113088e 18524a0065 0a7e05a735
14d90298e8 2523ebee2a 76f27b7eed 4abf16687b b20922b3ec 6007050545
1b9c78166e b8c5f547aa 8bc999a7f2 93cb39cd23 6c84d63500 9b341a865d
66541c29ca 00c8be0dd8 57d672c264 3252409a7b e656120be9 49151ad13e
9b7778f193 56cd01f4d1 c6c1ec9129 3a39e0972e 5d87aefc4b cf36ed6279
d88648bdf8 d15779e6dc 2992b8c581 0cc49081ff e6b8e4d39e cb42182358
12b81b2f0d 454017d7ea 4ebf2ad742 999be05fd9 cdb94ee7a4 e259ab8a8d
8630e626fe 3d655dda4f 0f935f7a78 e4629ec198 30b024dfc1 194404619b
77919cbf20 51db4e0612 42adbe98c0 65e3947f5a 2b52d07f95 ce020e06c0
f5437db369 455a58b982 3e044ed3fc 68d794c909 709ec5d039 33b9d678c1
c2ffad3937 ed73f69d60 773a9526dc 80d7c585e1 1272357177 47e2d706c4
ca5393bf4c dc83ead648 72d3dffd1e a2833f3c26 98c5981869 53088cc82d
0ae534e387 e5fbc8f6ed 791a14a9e5 9f29205201 bae9b1bfc2 44f376b492
e0410cfa33 45b86a3478 8523f7bc91 f2b8f79a7a d6a4c7f888 7bd069cb3b
80e94c7d67 157c93f2a9 c259c5abe4 f1d628682b d375d8a680 9c297f6609
97b2385aa2 3cdde19de0 af99e0e315 7905da3b9a 0bff5b0835 69744225bb
b15c9a6a63 23a67f0ddf f8c04770a3 e489ffca7c 80cb818fa2 d295eec276
896b4c4890 82f9147c7f 1f476b8541 d80e6bb6b6 9fe547aa09 090a22a0b5
05d6ee9d6e a7c1774044 b3b4cbecdd 13f59fb5e7 fa8777d056 4f2ae810a5
382ccfe0f1 af61d2109d c98b69e368 ccd92f6014 ab672773dd d1a490a80a
a9d235695a 1f19ae46f5 f833b61a76 b9d660fc9a 300b13f995 17d8ca3b2a
27c7aade1c dae28077b5 b202dacfb1 d72d21268b 58dc219452 7c6a731c7c
f0259086fd c3cf0e7900 a0e52401fc 51fc85587a 5d9526af73 b537a7def8
2f2a6f9132 01d0a16210 d1ac654e99 42a8c48426 0cc155cfcc c7189bff8c
3ca0df9529 1c9259b5b4 387bfadc76 50bdd18c56 19d22abd0f fd8396730c
dcabfb2ebb 5d5b5c031e d79cce72af 9147d4107f a21e58ad2c 10db6b4973
7c6a91d758 ced41aa633 cb2026b511 38d826e4ea 4a30c5c706 c43efe20dc
a80047060f 00b84f3b89 06a559c7ea eae548bdd1 8024db9d11 5dcbef6be3
f3420b79bf e7588ac06f 9272dcf0e6 76e44e949c 0915ed2205 187c0c2c88
ae12b18d6f 080e583cf9 a1b4c878c2 22bfb15bd6 8948523474 303defd745
5fb9e77634 454a759cda 26ff18e983 0b77c3bc2c 58eecc72bd 2e0ac03458
8e74a4c0b3 0aed7fe3ee 867172600f d915a2a9d7 4e92168abf ddf51be7aa
20447bb33b 5863e09ec8 0cf33797ae 71fbd8a6c1 d1f06aeb1b ce39b3ae98
6021f2f283 4629548f99 1e0a2628b6 8594b736ec 9df8ae2d38 d890b09dcd
080b94aa8b d7070647aa 3b321c5320 3e2699758f 2c4ad486a5 85f749a481
75d3946276 e95ea959bb 63bf109acb 982b401a8c f4684092be 97f0f0b508
f1f0543885 241c305034 444cd3a04b f11329ad02 af055bacb4 da08a2687d
e733ee64da 3ef7eed9a9 6a11b0fcfb c4f404df23 ae2f7cadeb 3c63722a4b
86c859bc14 7d0537f2a2 a8424251aa 180f814520 0c3d2f918d 20b3193cb8
a851e184ca a5d4c63cd3 27a8a701ef 0ee2b8b059 beefc5ab31 92d557ed98
ed9a84aaa6 f5b6c7ed97 22c90e301e fadf071b31 5c69ea62a2 04550a116d
05005ac676 215edef09d f2b0f76d15 f3f80ee114 9affb59632 25c0202743
bc9ee3138a 8ee96e02c4 b476f757f4 d5cbc823c6 70048e1081 7b5ac0fbb5
96b1ceeb97 64782af56c ce3332503b 8351945510 7152829271 b68b778532
61354ef81b 939e5cff89 c630a399ac 79999c959b 7f7a0e1a52 8e572bf910
97779b655d 046cc509d9 254cc446af a8636be9a1 ad4cf69498 c7fc5fd411
2828b19cf7 fbb1df323b 150036626b 91e6d5740d a108590d7a 22c53aebdd
8325797b3c 5498e7d91b bc817a3c6b a39bc77344 8ff45092cb a8677f36c6
6391633484 cacf4427e9 9a5e20bd5c aec7fa43d6 74cd39d183 d286d169a0
454c7fbd20 2eca794ccc 9787c19fac 1aceaed741 9662d36ad3 caa6b73336
b2c7ca8cf1 22670e4730 87c6e085a4 bb0acd4f60 171356a351 250fe22288
433f3ef3d6 1d248bf91a 06cda3f540 55aa8eb0bb 5e24ec17c9 5cf843cb6a
ee6a6fae8d baaa5dc79f ebf0e4faaa a20f3cc0ce 78c72a71a2 26e35ffbaf
fb2feec1b5 f94926a387 ded56ccf89 44788b3132 f913956d08 cead8d3561
88a4e93194 976b5994dd 1186722bb5 1f660f9dd5 1c480edb47 55d01ee539
d56438fe06 cfa2b4c08b 47bce5483c ee3543e70d a97b58bf2d 780cdd6c1b
5cec8cbe32 d9e34582ec 2d0e1fab34 6350a07cc5 c4ecc07e58
.gitignore (vendored, 1 line added)

```diff
@@ -2,6 +2,7 @@
 !README.md
 !knowledge/**/*.md
 !services/comms/knowledge/**/*.md
+!services/atlasbot/knowledge/**/*.md
 __pycache__/
 *.py[cod]
 .pytest_cache
```
clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml (new file, 26 lines)

```yaml
# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: atlasbot
  namespace: ai
spec:
  interval: 1m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/atlasbot
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(atlasbot): automated image update"
    push:
      branch: feature/atlasbot
  update:
    strategy: Setters
    path: services/atlasbot
```
clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml (new file, 17 lines)

```yaml
# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: atlasbot
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  path: ./services/atlasbot
  targetNamespace: ai
  timeout: 2m
  dependsOn:
    - name: ai-llm
```
Image automation for services/bstein-dev-home (branch retargeted)

```diff
@@ -13,14 +13,14 @@ spec:
   git:
     checkout:
       ref:
-        branch: feature/ariadne
+        branch: feature/atlasbot
     commit:
       author:
         email: ops@bstein.dev
         name: flux-bot
       messageTemplate: "chore(bstein-dev-home): automated image update"
     push:
-      branch: feature/ariadne
+      branch: feature/atlasbot
   update:
     strategy: Setters
     path: services/bstein-dev-home
```
clusters/atlas/flux-system/applications/comms/image-automation.yaml (new file, 26 lines)

```yaml
# clusters/atlas/flux-system/applications/comms/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: comms
  namespace: comms
spec:
  interval: 1m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/atlasbot
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(comms): automated image update"
    push:
      branch: feature/atlasbot
  update:
    strategy: Setters
    path: services/comms
```
Applications kustomization (atlasbot and comms image automation wired in)

```diff
@@ -6,6 +6,9 @@ resources:
 - vault/kustomization.yaml
 - vaultwarden/kustomization.yaml
 - comms/kustomization.yaml
+- comms/image-automation.yaml
+- atlasbot/kustomization.yaml
+- atlasbot/image-automation.yaml
 - crypto/kustomization.yaml
 - monerod/kustomization.yaml
 - pegasus/kustomization.yaml
```
flux-system GitRepository (branch retargeted)

```diff
@@ -9,7 +9,7 @@ metadata:
 spec:
   interval: 1m0s
   ref:
-    branch: feature/ariadne
+    branch: feature/atlasbot
   secretRef:
     name: flux-system-gitea
   url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
```
Platform kustomization (NATS added)

```diff
@@ -16,5 +16,6 @@ resources:
 - longhorn/kustomization.yaml
 - longhorn-ui/kustomization.yaml
 - postgres/kustomization.yaml
+- nats/kustomization.yaml
 - ../platform/vault-csi/kustomization.yaml
 - ../platform/vault-injector/kustomization.yaml
```
Image automation for services/maintenance (branch retargeted)

```diff
@@ -13,14 +13,14 @@ spec:
   git:
     checkout:
       ref:
-        branch: feature/ariadne
+        branch: feature/atlasbot
     commit:
       author:
         email: ops@bstein.dev
         name: flux-bot
       messageTemplate: "chore(maintenance): automated image update"
     push:
-      branch: feature/ariadne
+      branch: feature/atlasbot
   update:
     strategy: Setters
     path: services/maintenance
```
clusters/atlas/flux-system/platform/nats/kustomization.yaml (new file, 21 lines)

```yaml
# clusters/atlas/flux-system/platform/nats/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: nats
  namespace: flux-system
spec:
  interval: 10m
  path: ./infrastructure/nats
  prune: true
  force: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  targetNamespace: nats
  healthChecks:
    - apiVersion: apps/v1
      kind: StatefulSet
      name: nats
      namespace: nats
  wait: true
```
dockerfiles/Dockerfile.synapse-admin-ensure (new file, 3 lines)

```dockerfile
FROM python:3.11-slim

RUN pip install --no-cache-dir psycopg2-binary bcrypt
```
Core infrastructure kustomization (Longhorn node taints added)

```diff
@@ -6,6 +6,7 @@ resources:
 - ../modules/profiles/atlas-ha
 - coredns-custom.yaml
 - coredns-deployment.yaml
+- longhorn-node-taints.yaml
 - ntp-sync-daemonset.yaml
 - ../sources/cert-manager/letsencrypt.yaml
 - ../sources/cert-manager/letsencrypt-prod.yaml
```
infrastructure/core/longhorn-node-taints.yaml (new file, 40 lines)

```yaml
# infrastructure/core/longhorn-node-taints.yaml
apiVersion: v1
kind: Node
metadata:
  name: titan-13
spec:
  taints:
    - key: longhorn
      value: "true"
      effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
  name: titan-15
spec:
  taints:
    - key: longhorn
      value: "true"
      effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
  name: titan-17
spec:
  taints:
    - key: longhorn
      value: "true"
      effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
  name: titan-19
spec:
  taints:
    - key: longhorn
      value: "true"
      effect: PreferNoSchedule
```
infrastructure/longhorn/core/backup-target.yaml (new file, 10 lines)

```yaml
# infrastructure/longhorn/core/backup-target.yaml
apiVersion: longhorn.io/v1beta2
kind: BackupTarget
metadata:
  name: default
  namespace: longhorn-system
spec:
  backupTargetURL: "s3://atlas-soteria@us-west-004/"
  credentialSecret: longhorn-backup-b2
  pollInterval: 5m0s
```
Longhorn core kustomization (backup target added)

```diff
@@ -6,6 +6,7 @@ resources:
 - vault-serviceaccount.yaml
 - secretproviderclass.yaml
 - vault-sync-deployment.yaml
+- backup-target.yaml
 - helmrelease.yaml
 - longhorn-settings-ensure-job.yaml
```
Longhorn SecretProviderClass (B2 backup credentials synced from Vault)

```diff
@@ -13,9 +13,27 @@ spec:
       - objectName: "harbor-pull__dockerconfigjson"
         secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
+      - objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_ACCESS_KEY_ID"
+      - objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_SECRET_ACCESS_KEY"
+      - objectName: "longhorn_backup__AWS_ENDPOINTS"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_ENDPOINTS"
   secretObjects:
     - secretName: longhorn-registry
       type: kubernetes.io/dockerconfigjson
       data:
         - objectName: harbor-pull__dockerconfigjson
           key: .dockerconfigjson
+    - secretName: longhorn-backup-b2
+      type: Opaque
+      data:
+        - objectName: longhorn_backup__AWS_ACCESS_KEY_ID
+          key: AWS_ACCESS_KEY_ID
+        - objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
+          key: AWS_SECRET_ACCESS_KEY
+        - objectName: longhorn_backup__AWS_ENDPOINTS
+          key: AWS_ENDPOINTS
```
infrastructure/nats/configmap.yaml (new file, 17 lines)

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: nats-config
  namespace: nats
  labels:
    app: nats
    component: config
  annotations:
    description: "NATS JetStream configuration"
data:
  nats.conf: |
    jetstream {
      store_dir: /data
      max_mem_store: 128MB
      max_file_store: 1GB
    }
```
infrastructure/nats/kustomization.yaml (new file, 7 lines)

```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - configmap.yaml
  - service.yaml
  - statefulset.yaml
```
infrastructure/nats/namespace.yaml (new file, 4 lines)

```yaml
apiVersion: v1
kind: Namespace
metadata:
  name: nats
```
infrastructure/nats/service.yaml (new file, 17 lines)

```yaml
apiVersion: v1
kind: Service
metadata:
  name: nats
  namespace: nats
  labels:
    app: nats
spec:
  selector:
    app: nats
  ports:
    - name: client
      port: 4222
      targetPort: 4222
    - name: monitoring
      port: 8222
      targetPort: 8222
```
infrastructure/nats/statefulset.yaml (new file, 54 lines)

```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: nats
  namespace: nats
  labels:
    app: nats
spec:
  serviceName: nats
  replicas: 1
  selector:
    matchLabels:
      app: nats
  template:
    metadata:
      labels:
        app: nats
    spec:
      containers:
        - name: nats
          image: nats:2.10.18
          args:
            - "-c"
            - "/etc/nats/nats.conf"
          ports:
            - name: client
              containerPort: 4222
            - name: monitoring
              containerPort: 8222
          volumeMounts:
            - name: config
              mountPath: /etc/nats
            - name: data
              mountPath: /data
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        - name: config
          configMap:
            name: nats-config
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 2Gi
```
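The atlasbot Deployment further down in this compare points `ATLASBOT_NATS_URL` at this StatefulSet's Service. As a rough illustration of what a client of this JetStream instance could look like, here is a minimal sketch using the nats-py client; the stream and subject names come from the deployment's env vars, while the payload shape is an assumption, not taken from the atlasbot source.

```python
# Minimal sketch: publish an atlasbot request to the JetStream instance
# defined above. Assumes the nats-py client (pip install nats-py).
import asyncio
import json

import nats


async def main() -> None:
    nc = await nats.connect("nats://nats.nats.svc.cluster.local:4222")
    js = nc.jetstream()
    # Idempotently create the stream named by the deployment env vars
    # (ATLASBOT_NATS_STREAM=atlasbot, ATLASBOT_NATS_SUBJECT=atlasbot.requests).
    await js.add_stream(name="atlasbot", subjects=["atlasbot.requests"])
    # Payload fields are hypothetical; atlasbot's real schema is not shown here.
    ack = await js.publish(
        "atlasbot.requests",
        json.dumps({"room": "!example:live.bstein.dev", "text": "ping"}).encode(),
    )
    print(f"stored as stream sequence {ack.seq}")
    await nc.close()


asyncio.run(main())
```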
PromQL helpers in the knowledge/metrics render script (GPU resource regex and vector matching)

```diff
@@ -47,6 +47,7 @@ PERCENT_THRESHOLDS = {
 }

 NAMESPACE_CPU_WINDOW = "1m"
+GPU_RESOURCE_REGEX = r"nvidia[.]com/gpu.*|nvidia_com_gpu.*"

 # ---------------------------------------------------------------------------
 # Cluster metadata
```

```diff
@@ -235,13 +236,16 @@ def gpu_util_by_hostname():


 def gpu_node_labels():
-    return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
+    return (
+        f'(max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0))'
+        ' or kube_node_labels{label_jetson="true"}'
+    )


 def gpu_requests_by_namespace_node(scope_var):
     return (
         "sum by (namespace,node) ("
-        f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
+        f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
         "* on(namespace,pod) group_left(node) kube_pod_info "
         f"* on(node) group_left() ({gpu_node_labels()})"
         ")"
```

```diff
@@ -253,7 +257,7 @@ def gpu_usage_by_namespace(scope_var):
     total_by_node = f"sum by (node) ({requests_by_ns})"
     return (
         "sum by (namespace) ("
-        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
+        f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
         f"* on(node) group_left() ({gpu_util_by_node()})"
         ")"
     )
```
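The last hunk is a vector-matching fix: `requests_by_ns` is grouped by `(namespace, node)` while `total_by_node` is grouped by `(node)` only, and with PromQL's default one-to-one matching the division finds no matching label sets and returns an empty vector. A small sketch of the corrected construction, in the same f-string style the script uses; the queries below are simplified stand-ins, not the script's exact expressions.

```python
# Why the hunk above adds "on(node) group_left()": dividing a
# (namespace,node)-labeled vector by a (node)-labeled vector needs explicit
# many-to-one matching on the shared label.
requests_by_ns = (
    'sum by (namespace,node) '
    '(kube_pod_container_resource_requests{resource=~"nvidia[.]com/gpu.*"})'
)
total_by_node = f"sum by (node) ({requests_by_ns})"

# "on(node)" restricts matching to the node label; "group_left()" allows many
# left-hand (namespace,node) series per right-hand (node) series.
share_by_ns = (
    f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1)"
)
print(share_by_ns)
```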
scripts/knowledge_render_atlas.py (the --sync-comms flag becomes --sync-atlasbot)

```diff
@@ -539,9 +539,9 @@ def main() -> int:
         help="Write generated files (otherwise just print a summary).",
     )
     ap.add_argument(
-        "--sync-comms",
+        "--sync-atlasbot",
         action="store_true",
-        help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
+        help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
     )
     args = ap.parse_args()
```

```diff
@@ -632,10 +632,10 @@ def main() -> int:
     print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")

-    if args.sync_comms:
-        comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
-        _sync_tree(out_dir, comms_dir)
-        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
+    if args.sync_atlasbot:
+        atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
+        _sync_tree(out_dir, atlasbot_dir)
+        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
     return 0
```
atlasbot-deployment.yaml (moved from namespace comms to ai; per-tier Vault credentials injected)

```diff
@@ -3,7 +3,7 @@ apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: atlasbot
-  namespace: comms
+  namespace: ai
 labels:
   app: atlasbot
 spec:
```

```diff
@@ -18,7 +18,7 @@ spec:
     annotations:
       checksum/atlasbot-configmap: manual-atlasbot-101
       vault.hashicorp.com/agent-inject: "true"
-      vault.hashicorp.com/role: "comms"
+      vault.hashicorp.com/role: "ai"
       vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
       vault.hashicorp.com/agent-inject-template-turn-secret: |
         {{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
```

```diff
@@ -28,6 +28,15 @@ spec:
       vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
       vault.hashicorp.com/agent-inject-template-bot-pass: |
         {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
+      vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+      vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
+        {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
+      vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+      vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
+        {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
+      vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+      vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
+        {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
       vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
       vault.hashicorp.com/agent-inject-template-seeder-pass: |
         {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
```
atlasbot-deployment.yaml, continued (built image replaces inline python:3.11-slim; env reworked for quick/smart/genius tiers, NATS queueing, and budget limits). Only value replacements captured as old/new pairs carry -/+ markers below; single-captured lines are shown unmarked.

```diff
@@ -58,17 +67,17 @@ spec:
         hardware: rpi5
       containers:
         - name: atlasbot
-          image: python:3.11-slim
+          image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
           command: ["/bin/sh","-c"]
           args:
             - |
-              . /vault/scripts/comms_vault_env.sh
-              exec python /app/bot.py
+              . /vault/scripts/atlasbot_vault_env.sh
+              exec python -m atlasbot.main
           env:
             - name: MATRIX_BASE
-              value: http://othrys-synapse-matrix-synapse:8008
+              value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
             - name: AUTH_BASE
-              value: http://matrix-authentication-service:8080
+              value: http://matrix-authentication-service.comms.svc.cluster.local:8080
             - name: KB_DIR
               value: /kb
             - name: VM_URL
```

```diff
@@ -76,27 +85,61 @@ spec:
             - name: ARIADNE_STATE_URL
               value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
             - name: BOT_USER
-              value: atlasbot
+              value: atlas-smart
             - name: BOT_USER_QUICK
               value: atlas-quick
             - name: BOT_USER_SMART
               value: atlas-smart
             - name: BOT_USER_GENIUS
               value: atlas-genius
             - name: BOT_MENTIONS
-              value: atlasbot,aatlasbot,atlas_quick,atlas_smart
+              value: atlas-quick,atlas-smart,atlas-genius
             - name: OLLAMA_URL
               value: http://ollama.ai.svc.cluster.local:11434
             - name: OLLAMA_MODEL
-              value: qwen2.5:14b-instruct
+              value: qwen2.5:14b-instruct-q4_0
             - name: ATLASBOT_MODEL_FAST
               value: qwen2.5:14b-instruct-q4_0
             - name: ATLASBOT_MODEL_DEEP
               value: qwen2.5:14b-instruct
             - name: ATLASBOT_MODEL_SMART
               value: qwen2.5:14b-instruct-q4_0
             - name: ATLASBOT_MODEL_GENIUS
               value: qwen2.5:14b-instruct-q4_0
             - name: OLLAMA_FALLBACK_MODEL
               value: qwen2.5:14b-instruct-q4_0
             - name: OLLAMA_TIMEOUT_SEC
               value: "600"
             - name: ATLASBOT_THINKING_INTERVAL_SEC
-              value: "120"
+              value: "30"
             - name: ATLASBOT_SNAPSHOT_TTL_SEC
               value: "30"
             - name: ATLASBOT_HTTP_PORT
               value: "8090"
             - name: ATLASBOT_STATE_DB
               value: /data/atlasbot_state.db
             - name: ATLASBOT_QUEUE_ENABLED
               value: "false"
             - name: ATLASBOT_DEBUG_PIPELINE
               value: "true"
             - name: ATLASBOT_NATS_URL
               value: nats://nats.nats.svc.cluster.local:4222
             - name: ATLASBOT_NATS_STREAM
               value: atlasbot
             - name: ATLASBOT_NATS_SUBJECT
               value: atlasbot.requests
             - name: ATLASBOT_FAST_MAX_ANGLES
               value: "2"
             - name: ATLASBOT_SMART_MAX_ANGLES
               value: "5"
             - name: ATLASBOT_FAST_MAX_CANDIDATES
               value: "2"
             - name: ATLASBOT_SMART_MAX_CANDIDATES
               value: "6"
             - name: ATLASBOT_FAST_LLM_CALLS_MAX
               value: "24"
             - name: ATLASBOT_SMART_LLM_CALLS_MAX
               value: "48"
             - name: ATLASBOT_GENIUS_LLM_CALLS_MAX
               value: "96"
           ports:
             - name: http
               containerPort: 8090
```
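For reference, the chat-ai-gateway hunk later in this compare resolves this pod as http://atlasbot.ai.svc.cluster.local:8090/v1/answer. A minimal sketch of exercising that endpoint from inside the cluster; the request and response fields are assumptions, not the bot's actual schema.

```python
# Minimal sketch: call atlasbot's HTTP port (ATLASBOT_HTTP_PORT=8090) at the
# /v1/answer path the gateway config points at. Payload keys are hypothetical.
import requests

resp = requests.post(
    "http://atlasbot.ai.svc.cluster.local:8090/v1/answer",
    json={"question": "which nodes carry the longhorn taint?"},
    timeout=30,  # mirrors AI_ATLASBOT_TIMEOUT_SEC in the gateway config
)
resp.raise_for_status()
print(resp.json())
```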
atlasbot-deployment.yaml, continued (inline bot.py ConfigMap dropped; persistent state dir and renamed vault-env ConfigMap)

```diff
@@ -108,19 +151,15 @@ spec:
               cpu: 500m
               memory: 512Mi
           volumeMounts:
-            - name: code
-              mountPath: /app/bot.py
-              subPath: bot.py
             - name: kb
               mountPath: /kb
               readOnly: true
             - name: vault-scripts
               mountPath: /vault/scripts
               readOnly: true
+            - name: atlasbot-state
+              mountPath: /data
       volumes:
-        - name: code
-          configMap:
-            name: atlasbot
         - name: kb
           configMap:
             name: atlas-kb
```

```diff
@@ -139,5 +178,7 @@ spec:
             path: diagrams/atlas-http.mmd
         - name: vault-scripts
           configMap:
-            name: comms-vault-env
+            name: atlasbot-vault-env
           defaultMode: 0555
+        - name: atlasbot-state
+          emptyDir: {}
```
atlasbot-rbac.yaml (ServiceAccount moved to namespace ai; Harbor pull secret attached)

```diff
@@ -3,7 +3,9 @@ apiVersion: v1
 kind: ServiceAccount
 metadata:
   name: atlasbot
-  namespace: comms
+  namespace: ai
+imagePullSecrets:
+  - name: harbor-regcred
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
```

```diff
@@ -43,5 +45,4 @@ roleRef:
 subjects:
   - kind: ServiceAccount
     name: atlasbot
-    namespace: comms
-
+    namespace: ai
```
atlasbot-service.yaml (moved to namespace ai)

```diff
@@ -2,7 +2,7 @@ apiVersion: v1
 kind: Service
 metadata:
   name: atlasbot
-  namespace: comms
+  namespace: ai
 labels:
   app: atlasbot
 spec:
```
services/atlasbot/image-automation.yaml (new file, 26 lines)

```yaml
# services/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  name: atlasbot
  namespace: ai
spec:
  interval: 1m0s
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  git:
    checkout:
      ref:
        branch: feature/atlasbot
    commit:
      author:
        name: flux-bot
        email: ops@bstein.dev
      messageTemplate: "chore(atlasbot): automated image update"
    push:
      branch: feature/atlasbot
  update:
    path: services/atlasbot
    strategy: Setters
```
services/atlasbot/image.yaml (new file, 23 lines; note the header comment still reads services/comms, as committed)

```yaml
# services/comms/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
  name: atlasbot
  namespace: ai
spec:
  image: registry.bstein.dev/bstein/atlasbot
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
  name: atlasbot
  namespace: ai
spec:
  imageRepositoryRef:
    name: atlasbot
  policy:
    semver:
      range: ">=0.1.0-0"
```
services/atlasbot/knowledge/INDEX.md (new file, 22 lines)

```markdown
Atlas Knowledge Base (KB)

This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
- Accurate (grounded in GitOps + read-only cluster tools)
- Maintainable (small docs + deterministic generators)
- Safe (no secrets; refer to Secret/Vault paths by name only)

Layout
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.

Regeneration
- Update manifests/docs, then regenerate generated artifacts:
  - `python scripts/knowledge_render_atlas.py --write`

Authoring rules
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
- Keep each runbook small; one topic per file; use headings.
- When in doubt, link to the exact file path in this repo that configures the behavior.
```
services/atlasbot/knowledge/catalog/atlas-summary.json (new file, 8 lines)

```json
{
  "counts": {
    "helmrelease_host_hints": 19,
    "http_endpoints": 45,
    "services": 47,
    "workloads": 74
  }
}
```
services/atlasbot/knowledge/catalog/atlas.json (new file, 3445 lines): diff suppressed because it is too large.
services/atlasbot/knowledge/catalog/metrics.json (new file, 1880 lines): diff suppressed because it is too large.
services/atlasbot/knowledge/catalog/runbooks.json (new file, 97 lines): diff suppressed because one or more lines are too long.

services/atlasbot/knowledge/diagrams/atlas-http.mmd (new file, 234 lines)
```mermaid
flowchart LR
  host_auth_bstein_dev["auth.bstein.dev"]
  svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
  host_auth_bstein_dev --> svc_sso_oauth2_proxy
  wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
  svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
  host_bstein_dev["bstein.dev"]
  svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
  host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
  wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
  svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
  svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
  host_bstein_dev --> svc_comms_matrix_wellknown
  wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
  svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
  svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
  host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
  wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
  svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
  host_budget_bstein_dev["budget.bstein.dev"]
  svc_finance_actual_budget["finance/actual-budget (Service)"]
  host_budget_bstein_dev --> svc_finance_actual_budget
  wl_finance_actual_budget["finance/actual-budget (Deployment)"]
  svc_finance_actual_budget --> wl_finance_actual_budget
  host_call_live_bstein_dev["call.live.bstein.dev"]
  svc_comms_element_call["comms/element-call (Service)"]
  host_call_live_bstein_dev --> svc_comms_element_call
  wl_comms_element_call["comms/element-call (Deployment)"]
  svc_comms_element_call --> wl_comms_element_call
  host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
  svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
  host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
  wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
  svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
  host_ci_bstein_dev["ci.bstein.dev"]
  svc_jenkins_jenkins["jenkins/jenkins (Service)"]
  host_ci_bstein_dev --> svc_jenkins_jenkins
  wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
  svc_jenkins_jenkins --> wl_jenkins_jenkins
  host_cloud_bstein_dev["cloud.bstein.dev"]
  svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
  host_cloud_bstein_dev --> svc_nextcloud_nextcloud
  wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
  svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
  host_health_bstein_dev["health.bstein.dev"]
  svc_health_wger["health/wger (Service)"]
  host_health_bstein_dev --> svc_health_wger
  wl_health_wger["health/wger (Deployment)"]
  svc_health_wger --> wl_health_wger
  host_kit_live_bstein_dev["kit.live.bstein.dev"]
  svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
  host_kit_live_bstein_dev --> svc_comms_livekit_token_service
  wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
  svc_comms_livekit_token_service --> wl_comms_livekit_token_service
  svc_comms_livekit["comms/livekit (Service)"]
  host_kit_live_bstein_dev --> svc_comms_livekit
  wl_comms_livekit["comms/livekit (Deployment)"]
  svc_comms_livekit --> wl_comms_livekit
  host_live_bstein_dev["live.bstein.dev"]
  host_live_bstein_dev --> svc_comms_matrix_wellknown
  svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
  host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
  svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
  host_live_bstein_dev --> svc_comms_matrix_guest_register
  wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
  svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
  svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
  host_live_bstein_dev --> svc_comms_matrix_authentication_service
  wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
  svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
  host_logs_bstein_dev["logs.bstein.dev"]
  svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
  host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
  wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
  svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
  host_longhorn_bstein_dev["longhorn.bstein.dev"]
  svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
  host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
  wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
  svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
  host_mail_bstein_dev["mail.bstein.dev"]
  svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
  host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
  host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
  host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
  host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
  host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
  host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
  host_monero_bstein_dev["monero.bstein.dev"]
  svc_crypto_monerod["crypto/monerod (Service)"]
  host_monero_bstein_dev --> svc_crypto_monerod
  wl_crypto_monerod["crypto/monerod (Deployment)"]
  svc_crypto_monerod --> wl_crypto_monerod
  host_money_bstein_dev["money.bstein.dev"]
  svc_finance_firefly["finance/firefly (Service)"]
  host_money_bstein_dev --> svc_finance_firefly
  wl_finance_firefly["finance/firefly (Deployment)"]
  svc_finance_firefly --> wl_finance_firefly
  host_notes_bstein_dev["notes.bstein.dev"]
  svc_outline_outline["outline/outline (Service)"]
  host_notes_bstein_dev --> svc_outline_outline
  wl_outline_outline["outline/outline (Deployment)"]
  svc_outline_outline --> wl_outline_outline
  host_office_bstein_dev["office.bstein.dev"]
  svc_nextcloud_collabora["nextcloud/collabora (Service)"]
  host_office_bstein_dev --> svc_nextcloud_collabora
  wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
  svc_nextcloud_collabora --> wl_nextcloud_collabora
  host_pegasus_bstein_dev["pegasus.bstein.dev"]
  svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
  host_pegasus_bstein_dev --> svc_jellyfin_pegasus
  wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
  svc_jellyfin_pegasus --> wl_jellyfin_pegasus
  host_scm_bstein_dev["scm.bstein.dev"]
  svc_gitea_gitea["gitea/gitea (Service)"]
  host_scm_bstein_dev --> svc_gitea_gitea
  wl_gitea_gitea["gitea/gitea (Deployment)"]
  svc_gitea_gitea --> wl_gitea_gitea
  host_secret_bstein_dev["secret.bstein.dev"]
  svc_vault_vault["vault/vault (Service)"]
  host_secret_bstein_dev --> svc_vault_vault
  wl_vault_vault["vault/vault (StatefulSet)"]
  svc_vault_vault --> wl_vault_vault
  host_sso_bstein_dev["sso.bstein.dev"]
  svc_sso_keycloak["sso/keycloak (Service)"]
  host_sso_bstein_dev --> svc_sso_keycloak
  wl_sso_keycloak["sso/keycloak (Deployment)"]
  svc_sso_keycloak --> wl_sso_keycloak
  host_stream_bstein_dev["stream.bstein.dev"]
  svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
  host_stream_bstein_dev --> svc_jellyfin_jellyfin
  wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
  svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
  host_tasks_bstein_dev["tasks.bstein.dev"]
  svc_planka_planka["planka/planka (Service)"]
  host_tasks_bstein_dev --> svc_planka_planka
  wl_planka_planka["planka/planka (Deployment)"]
  svc_planka_planka --> wl_planka_planka
  host_vault_bstein_dev["vault.bstein.dev"]
  svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
  host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
  wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
  svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden

  subgraph bstein_dev_home[bstein-dev-home]
    svc_bstein_dev_home_bstein_dev_home_frontend
    wl_bstein_dev_home_bstein_dev_home_frontend
    svc_bstein_dev_home_bstein_dev_home_backend
    wl_bstein_dev_home_bstein_dev_home_backend
    svc_bstein_dev_home_chat_ai_gateway
    wl_bstein_dev_home_chat_ai_gateway
  end
  subgraph comms[comms]
    svc_comms_matrix_wellknown
    wl_comms_matrix_wellknown
    svc_comms_element_call
    wl_comms_element_call
    svc_comms_livekit_token_service
    wl_comms_livekit_token_service
    svc_comms_livekit
    wl_comms_livekit
    svc_comms_othrys_synapse_matrix_synapse
    svc_comms_matrix_guest_register
    wl_comms_matrix_guest_register
    svc_comms_matrix_authentication_service
    wl_comms_matrix_authentication_service
  end
  subgraph crypto[crypto]
    svc_crypto_monerod
    wl_crypto_monerod
  end
  subgraph finance[finance]
    svc_finance_actual_budget
    wl_finance_actual_budget
    svc_finance_firefly
    wl_finance_firefly
  end
  subgraph gitea[gitea]
    svc_gitea_gitea
    wl_gitea_gitea
  end
  subgraph health[health]
    svc_health_wger
    wl_health_wger
  end
  subgraph jellyfin[jellyfin]
    svc_jellyfin_pegasus
    wl_jellyfin_pegasus
    svc_jellyfin_jellyfin
    wl_jellyfin_jellyfin
  end
  subgraph jenkins[jenkins]
    svc_jenkins_jenkins
    wl_jenkins_jenkins
  end
  subgraph logging[logging]
    svc_logging_oauth2_proxy_logs
    wl_logging_oauth2_proxy_logs
  end
  subgraph longhorn_system[longhorn-system]
    svc_longhorn_system_oauth2_proxy_longhorn
    wl_longhorn_system_oauth2_proxy_longhorn
  end
  subgraph mailu_mailserver[mailu-mailserver]
    svc_mailu_mailserver_mailu_front
  end
  subgraph nextcloud[nextcloud]
    svc_nextcloud_nextcloud
    wl_nextcloud_nextcloud
    svc_nextcloud_collabora
    wl_nextcloud_collabora
  end
  subgraph outline[outline]
    svc_outline_outline
    wl_outline_outline
  end
  subgraph planka[planka]
    svc_planka_planka
    wl_planka_planka
  end
  subgraph sso[sso]
    svc_sso_oauth2_proxy
    wl_sso_oauth2_proxy
    svc_sso_keycloak
    wl_sso_keycloak
  end
  subgraph vault[vault]
    svc_vault_vault
    wl_vault_vault
  end
  subgraph vaultwarden[vaultwarden]
    svc_vaultwarden_vaultwarden_service
    wl_vaultwarden_vaultwarden
  end
```
services/atlasbot/kustomization.yaml (new file, 29 lines)

```yaml
# services/atlasbot/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
  - atlasbot-deployment.yaml
  - atlasbot-service.yaml
  - atlasbot-rbac.yaml
  - secretproviderclass.yaml
  - vault-sync-deployment.yaml
  - image.yaml
  - image-automation.yaml
images:
  - name: registry.bstein.dev/bstein/atlasbot
    newTag: 0.1.2-97 # {"$imagepolicy": "ai:atlasbot:tag"}
configMapGenerator:
  - name: atlasbot-vault-env
    files:
      - atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
    options:
      disableNameSuffixHash: true
  - name: atlas-kb
    files:
      - INDEX.md=knowledge/INDEX.md
      - atlas.json=knowledge/catalog/atlas.json
      - atlas-summary.json=knowledge/catalog/atlas-summary.json
      - metrics.json=knowledge/catalog/metrics.json
      - runbooks.json=knowledge/catalog/runbooks.json
      - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd
```
services/atlasbot/scripts/atlasbot_vault_env.sh (new file, 44 lines)

```sh
#!/usr/bin/env sh
set -eu

vault_dir="/vault/secrets"

read_secret() {
  tr -d '\r\n' < "${vault_dir}/$1"
}

read_optional() {
  if [ -f "${vault_dir}/$1" ]; then
    tr -d '\r\n' < "${vault_dir}/$1"
  else
    printf ''
  fi
}

export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"

export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"

export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
  export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
  export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)"

export CHAT_API_KEY="$(read_secret chat-matrix)"
export CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"

export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"
export PGPASSWORD="$(read_secret synapse-db-pass)"

export MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
export MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
export KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"
```
services/atlasbot/secretproviderclass.yaml (new file, 21 lines)

```yaml
# services/atlasbot/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
  name: atlasbot-vault
  namespace: ai
spec:
  provider: vault
  parameters:
    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
    roleName: "ai"
    objects: |
      - objectName: "harbor-pull__dockerconfigjson"
        secretPath: "kv/data/atlas/shared/harbor-pull"
        secretKey: "dockerconfigjson"
  secretObjects:
    - secretName: harbor-regcred
      type: kubernetes.io/dockerconfigjson
      data:
        - objectName: harbor-pull__dockerconfigjson
          key: .dockerconfigjson
```
services/atlasbot/vault-sync-deployment.yaml (new file, 34 lines)

```yaml
# services/atlasbot/vault-sync-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: atlasbot-vault-sync
  namespace: ai
spec:
  replicas: 1
  selector:
    matchLabels:
      app: atlasbot-vault-sync
  template:
    metadata:
      labels:
        app: atlasbot-vault-sync
    spec:
      serviceAccountName: atlasbot
      containers:
        - name: sync
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - "sleep infinity"
          volumeMounts:
            - name: vault-secrets
              mountPath: /vault/secrets
              readOnly: true
      volumes:
        - name: vault-secrets
          csi:
            driver: secrets-store.csi.k8s.io
            readOnly: true
            volumeAttributes:
              secretProviderClass: atlasbot-vault
```
chat-ai-gateway configuration (atlasbot endpoint moved to namespace ai; model tiers exposed)

```diff
@@ -68,7 +68,11 @@ spec:
         - name: AI_CHAT_TIMEOUT_SEC
           value: "480"
         - name: AI_ATLASBOT_ENDPOINT
-          value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
+          value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer
+        - name: AI_ATLASBOT_MODEL_FAST
+          value: qwen2.5:14b-instruct-q4_0
+        - name: AI_ATLASBOT_MODEL_SMART
+          value: qwen2.5:14b-instruct
         - name: AI_ATLASBOT_TIMEOUT_SEC
           value: "30"
         - name: AI_NODE_NAME
```
services/bstein-dev-home kustomization (image tags pinned to 0.1.1-119)

```diff
@@ -20,9 +20,9 @@ resources:
 - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home
```
services/comms/kustomization.yaml (atlasbot manifests and ConfigMaps removed; they now live under services/atlasbot)

```diff
@@ -13,10 +13,7 @@ resources:
 - element-call-deployment.yaml
 - guest-register-deployment.yaml
 - guest-register-service.yaml
-- atlasbot-deployment.yaml
-- atlasbot-service.yaml
 - wellknown.yaml
-- atlasbot-rbac.yaml
 - mas-secrets-ensure-rbac.yaml
 - comms-secrets-ensure-rbac.yaml
 - mas-db-ensure-rbac.yaml
```

```diff
@@ -43,7 +40,6 @@ resources:
 - livekit-ingress.yaml
 - livekit-middlewares.yaml
 - matrix-ingress.yaml

 configMapGenerator:
   - name: comms-vault-env
     files:
```

```diff
@@ -60,21 +56,8 @@ configMapGenerator:
       - server.py=scripts/guest-register/server.py
     options:
       disableNameSuffixHash: true
-  - name: atlasbot
-    files:
-      - bot.py=scripts/atlasbot/bot.py
-    options:
-      disableNameSuffixHash: true
   - name: othrys-element-host-config
     files:
       - 20-host-config.sh=scripts/element-host-config.sh
     options:
       disableNameSuffixHash: true
-  - name: atlas-kb
-    files:
-      - INDEX.md=knowledge/INDEX.md
-      - atlas.json=knowledge/catalog/atlas.json
-      - atlas-summary.json=knowledge/catalog/atlas-summary.json
-      - metrics.json=knowledge/catalog/metrics.json
-      - runbooks.json=knowledge/catalog/runbooks.json
-      - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd
```
services/comms/oneoffs/comms-secrets-ensure-job.yaml (re-run as generation 8; tiered bot passwords ensured)

```diff
@@ -1,12 +1,12 @@
 # services/comms/oneoffs/comms-secrets-ensure-job.yaml
-# One-off job for comms/comms-secrets-ensure-7.
-# Purpose: comms secrets ensure 7 (see container args/env in this file).
+# One-off job for comms/comms-secrets-ensure-8.
+# Purpose: comms secrets ensure 8 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: comms-secrets-ensure-7
+  name: comms-secrets-ensure-8
   namespace: comms
 spec:
   suspend: true
```

```diff
@@ -87,6 +87,9 @@ spec:
           ensure_key "comms/synapse-redis" "redis-password" >/dev/null
           ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null
           ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null
+          ensure_key "comms/atlasbot-credentials-runtime" "bot-quick-password" >/dev/null
+          ensure_key "comms/atlasbot-credentials-runtime" "bot-smart-password" >/dev/null
+          ensure_key "comms/atlasbot-credentials-runtime" "bot-genius-password" >/dev/null
           ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null

           SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")"
```
services/comms/oneoffs/mas-local-users-ensure-job.yaml (re-run as generation 19)

```diff
@@ -1,12 +1,12 @@
 # services/comms/oneoffs/mas-local-users-ensure-job.yaml
-# One-off job for comms/mas-local-users-ensure-18.
+# One-off job for comms/mas-local-users-ensure-19.
 # Purpose: mas local users ensure 18 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: mas-local-users-ensure-18
+  name: mas-local-users-ensure-19
   namespace: comms
 spec:
   suspend: true
```
```diff
@@ -27,6 +27,12 @@ spec:
       vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
       vault.hashicorp.com/agent-inject-template-bot-pass: |
         {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
+      vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+      vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
+        {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
+      vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+      vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
+        {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
       vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
       vault.hashicorp.com/agent-inject-template-seeder-pass: |
        {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
```
```diff
@@ -92,7 +98,13 @@ spec:
         - name: SEEDER_USER
           value: othrys-seeder
         - name: BOT_USER
-          value: atlasbot
+          value: atlas-smart
+        - name: BOT_USER_QUICK
+          value: atlas-quick
+        - name: BOT_USER_SMART
+          value: atlas-smart
+        - name: BOT_USER_GENIUS
+          value: atlas-genius
       command:
         - /bin/sh
         - -c
```
```diff
@@ -225,11 +237,27 @@ spec:
                   },
                   timeout=30,
               )
+              if r.status_code == 429:
+                  return False
               if r.status_code != 200:
                   raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
+              return True

           wait_for_service(MAS_ADMIN_API_BASE)
           token = admin_token()
+          bot_quick = os.environ.get("BOT_USER_QUICK", "")
+          bot_smart = os.environ.get("BOT_USER_SMART", "")
+          bot_genius = os.environ.get("BOT_USER_GENIUS", "")
+          bot_quick_pass = os.environ.get("BOT_PASS_QUICK", "")
+          bot_smart_pass = os.environ.get("BOT_PASS_SMART", "")
+          bot_genius_pass = os.environ.get("BOT_PASS_GENIUS", "") or bot_smart_pass

           ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
           ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
+          if bot_quick and bot_quick_pass:
+              ensure_user(token, bot_quick, bot_quick_pass)
+          if bot_smart and bot_smart_pass:
+              ensure_user(token, bot_smart, bot_smart_pass)
+          if bot_genius and bot_genius_pass:
+              ensure_user(token, bot_genius, bot_genius_pass)
           PY
```
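With the change above, the login helper now reports rate limiting (HTTP 429) by returning False instead of raising, which lets a caller back off and retry rather than fail the whole job. A hedged sketch of such a caller; `login` here names the bool-returning function patched above, and the backoff policy is illustrative, not taken from the job script.

```python
# Sketch: retry a rate-limited login with exponential backoff, assuming
# login(username, password) returns False on HTTP 429 and True on success.
import time


def login_with_backoff(username: str, password: str, attempts: int = 5) -> None:
    for attempt in range(attempts):
        if login(username, password):  # False means Synapse answered 429
            return
        time.sleep(2 ** attempt)  # back off before the next try
    raise RuntimeError(f"login for {username} still rate limited after {attempts} tries")
```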
services/comms/oneoffs/synapse-admin-ensure-job.yaml (re-run as generation 15, left unsuspended)

```diff
@@ -1,15 +1,15 @@
 # services/comms/oneoffs/synapse-admin-ensure-job.yaml
-# One-off job for comms/synapse-admin-ensure-3.
-# Purpose: synapse admin ensure 3 (see container args/env in this file).
+# One-off job for comms/synapse-admin-ensure-15.
+# Purpose: synapse admin ensure 15 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: synapse-admin-ensure-3
+  name: synapse-admin-ensure-15
   namespace: comms
 spec:
-  suspend: true
+  suspend: false
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
```
```diff
@@ -32,7 +32,8 @@ spec:
                 values: ["arm64"]
       containers:
         - name: ensure
-          image: python:3.11-slim
+          image: python:3.12-slim
+          imagePullPolicy: Always
           env:
             - name: VAULT_ADDR
               value: http://vault.vault.svc.cluster.local:8200
```
```diff
@@ -45,22 +46,20 @@ spec:
             - -c
             - |
               set -euo pipefail
-              pip install --no-cache-dir psycopg2-binary bcrypt
+              python -m pip install --no-cache-dir psycopg2-binary
               python - <<'PY'
               import json
               import os
               import secrets
               import string
               import time
+              import urllib.error
               import urllib.parse
               import urllib.request

-              import bcrypt
               import psycopg2

               VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
               VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
               SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+              SYNAPSE_ADMIN_URL = os.environ.get("SYNAPSE_ADMIN_URL", "").rstrip("/")
               PGHOST = "postgres-service.postgres.svc.cluster.local"
               PGPORT = 5432
               PGDATABASE = "synapse"
```
@ -113,48 +112,15 @@ spec:
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
resp.read()

def random_password(length: int = 32) -> str:
alphabet = string.ascii_letters + string.digits
return "".join(secrets.choice(alphabet) for _ in range(length))

def ensure_admin_creds(token: str) -> dict:
data = vault_get(token, "comms/synapse-admin")
username = (data.get("username") or "").strip() or "synapse-admin"
password = (data.get("password") or "").strip()
if not password:
password = random_password()
username = "othrys-seeder"
if data.get("username") != username:
data["username"] = username
data["password"] = password
data.pop("access_token", None)
vault_put(token, "comms/synapse-admin", data)
return data

def ensure_user(cur, cols, user_id, password, admin):
now_ms = int(time.time() * 1000)
values = {
"name": user_id,
"password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
"creation_ts": now_ms,
}

def add_flag(name, flag):
if name not in cols:
return
if cols[name]["type"] in ("smallint", "integer"):
values[name] = int(flag)
else:
values[name] = bool(flag)

add_flag("admin", admin)
add_flag("deactivated", False)
add_flag("shadow_banned", False)
add_flag("is_guest", False)

columns = list(values.keys())
placeholders = ", ".join(["%s"] * len(columns))
updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
cur.execute(query, [values[c] for c in columns])

def get_cols(cur):
cur.execute(
"""
@@ -172,30 +138,40 @@ spec:
}
return cols

def ensure_access_token(cur, user_id, token_value):
cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens")
token_id = cur.fetchone()[0]
cur.execute(
"""
INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms)
VALUES (%s, %s, %s, %s, NULL)
ON CONFLICT (token) DO NOTHING
""",
(token_id, user_id, token_value, "ariadne-admin"),
)
def admin_token_valid(token: str, user_id: str) -> bool:
if not token or not SYNAPSE_ADMIN_URL:
return False
encoded = urllib.parse.quote(user_id, safe="")
url = f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v2/users/{encoded}"
req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
try:
with urllib.request.urlopen(req, timeout=30) as resp:
resp.read()
return True
except urllib.error.HTTPError as exc:
if exc.code == 404:
return True
if exc.code in (401, 403):
return False
raise

vault_token = vault_login()
admin_data = ensure_admin_creds(vault_token)
if admin_data.get("access_token"):
log("synapse admin token already present")
user_id = f"@{admin_data['username']}:live.bstein.dev"
existing_token = admin_data.get("access_token")
if existing_token and admin_token_valid(existing_token, user_id):
log("synapse admin token already present and valid")
raise SystemExit(0)
if existing_token:
log("synapse admin token invalid; rotating")
admin_data.pop("access_token", None)
vault_put(vault_token, "comms/synapse-admin", admin_data)

synapse_db = vault_get(vault_token, "comms/synapse-db")
pg_password = synapse_db.get("POSTGRES_PASSWORD")
if not pg_password:
raise RuntimeError("synapse db password missing")

user_id = f"@{admin_data['username']}:live.bstein.dev"
conn = psycopg2.connect(
host=PGHOST,
port=PGPORT,
@@ -203,17 +179,34 @@ spec:
user=PGUSER,
password=pg_password,
)
token_value = secrets.token_urlsafe(32)
try:
with conn:
with conn.cursor() as cur:
cols = get_cols(cur)
ensure_user(cur, cols, user_id, admin_data["password"], True)
ensure_access_token(cur, user_id, token_value)
if "admin" not in cols:
raise RuntimeError("users.admin column missing")
cur.execute(
"UPDATE users SET admin = TRUE WHERE name = %s",
(user_id,),
)
cur.execute(
"""
SELECT token FROM access_tokens
WHERE user_id = %s AND valid_until_ms IS NULL
ORDER BY id DESC LIMIT 1
""",
(user_id,),
)
row = cur.fetchone()
if not row:
raise RuntimeError(f"no access token found for {user_id}")
token_value = row[0]
finally:
conn.close()

admin_data["access_token"] = token_value
vault_put(vault_token, "comms/synapse-admin", admin_data)
if not admin_token_valid(token_value, user_id):
raise RuntimeError("synapse admin token validation failed")
log("synapse admin token stored")
PY

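Note on admin_token_valid above: a 200 or 404 from the admin users endpoint proves the token authenticates (404 only means the probed user does not exist), while 401/403 means the token must be rotated. The same probe as a self-contained script runnable outside the Job; the environment-variable wiring here is illustrative, not from the manifest:

import os
import urllib.error
import urllib.parse
import urllib.request

admin_url = os.environ["SYNAPSE_ADMIN_URL"].rstrip("/")
token = os.environ["SYNAPSE_ADMIN_TOKEN"]  # illustrative variable name
user_id = os.environ.get("PROBE_USER_ID", "@othrys-seeder:live.bstein.dev")

url = f"{admin_url}/_synapse/admin/v2/users/{urllib.parse.quote(user_id, safe='')}"
req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
try:
    with urllib.request.urlopen(req, timeout=30) as resp:
        resp.read()
    print("token valid (200)")
except urllib.error.HTTPError as exc:
    if exc.code == 404:
        print("token valid (404: user absent, auth accepted)")
    elif exc.code in (401, 403):
        print("token invalid; rotate it")
    else:
        raise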
@@ -82,8 +82,6 @@ spec:
value: synapse
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlasbot
command:
- /bin/sh
- -c
@@ -141,10 +139,8 @@ spec:
cur.execute(query, [values[c] for c in columns])

seeder_user = os.environ["SEEDER_USER"]
bot_user = os.environ["BOT_USER"]
server = "live.bstein.dev"
seeder_id = f"@{seeder_user}:{server}"
bot_id = f"@{bot_user}:{server}"

conn = psycopg2.connect(
host=os.environ["PGHOST"],
@@ -158,7 +154,6 @@ spec:
with conn.cursor() as cur:
cols = get_cols(cur)
upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True)
upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False)
finally:
conn.close()
PY

@@ -76,7 +76,7 @@ spec:
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlasbot
value: atlas-smart
command:
- /bin/sh
- -c

@@ -11,8 +11,12 @@ from urllib import error, parse, request

BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
USER = os.environ["BOT_USER"]
PASSWORD = os.environ["BOT_PASS"]
BOT_USER = os.environ["BOT_USER"]
BOT_PASS = os.environ["BOT_PASS"]
BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
ROOM_ALIAS = "#othrys:live.bstein.dev"

OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
@@ -31,7 +35,7 @@ VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitor
ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")

BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")

MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
@@ -393,6 +397,31 @@ def _detect_mode_from_body(body: str, *, default: str = "deep") -> str:
return default


def _detect_mode(
content: dict[str, Any],
body: str,
*,
default: str = "deep",
account_user: str = "",
) -> str:
mode = _detect_mode_from_body(body, default=default)
mentions = content.get("m.mentions", {})
user_ids = mentions.get("user_ids", [])
if isinstance(user_ids, list):
normalized = {normalize_user_id(uid).lower() for uid in user_ids if isinstance(uid, str)}
if BOT_USER_QUICK and normalize_user_id(BOT_USER_QUICK).lower() in normalized:
return "fast"
if BOT_USER_SMART and normalize_user_id(BOT_USER_SMART).lower() in normalized:
return "deep"
if BOT_USER and normalize_user_id(BOT_USER).lower() in normalized:
return "deep"
if account_user and BOT_USER_QUICK and normalize_user_id(account_user) == normalize_user_id(BOT_USER_QUICK):
return "fast"
if account_user and BOT_USER_SMART and normalize_user_id(account_user) == normalize_user_id(BOT_USER_SMART):
return "deep"
return mode


def _model_for_mode(mode: str) -> str:
if mode == "fast" and MODEL_FAST:
return MODEL_FAST
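A condensed, self-contained rendition of the mention routing in _detect_mode; normalize_user_id here is a stand-in for the bot's own helper, and the account names mirror the env defaults above:

from typing import Any

SERVER_NAME = "live.bstein.dev"
BOT_USER = "atlas-smart"
BOT_USER_QUICK = "atlas-quick"

def normalize_user_id(user: str) -> str:
    return user if user.startswith("@") else f"@{user}:{SERVER_NAME}"

def route(content: dict[str, Any], default: str = "deep") -> str:
    user_ids = content.get("m.mentions", {}).get("user_ids", [])
    normalized = {normalize_user_id(u).lower() for u in user_ids if isinstance(u, str)}
    if normalize_user_id(BOT_USER_QUICK).lower() in normalized:
        return "fast"
    if normalize_user_id(BOT_USER).lower() in normalized:
        return "deep"
    return default

# Mentioning the quick account selects the fast model; the smart account stays deep.
assert route({"m.mentions": {"user_ids": ["@atlas-quick:live.bstein.dev"]}}) == "fast"
assert route({"m.mentions": {"user_ids": ["@atlas-smart:live.bstein.dev"]}}) == "deep"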
@@ -416,12 +445,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
raw = resp.read()
return json.loads(raw.decode()) if raw else {}

def login() -> str:
login_user = normalize_user_id(USER)
def login(user: str, password: str) -> str:
login_user = normalize_user_id(user)
payload = {
"type": "m.login.password",
"identifier": {"type": "m.id.user", "user": login_user},
"password": PASSWORD,
"password": password,
}
res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
return res["access_token"]
@@ -4820,7 +4849,7 @@ def open_ended_with_thinking(
thread.join(timeout=1)
return result["reply"] or "Model backend is busy. Try again in a moment."

def sync_loop(token: str, room_id: str):
def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
since = None
try:
res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
@@ -4861,7 +4890,7 @@ def sync_loop
if not body:
continue
sender = ev.get("sender", "")
if sender == f"@{USER}:live.bstein.dev":
if account_user and sender == normalize_user_id(account_user):
continue

mentioned = is_mentioned(content, body)
@@ -4874,7 +4903,12 @@ def sync_loop

cleaned_body = _strip_bot_mention(body)
lower_body = cleaned_body.lower()
mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep")
mode = _detect_mode(
content,
body,
default=default_mode if default_mode in ("fast", "deep") else "deep",
account_user=account_user,
)

# Only do live cluster introspection in DMs.
allow_tools = is_dm
@@ -4951,26 +4985,65 @@ def sync_loop
history[hist_key].append(f"Atlas: {reply}")
history[hist_key] = history[hist_key][-80:]

def login_with_retry():
def login_with_retry(user: str, password: str):
last_err = None
for attempt in range(10):
try:
return login()
return login(user, password)
except Exception as exc:  # noqa: BLE001
last_err = exc
time.sleep(min(30, 2 ** attempt))
raise last_err

def _bot_accounts() -> list[dict[str, str]]:
accounts: list[dict[str, str]] = []

def add(user: str, password: str, mode: str):
if not user or not password:
return
accounts.append({"user": user, "password": password, "mode": mode})

add(BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "deep")
if BOT_USER_QUICK and BOT_PASS_QUICK:
add(BOT_USER_QUICK, BOT_PASS_QUICK, "fast")
if BOT_USER and BOT_PASS and all(acc["user"] != BOT_USER for acc in accounts):
add(BOT_USER, BOT_PASS, "deep")

seen: set[str] = set()
unique: list[dict[str, str]] = []
for acc in accounts:
uid = normalize_user_id(acc["user"]).lower()
if uid in seen:
continue
seen.add(uid)
unique.append(acc)
return unique

def main():
load_kb()
_start_http_server()
token = login_with_retry()
accounts = _bot_accounts()
threads: list[threading.Thread] = []
for acc in accounts:
token = login_with_retry(acc["user"], acc["password"])
try:
room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id)
except Exception:
room_id = None
sync_loop(token, room_id)
thread = threading.Thread(
target=sync_loop,
args=(token, room_id),
kwargs={
"account_user": acc["user"],
"default_mode": acc["mode"],
},
daemon=True,
)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()

if __name__ == "__main__":
main()

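The precedence in _bot_accounts is easy to misread; here it is restated as a pure function over the relevant settings (illustrative only, tuples instead of dicts):

def bot_accounts(bot, bot_pass, quick, quick_pass, smart, smart_pass):
    accounts = []
    def add(user, password, mode):
        if user and password and all(a[0] != user for a in accounts):
            accounts.append((user, password, mode))
    add(smart or bot, smart_pass or bot_pass, "deep")  # the smart account claims the deep slot
    add(quick, quick_pass, "fast")                     # quick account only when fully configured
    add(bot, bot_pass, "deep")                         # legacy single account still logs in
    return accounts

# Legacy-only configuration yields one deep login:
assert bot_accounts("atlasbot", "pw", "", "", "", "") == [("atlasbot", "pw", "deep")]
# Fully split configuration yields three logins, one sync loop each:
assert bot_accounts("atlasbot", "pw", "atlas-quick", "q", "atlas-smart", "s") == [
    ("atlas-smart", "s", "deep"),
    ("atlas-quick", "q", "fast"),
    ("atlasbot", "pw", "deep"),
]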
@@ -7,6 +7,14 @@ read_secret() {
tr -d '\r\n' < "${vault_dir}/$1"
}

read_optional() {
if [ -f "${vault_dir}/$1" ]; then
tr -d '\r\n' < "${vault_dir}/$1"
else
printf ''
fi
}

export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"

@@ -14,6 +22,15 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"

export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)"

export CHAT_API_KEY="$(read_secret chat-matrix)"

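The shell fallbacks above resolve missing per-model passwords in two hops. The same logic in Python, for clarity (illustrative; the container reads Vault-injected files rather than a dict):

def resolve_bot_passwords(secrets: dict) -> dict:
    bot = secrets.get("bot-pass", "")
    smart = secrets.get("bot-smart-pass", "") or bot      # smart falls back to the legacy password
    genius = secrets.get("bot-genius-pass", "") or smart  # genius falls back to smart
    return {"BOT_PASS": bot, "BOT_PASS_SMART": smart, "BOT_PASS_GENIUS": genius}

assert resolve_bot_passwords({"bot-pass": "pw"}) == {
    "BOT_PASS": "pw",
    "BOT_PASS_SMART": "pw",
    "BOT_PASS_GENIUS": "pw",
}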
@@ -66,7 +66,7 @@ spec:
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlasbot
value: atlas-smart
command:
- /bin/sh
- -c

@@ -29,12 +29,18 @@ spec:
operator: In
values: ["rpi4","rpi5"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 50
- weight: 80
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
values: ["rpi5"]
- weight: 60
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
containers:
- name: monerod
image: registry.bstein.dev/crypto/monerod:0.18.4.1

@@ -23,7 +23,7 @@ spec:
- matchExpressions:
- key: hardware
operator: In
values: ["rpi4","rpi5"]
values: ["rpi5"]
containers:
- name: xmrig
image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9

@@ -123,13 +123,22 @@ spec:
- key: hardware
operator: In
values: ["rpi4","rpi5"]
- key: longhorn
operator: NotIn
values: ["true"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13","titan-15","titan-17","titan-19"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
values: ["rpi5"]
containers:
- name: gitea
image: gitea/gitea:1.23

@@ -245,6 +245,17 @@ spec:
image:
repository: registry.bstein.dev/infra/harbor-registry
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
extraEnvVars:
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
controller:
image:
repository: registry.bstein.dev/infra/harbor-registryctl
@@ -263,6 +274,10 @@ spec:
export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
{{ end }}
{{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
{{ end }}
vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
@@ -397,10 +412,10 @@ spec:
patch: |-
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/name
value: harbor-registry
value: harbor-core
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/port/number
value: 5000
value: 80
- target:
kind: Deployment
name: harbor-jobservice
@@ -464,6 +479,16 @@ spec:
value: /vault/secrets/harbor-registry-env.sh
- name: VAULT_COPY_FILES
value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
envFrom:
- $patch: replace
volumeMounts:

@@ -67,7 +67,7 @@ data:
url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
credentials('gitea-pat')
}
branches('*/master')
branches('*/main')
}
}
}
@@ -108,7 +108,7 @@ data:
url('https://scm.bstein.dev/bstein/ci-demo.git')
credentials('gitea-pat')
}
branches('*/master')
branches('*/main')
}
}
scriptPath('Jenkinsfile')
@@ -167,6 +167,58 @@ data:
}
}
}
pipelineJob('atlasbot') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/2 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/atlasbot.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('Soteria') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/soteria.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('data-prepper') {
properties {
pipelineTriggers {

@@ -48,7 +48,7 @@ spec:
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
{{ end }}
bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
spec:
serviceAccountName: jenkins
nodeSelector:

services/jenkins/dind-pvc.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
# services/jenkins/dind-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jenkins-dind-cache
namespace: jenkins
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 30Gi
storageClassName: astreae
@@ -8,6 +8,7 @@ resources:
- vault-serviceaccount.yaml
- pvc.yaml
- cache-pvc.yaml
- dind-pvc.yaml
- plugins-pvc.yaml
- configmap-jcasc.yaml
- configmap-plugins.yaml

@@ -1,12 +1,12 @@
# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14.
# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file).
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-18.
# Purpose: keycloak portal e2e execute actions email 18 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-portal-e2e-execute-actions-email-14
name: keycloak-portal-e2e-execute-actions-email-18
namespace: sso
spec:
suspend: true
@@ -70,7 +70,7 @@ spec:
- name: E2E_PROBE_USERNAME
value: robotuser
- name: E2E_PROBE_EMAIL
value: robotuser@bstein.dev
value: brad.stein+robot@gmail.com
- name: EXECUTE_ACTIONS_CLIENT_ID
value: bstein-dev-home
- name: EXECUTE_ACTIONS_REDIRECT_URI

@@ -1,12 +1,12 @@
# services/keycloak/oneoffs/realm-settings-job.yaml
# One-off job for sso/keycloak-realm-settings-36.
# Purpose: keycloak realm settings 36 (see container args/env in this file).
# One-off job for sso/keycloak-realm-settings-38.
# Purpose: keycloak realm settings 38 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-realm-settings-36
name: keycloak-realm-settings-38
namespace: sso
spec:
suspend: true
@@ -64,7 +64,7 @@ spec:
- name: KEYCLOAK_REALM
value: atlas
- name: KEYCLOAK_SMTP_HOST
value: mail.bstein.dev
value: smtp.postmarkapp.com
- name: KEYCLOAK_SMTP_PORT
value: "587"
- name: KEYCLOAK_SMTP_FROM

@@ -18,6 +18,7 @@ spec:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
maintenance.bstein.dev/restart-rev: "20260207-2"
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "maintenance"
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
@@ -105,7 +106,7 @@ spec:
node-role.kubernetes.io/worker: "true"
containers:
- name: ariadne
image: registry.bstein.dev/bstein/ariadne:0.1.0-0
image: registry.bstein.dev/bstein/ariadne:latest
imagePullPolicy: Always
command: ["/bin/sh", "-c"]
args:
@@ -285,7 +286,7 @@ spec:
- name: ARIADNE_SCHEDULE_MAILU_SYNC
value: "30 4 * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
value: "0 5 * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@@ -293,11 +294,11 @@ spec:
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
value: "0 * * * *"
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC
value: "0 5 * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_WGER_ADMIN
value: "15 3 * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
value: "0 6 * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_CRON
value: "0 3 * * *"
- name: ARIADNE_SCHEDULE_POD_CLEANER
@@ -305,11 +306,11 @@ spec:
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
value: "23 3 * * *"
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
value: "30 4 * * 0"
value: "30 4 * * *"
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
value: "0 * * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_VAULT_OIDC
value: "0 * * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@@ -330,6 +331,8 @@ spec:
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
value: "5"
- name: ARIADNE_ALERTMANAGER_URL
value: http://alertmanager.monitoring.svc.cluster.local
- name: OPENSEARCH_URL
value: http://opensearch-master.logging.svc.cluster.local:9200
- name: OPENSEARCH_LIMIT_BYTES

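The schedule changes above move several syncs from daily to every 15 minutes. A quick check of what that means in wall-clock terms, using croniter (an assumed helper library, not a dependency of this deployment):

from datetime import datetime
from croniter import croniter

start = datetime(2026, 2, 7, 12, 0)
print(croniter("0 5 * * *", start).get_next(datetime))     # old: next run 2026-02-08 05:00
print(croniter("*/15 * * * *", start).get_next(datetime))  # new: next run 2026-02-07 12:15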
@@ -29,6 +29,29 @@ rules:
- get
- list
- watch
- apiGroups: ["apps"]
resources:
- deployments
- statefulsets
- daemonsets
verbs:
- get
- list
- watch
- apiGroups: ["longhorn.io"]
resources:
- volumes
verbs:
- get
- list
- watch
- apiGroups: [""]
resources:
- events
verbs:
- get
- list
- watch
- apiGroups: [""]
resources:
- pods/exec
@@ -56,3 +79,17 @@ roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ariadne-job-spawner

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: ariadne-auth-delegator
subjects:
- kind: ServiceAccount
name: ariadne
namespace: maintenance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator

@@ -21,3 +21,26 @@ spec:
policy:
semver:
range: ">=0.1.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: soteria
namespace: maintenance
spec:
image: registry.bstein.dev/bstein/soteria
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: soteria
namespace: maintenance
spec:
imageRepositoryRef:
name: soteria
policy:
semver:
range: ">=0.1.0-0"

@@ -5,6 +5,7 @@ resources:
- namespace.yaml
- image.yaml
- secretproviderclass.yaml
- soteria-configmap.yaml
- vault-serviceaccount.yaml
- vault-sync-deployment.yaml
- ariadne-serviceaccount.yaml
@@ -13,9 +14,12 @@ resources:
- k3s-traefik-cleanup-rbac.yaml
- node-nofile-serviceaccount.yaml
- pod-cleaner-rbac.yaml
- soteria-serviceaccount.yaml
- soteria-rbac.yaml
- ariadne-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml
- ariadne-service.yaml
- soteria-deployment.yaml
- disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml
@@ -24,9 +28,12 @@ resources:
- node-image-sweeper-serviceaccount.yaml
- node-image-sweeper-daemonset.yaml
- image-sweeper-cronjob.yaml
- soteria-service.yaml
images:
- name: registry.bstein.dev/bstein/ariadne
newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"}
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
configMapGenerator:
- name: disable-k3s-traefik-script
namespace: maintenance

services/maintenance/soteria-configmap.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
# services/maintenance/soteria-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: soteria
namespace: maintenance
data:
SOTERIA_BACKUP_DRIVER: "longhorn"
SOTERIA_LONGHORN_URL: "http://longhorn-backend.longhorn-system.svc:9500"
SOTERIA_LONGHORN_BACKUP_MODE: "incremental"
services/maintenance/soteria-deployment.yaml (new file, 73 lines)
@@ -0,0 +1,73 @@
# services/maintenance/soteria-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: soteria
namespace: maintenance
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: soteria
template:
metadata:
labels:
app: soteria
spec:
serviceAccountName: soteria
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
containers:
- name: soteria
image: registry.bstein.dev/bstein/soteria:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 8080
envFrom:
- configMapRef:
name: soteria
livenessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 2
readinessProbe:
httpGet:
path: /readyz
port: http
initialDelaySeconds: 2
periodSeconds: 5
timeoutSeconds: 2
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
runAsUser: 65532
capabilities:
drop: ["ALL"]
services/maintenance/soteria-rbac.yaml (new file, 22 lines)
@@ -0,0 +1,22 @@
# services/maintenance/soteria-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: soteria
rules:
- apiGroups: [""]
resources: ["persistentvolumeclaims", "persistentvolumes"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: soteria
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: soteria
subjects:
- kind: ServiceAccount
name: soteria
namespace: maintenance
services/maintenance/soteria-service.yaml (new file, 14 lines)
@@ -0,0 +1,14 @@
# services/maintenance/soteria-service.yaml
apiVersion: v1
kind: Service
metadata:
name: soteria
namespace: maintenance
spec:
type: ClusterIP
selector:
app: soteria
ports:
- name: http
port: 80
targetPort: http
services/maintenance/soteria-serviceaccount.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
# services/maintenance/soteria-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: soteria
namespace: maintenance
imagePullSecrets:
- name: harbor-regcred
@@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@@ -89,7 +89,7 @@
},
"targets": [
{
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

@@ -1901,7 +1901,7 @@
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

@@ -145,7 +145,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{instance}}'
datasource:
type: prometheus
@@ -286,8 +286,8 @@ data:
summary: "node-image-sweeper not fully ready"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (>3h since success)"
- uid: maint-ariadne-image-sweeper-stale
title: "Ariadne image sweeper stale (schedule >8d)"
condition: C
for: "5m"
data:
@@ -297,10 +297,10 @@ data:
to: 0
datasourceUid: atlas-vm
model:
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{cronjob}}'
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
@@ -321,17 +321,166 @@ data:
type: threshold
conditions:
- evaluator:
params: [10800]
params: [691200]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
noDataState: OK
execErrState: Error
annotations:
summary: "Maintenance cronjob stale >3h since last success"
summary: "Ariadne image sweeper stale >8d since last success"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (legacy disabled)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: legacy
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: OK
annotations:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne schedule failed ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne scheduler behind for {{ $labels.task }}"
labels:
severity: warning
- orgId: 1
@@ -352,7 +501,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
expr: postmark_outbound_bounce_rate{window="1d"}
legendFormat: bounce 1d
datasource:
type: prometheus
@@ -400,7 +549,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: POSTMARK_API_UP
expr: min_over_time(max by (instance) (postmark_api_up)[5m])
legendFormat: api up
datasource:
type: prometheus

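The ariadne alert group above keys off three gauges: ariadne_schedule_last_status (< 1 for 10m fires "task failed"), ariadne_schedule_last_success_timestamp_seconds (staleness), and ariadne_schedule_next_run_timestamp_seconds (> 900s behind fires "scheduler behind"). A hedged sketch of how a scheduler could publish them with prometheus_client; the metric names mirror the rules, everything else is illustrative and may not match Ariadne's actual exporter:

import time
from prometheus_client import Gauge, start_http_server

LAST_STATUS = Gauge("ariadne_schedule_last_status", "1 on success, 0 on failure", ["task"])
LAST_SUCCESS = Gauge("ariadne_schedule_last_success_timestamp_seconds", "Unix time of last success", ["task"])
NEXT_RUN = Gauge("ariadne_schedule_next_run_timestamp_seconds", "Unix time of the next planned run", ["task"])

def run_task(name: str, fn, next_run_ts: float) -> None:
    NEXT_RUN.labels(task=name).set(next_run_ts)
    try:
        fn()
    except Exception:
        LAST_STATUS.labels(task=name).set(0)  # drives the "schedule task failed" rule
        raise
    LAST_STATUS.labels(task=name).set(1)
    LAST_SUCCESS.labels(task=name).set(time.time())  # keeps the staleness rule quiet

start_http_server(8080)  # scrape target matching the prometheus.io/port annotation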
@@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@@ -98,7 +98,7 @@ data:
 },
 "targets": [
   {
-    "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
+    "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
     "refId": "A",
     "legendFormat": "{{namespace}}"
   }
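The substantive change in these long expressions is the node-eligibility clause: instead of trusting an accelerator label (kube_node_labels{label_accelerator=~".+"}), a node now counts as GPU-capable when it actually reports allocatable NVIDIA GPU resources, with the Jetson label kept as a fallback; the resource regex also tolerates the underscore-sanitized form nvidia_com_gpu.*. A sketch of just that clause in isolation, assuming promtool and a hypothetical Prometheus URL:

    # Lists the nodes the dashboards will now treat as GPU-capable (sketch only).
    promtool query instant http://prometheus.monitoring.svc:9090 \
      '(max by (node) (kube_node_status_allocatable{resource=~"nvidia[.]com/gpu.*|nvidia_com_gpu.*"} > bool 0)) or kube_node_labels{label_jetson="true"}'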
@@ -1910,7 +1910,7 @@ data:
 },
 "targets": [
   {
-    "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
+    "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
     "refId": "A",
     "legendFormat": "{{namespace}}"
   }
@@ -286,7 +286,7 @@ spec:
 podAnnotations:
   vault.hashicorp.com/agent-inject: "true"
   vault.hashicorp.com/role: "monitoring"
-  monitoring.bstein.dev/restart-rev: "1"
+  monitoring.bstein.dev/restart-rev: "4"
   vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
   vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
     {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
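Bumping monitoring.bstein.dev/restart-rev changes the pod template, which forces a rollout so the Vault agent re-renders grafana-env.sh from the secret above. The same nudge could be applied ad hoc; a sketch, assuming the workload is a Deployment named grafana (both the kind and the name are assumptions):

    # Hypothetical manual equivalent of the annotation bump in this change.
    kubectl -n monitoring patch deployment grafana --type merge \
      -p '{"spec":{"template":{"metadata":{"annotations":{"monitoring.bstein.dev/restart-rev":"4"}}}}}'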
@@ -43,6 +43,12 @@ spec:
   value: /var/run/secrets/vault-token-reviewer/token
+- name: VAULT_K8S_ROLE_TTL
+  value: 1h
+- name: VAULT_K8S_BOUND_AUDIENCES
+  value: "https://kubernetes.default.svc,https://kubernetes.default.svc.cluster.local,k3s"
+- name: VAULT_K8S_ISSUER
+  value: https://kubernetes.default.svc.cluster.local
+- name: VAULT_K8S_DISABLE_ISS_VALIDATION
+  value: "false"
 volumeMounts:
 - name: k8s-auth-config-script
   mountPath: /scripts
@@ -53,6 +53,8 @@ ensure_token
 k8s_host="https://${KUBERNETES_SERVICE_HOST}:443"
 k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)"
 k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
+k8s_issuer="${VAULT_K8S_ISSUER:-}"
+disable_iss_validation="${VAULT_K8S_DISABLE_ISS_VALIDATION:-true}"
+role_ttl="${VAULT_K8S_ROLE_TTL:-1h}"
 token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}"
@@ -68,11 +70,36 @@ if ! vault_cmd auth list -format=json | grep -q '"kubernetes/"'; then
   vault_cmd auth enable kubernetes
 fi
 
+ensure_default_policy_login() {
+  default_policy="$(vault_cmd policy read default)"
+  if printf '%s' "${default_policy}" | grep -q 'auth/kubernetes/login'; then
+    return
+  fi
+  log "updating default policy to allow kubernetes login"
+  default_policy="${default_policy}
+path \"auth/kubernetes/login\" {
+  capabilities = [\"create\", \"update\"]
+}
+"
+  printf '%s\n' "${default_policy}" | vault_cmd policy write default -
+}
+
 log "configuring kubernetes auth"
-vault_cmd write auth/kubernetes/config \
+if [ -n "${k8s_issuer}" ]; then
+  vault_cmd write auth/kubernetes/config \
+    token_reviewer_jwt="${token_reviewer_jwt}" \
+    kubernetes_host="${k8s_host}" \
+    kubernetes_ca_cert="${k8s_ca}" \
+    issuer="${k8s_issuer}" \
+    disable_iss_validation="${disable_iss_validation}"
+else
+  vault_cmd write auth/kubernetes/config \
+    token_reviewer_jwt="${token_reviewer_jwt}" \
+    kubernetes_host="${k8s_host}" \
+    kubernetes_ca_cert="${k8s_ca}"
+fi
+
+ensure_default_policy_login
 
 write_raw_policy() {
   name="$1"
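After the script runs, the branch taken (issuer pinned vs. left unset) can be confirmed directly; a sketch, assuming VAULT_ADDR and a sufficiently privileged token are already in the environment:

    # Shows kubernetes_host, issuer and disable_iss_validation as written above.
    vault read auth/kubernetes/config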
@@ -87,6 +114,7 @@ write_policy_and_role() {
   service_accounts="$3"
   read_paths="$4"
   write_paths="$5"
+  audiences="${VAULT_K8S_BOUND_AUDIENCES:-}"
 
   policy_body=""
   for path in ${read_paths}; do
@@ -109,11 +137,42 @@ path \"kv/metadata/atlas/${path}\" {
 }
 "
   done
+  if [ "${role}" = "maintenance" ]; then
+    policy_body="${policy_body}
+path \"sys/auth\" {
+  capabilities = [\"read\"]
+}
+path \"sys/auth/*\" {
+  capabilities = [\"create\", \"update\", \"read\", \"sudo\"]
+}
+path \"auth/kubernetes/*\" {
+  capabilities = [\"create\", \"update\", \"read\"]
+}
+path \"auth/oidc/*\" {
+  capabilities = [\"create\", \"update\", \"read\"]
+}
+path \"sys/policies/acl\" {
+  capabilities = [\"list\"]
+}
+path \"sys/policies/acl/*\" {
+  capabilities = [\"create\", \"update\", \"read\"]
+}
+"
+  fi
 
   log "writing policy ${role}"
   printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" -
 
   log "writing role ${role}"
+  if [ -n "${audiences}" ]; then
+    vault_cmd write "auth/kubernetes/role/${role}" \
+      bound_service_account_audiences="${audiences}" \
+      bound_service_account_names="${service_accounts}" \
+      bound_service_account_namespaces="${namespace}" \
+      policies="${role}" \
+      ttl="${role_ttl}"
+    return
+  fi
   vault_cmd write "auth/kubernetes/role/${role}" \
     bound_service_account_names="${service_accounts}" \
     bound_service_account_namespaces="${namespace}" \
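With bound_service_account_audiences set, a login only succeeds when the presented JWT carries one of the listed audiences. A smoke test from inside a bound pod, assuming the mounted service-account token's audience matches one of the values in VAULT_K8S_BOUND_AUDIENCES (true for the API-server audiences listed above):

    # Hypothetical check; run from a pod whose service account is bound to the role.
    vault write auth/kubernetes/login role=monitoring \
      jwt=@/var/run/secrets/kubernetes.io/serviceaccount/token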
@@ -218,6 +277,8 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
   "nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
 write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
   "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
+write_policy_and_role "ai" "ai" "atlasbot" \
+  "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
 write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
   "jenkins/* shared/harbor-pull" ""
 write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
@@ -231,7 +292,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
 write_policy_and_role "health" "health" "health-vault-sync" \
   "health/*" ""
 write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
-  "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
+  "maintenance/ariadne-db maintenance/soteria-restic portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
   "finance/* shared/postmark-relay" ""
 write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \
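The maintenance role's read list gains maintenance/soteria-restic; given the kv/data/atlas/... paths used elsewhere in this change, that implies a KV v2 mount named kv. A sketch of reading the newly granted path with a maintenance-role token (the mount layout is inferred, not stated):

    # Hypothetical read of the newly granted secret path.
    vault kv get kv/atlas/maintenance/soteria-restic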