Compare commits

...

414 Commits

Author SHA1 Message Date
6e4cafa3df maintenance: harden metis recovery and fix harbor rollout 2026-03-31 14:51:49 -03:00
41021c472b maintenance/jenkins: align Metis ingress, sentinel push, and CI job 2026-03-31 14:21:53 -03:00
17afb0bb55 maintenance: add Metis service and sentinel manifests 2026-03-31 14:07:17 -03:00
1e0e73a28f monitoring: combine Ariadne and Metis tests 2026-03-31 13:54:04 -03:00
af01a620c3 monitoring: roll grafana to apply latest alert rules 2026-03-30 18:41:21 -03:00
0edc513e2e monitoring: raise rootfs warning threshold to 85 percent 2026-03-30 18:40:59 -03:00
3659c9c07b maintenance: unblock sweeper rollouts on degraded nodes 2026-03-30 18:39:05 -03:00
11d58dccb7 maintenance: run image sweeper periodically for sd safety 2026-03-30 18:36:25 -03:00
5bcff5f405 monitoring: tame email noise and harden postmark alerts 2026-03-30 18:32:22 -03:00
f5dcea860e atlasbot: wire context and timeout fallbacks 2026-03-30 16:55:19 -03:00
a1e90f4600 atlasbot: wire quick smart genius modes 2026-03-30 16:51:23 -03:00
f04f032721 longhorn: avoid webhook deadlock and forced image pulls 2026-03-30 10:16:42 -03:00
083999c84c comms: harden matrix auth ingress routes for MAS 2026-03-30 08:21:19 -03:00
dc62a84e2e flux: keep feature branch tracking until main push is available 2026-03-30 07:57:13 -03:00
31ffaedf2a flux: target main branch for sync and image automation 2026-03-30 07:48:47 -03:00
b2d1dc4e3f flux: return sync and image automation branches to master 2026-03-30 07:48:09 -03:00
flux-bot
271a941d89 chore(atlasbot): automated image update 2026-03-30 10:47:00 +00:00
flux-bot
fa30a2cade chore(atlasbot): automated image update 2026-03-30 07:10:35 +00:00
f71d0bc3f3 atlasbot: switch quick mode to 7b fast model 2026-03-30 04:07:08 -03:00
flux-bot
19a3207eac chore(atlasbot): automated image update 2026-03-30 07:04:35 +00:00
2d5107f7e2 bstein-dev-home: deploy backend image 0.1.1-123 2026-03-30 03:54:39 -03:00
a091ea75a3 atlasbot: deploy matrix timeout fix image 0.1.2-103 2026-03-30 03:51:30 -03:00
95dabf5df8 atlasbot: disable ollama retries for strict quick budgets 2026-03-30 03:50:59 -03:00
flux-bot
311cec8adf chore(bstein-dev-home): automated image update 2026-03-30 06:46:11 +00:00
flux-bot
b18e355412 chore(atlasbot): automated image update 2026-03-30 06:45:32 +00:00
flux-bot
80057210fc chore(bstein-dev-home): automated image update 2026-03-30 06:38:10 +00:00
flux-bot
7a1e99a95e chore(bstein-dev-home): automated image update 2026-03-30 06:34:10 +00:00
flux-bot
ace86ad736 chore(bstein-dev-home): automated image update 2026-03-30 06:29:09 +00:00
flux-bot
2a4deb6dd1 chore(atlasbot): automated image update 2026-03-30 06:25:30 +00:00
flux-bot
eee5456921 chore(atlasbot): automated image update 2026-03-30 05:55:27 +00:00
f86d3a4c00 atlasbot: cap quick runtime and expose genius model to portal 2026-03-30 02:53:06 -03:00
a6b77c68f0 maintenance: grant ariadne auth-delegator 2026-02-08 09:55:20 -03:00
9599b4c975 ariadne: use vault-admin role for vault config 2026-02-07 22:34:10 -03:00
df96c06fa2 ariadne: run image sweeper daily 2026-02-07 11:11:41 -03:00
e575e6cb1e gitea: prefer rpi5 nodes 2026-02-07 11:07:02 -03:00
flux-bot
bca66c5d71 chore(maintenance): automated image update 2026-02-07 13:56:49 +00:00
b2affe091d maintenance: align vault role env 2026-02-07 10:51:20 -03:00
flux-bot
6c7f2112c2 chore(atlasbot): automated image update 2026-02-07 13:50:38 +00:00
a4874163ec infra: bias gitea/monerod placement, bump synapse ensure job 2026-02-07 10:48:48 -03:00
079f8efbb9 comms: run synapse admin ensure (admin flag) 2026-02-07 10:30:34 -03:00
95228b75ab comms: ensure synapse admin flag; ariadne vault role 2026-02-07 10:28:55 -03:00
9e75bf0b42 ariadne: accelerate schedules for alert clearing 2026-02-07 03:23:42 -03:00
b2841985ef comms: re-suspend synapse admin job 2026-02-07 03:19:42 -03:00
9553995ba5 comms: run synapse admin ensure 2026-02-07 03:16:44 -03:00
e840777668 vault: allow maintenance auth sync 2026-02-07 03:13:53 -03:00
718a1ca312 crypto: run xmrig only on rpi5 2026-02-06 23:34:31 -03:00
55f0347b70 comms: suspend synapse admin ensure 2026-02-06 20:21:01 -03:00
f77e13b2cb comms: run synapse admin ensure with python image 2026-02-06 20:13:02 -03:00
fd2b10d00d comms: run synapse admin ensure 2026-02-06 20:01:38 -03:00
4209299a40 jenkins: add dind cache pvc 2026-02-06 20:00:01 -03:00
1804ff06c6 gitea: avoid longhorn nodes 2026-02-06 19:33:55 -03:00
4b5913827d maintenance: pivot soteria to longhorn 2026-02-06 18:38:29 -03:00
80548a2e82 longhorn: add b2 backup target 2026-02-06 18:28:37 -03:00
flux-bot
29756b1e62 chore(maintenance): automated image update 2026-02-06 21:27:42 +00:00
4bc91c40f6 maintenance: restore soteria job node selector 2026-02-06 04:19:36 -03:00
1260d18cdf maintenance: pin soteria jobs to titan-24 for backup 2026-02-06 04:15:58 -03:00
47efd0be06 maintenance: pin soteria jobs to arm64 workers 2026-02-06 04:10:55 -03:00
flux-bot
fa410c8f1e chore(maintenance): automated image update 2026-02-06 07:10:04 +00:00
0ed75718c2 maintenance: remove restic init job 2026-02-06 03:50:30 -03:00
50ff59a33b maintenance: add restic init job 2026-02-06 03:48:45 -03:00
flux-bot
9d9bcd1988 chore(maintenance): automated image update 2026-02-05 18:56:27 +00:00
flux-bot
c96749bab6 chore(maintenance): automated image update 2026-02-05 18:45:20 +00:00
5e239accbd maintenance: schedule soteria on rpi workers 2026-02-05 15:30:09 -03:00
flux-bot
c50298c8fe chore(bstein-dev-home): automated image update 2026-02-05 18:24:54 +00:00
flux-bot
3fcab34b7d chore(maintenance): automated image update 2026-02-05 18:24:44 +00:00
e223ef8e76 harbor: route registry traffic via core 2026-02-05 15:23:42 -03:00
7f72683242 harbor: wire registryctl notification auth 2026-02-05 15:17:54 -03:00
eeb8475848 harbor: fix registry notification URL 2026-02-05 15:00:43 -03:00
839b79696c harbor: restore registry notifications env 2026-02-05 14:50:53 -03:00
920f146efb harbor: enable registry notifications 2026-02-05 14:44:09 -03:00
flux-bot
c2c5474bc8 chore(atlasbot): automated image update 2026-02-05 17:38:26 +00:00
flux-bot
eab7ed5cff chore(maintenance): automated image update 2026-02-05 17:04:24 +00:00
flux-bot
22eb1a1159 chore(maintenance): automated image update 2026-02-05 16:32:49 +00:00
d7c1ecd098 maintenance: move soteria image to bstein 2026-02-05 13:12:03 -03:00
flux-bot
96288c9fdd chore(atlasbot): automated image update 2026-02-05 15:58:19 +00:00
flux-bot
a71bf7d9d5 chore(atlasbot): automated image update 2026-02-05 01:26:05 +00:00
533baa6d0c atlasbot: set genius model env 2026-02-04 19:39:43 -03:00
flux-bot
cee353e305 chore(atlasbot): automated image update 2026-02-04 22:15:47 +00:00
flux-bot
436d24ea70 chore(atlasbot): automated image update 2026-02-04 21:45:45 +00:00
flux-bot
6fb80e37e8 chore(atlasbot): automated image update 2026-02-04 21:39:45 +00:00
flux-bot
132e73100f chore(atlasbot): automated image update 2026-02-04 19:08:32 +00:00
flux-bot
fe8cc40903 chore(atlasbot): automated image update 2026-02-04 18:09:26 +00:00
flux-bot
947a43e630 chore(atlasbot): automated image update 2026-02-04 18:03:26 +00:00
flux-bot
31679b59f5 chore(atlasbot): automated image update 2026-02-04 17:56:26 +00:00
flux-bot
77b81e1e9a chore(atlasbot): automated image update 2026-02-04 17:49:23 +00:00
flux-bot
6523e45b3f chore(atlasbot): automated image update 2026-02-04 17:30:22 +00:00
flux-bot
49414c6cca chore(atlasbot): automated image update 2026-02-04 17:23:23 +00:00
flux-bot
6efa280e9d chore(atlasbot): automated image update 2026-02-04 17:20:23 +00:00
flux-bot
ff81cfdb82 chore(atlasbot): automated image update 2026-02-04 17:14:21 +00:00
flux-bot
c4b0250321 chore(atlasbot): automated image update 2026-02-04 17:07:21 +00:00
flux-bot
c1a8aa43d6 chore(atlasbot): automated image update 2026-02-04 17:00:21 +00:00
flux-bot
0275adb5b7 chore(atlasbot): automated image update 2026-02-04 16:53:20 +00:00
flux-bot
663143660b chore(atlasbot): automated image update 2026-02-04 16:45:19 +00:00
flux-bot
cb25cf7571 chore(atlasbot): automated image update 2026-02-04 16:39:18 +00:00
flux-bot
33127dde26 chore(atlasbot): automated image update 2026-02-04 14:03:05 +00:00
flux-bot
dc214cee79 chore(atlasbot): automated image update 2026-02-04 03:27:09 +00:00
flux-bot
4395986b0c chore(atlasbot): automated image update 2026-02-04 03:01:07 +00:00
flux-bot
fba7fe9029 chore(atlasbot): automated image update 2026-02-04 02:54:06 +00:00
flux-bot
8ecc8dd548 chore(atlasbot): automated image update 2026-02-04 02:42:05 +00:00
flux-bot
672a559e52 chore(atlasbot): automated image update 2026-02-04 02:30:04 +00:00
flux-bot
0dedf4083e chore(atlasbot): automated image update 2026-02-04 01:54:01 +00:00
flux-bot
bf8b99e365 chore(maintenance): automated image update 2026-02-04 01:51:59 +00:00
flux-bot
a33ad1c073 chore(atlasbot): automated image update 2026-02-04 01:27:59 +00:00
flux-bot
be90638fac chore(atlasbot): automated image update 2026-02-04 01:09:57 +00:00
flux-bot
3bc6d29f54 chore(atlasbot): automated image update 2026-02-04 00:55:56 +00:00
flux-bot
4e88c55e57 chore(atlasbot): automated image update 2026-02-04 00:42:56 +00:00
flux-bot
b8c94d5870 chore(atlasbot): automated image update 2026-02-04 00:37:55 +00:00
flux-bot
7f83d2f936 chore(atlasbot): automated image update 2026-02-04 00:34:55 +00:00
flux-bot
d42aa42d8a chore(atlasbot): automated image update 2026-02-04 00:19:53 +00:00
flux-bot
86f512fa1a chore(atlasbot): automated image update 2026-02-03 22:41:45 +00:00
flux-bot
16e2b19ea9 chore(atlasbot): automated image update 2026-02-03 22:06:41 +00:00
flux-bot
a1cb07c6d6 chore(atlasbot): automated image update 2026-02-03 20:18:32 +00:00
flux-bot
558d24ad6b chore(atlasbot): automated image update 2026-02-03 19:56:31 +00:00
flux-bot
160218a4ae chore(atlasbot): automated image update 2026-02-03 19:29:28 +00:00
flux-bot
2e361e620e chore(atlasbot): automated image update 2026-02-03 18:04:21 +00:00
flux-bot
fcd0ea9872 chore(atlasbot): automated image update 2026-02-03 17:53:20 +00:00
flux-bot
75826b0e5e chore(atlasbot): automated image update 2026-02-03 17:42:19 +00:00
flux-bot
71ddd03899 chore(atlasbot): automated image update 2026-02-03 17:34:18 +00:00
flux-bot
2d3a0b0184 chore(atlasbot): automated image update 2026-02-03 17:16:17 +00:00
flux-bot
c7fb848a62 chore(atlasbot): automated image update 2026-02-03 15:15:07 +00:00
flux-bot
c643c965b8 chore(atlasbot): automated image update 2026-02-03 15:05:06 +00:00
flux-bot
618be5ce01 chore(atlasbot): automated image update 2026-02-03 14:57:06 +00:00
flux-bot
ac049e6bb9 chore(atlasbot): automated image update 2026-02-03 14:51:05 +00:00
flux-bot
50108afc57 chore(atlasbot): automated image update 2026-02-03 14:40:04 +00:00
flux-bot
1f74a29445 chore(atlasbot): automated image update 2026-02-03 14:15:01 +00:00
flux-bot
08bc5f4b82 chore(atlasbot): automated image update 2026-02-03 14:07:01 +00:00
flux-bot
c208314506 chore(atlasbot): automated image update 2026-02-03 13:43:59 +00:00
flux-bot
763e5ff9e9 chore(atlasbot): automated image update 2026-02-03 13:22:57 +00:00
flux-bot
5ecb42cfef chore(atlasbot): automated image update 2026-02-03 13:08:56 +00:00
flux-bot
102d8e56ff chore(atlasbot): automated image update 2026-02-03 13:04:56 +00:00
flux-bot
ac96c5482f chore(atlasbot): automated image update 2026-02-03 12:56:55 +00:00
flux-bot
71aa60c696 chore(atlasbot): automated image update 2026-02-03 12:32:53 +00:00
flux-bot
d7582da21b chore(atlasbot): automated image update 2026-02-03 07:33:28 +00:00
flux-bot
4bf3773eb3 chore(atlasbot): automated image update 2026-02-03 06:31:22 +00:00
flux-bot
895ea49dc5 chore(atlasbot): automated image update 2026-02-03 06:07:21 +00:00
flux-bot
f355f6dd6a chore(atlasbot): automated image update 2026-02-03 04:57:14 +00:00
9f87e61f4a atlasbot: raise llm call caps 2026-02-03 01:55:21 -03:00
flux-bot
9a2890c45c chore(atlasbot): automated image update 2026-02-03 03:29:07 +00:00
flux-bot
ad74a45e76 chore(atlasbot): automated image update 2026-02-03 03:26:07 +00:00
fda4860d67 jenkins(atlasbot): set main branch 2026-02-02 23:12:13 -03:00
9f8a0f94d2 jenkins(atlasbot): use main branch 2026-02-02 23:10:42 -03:00
51d12791ca jenkins(atlasbot): track main branch 2026-02-02 22:25:56 -03:00
9fb36f23cd ci(atlasbot): add Jenkins job and image automation 2026-02-02 20:25:47 -03:00
flux-bot
1a2fe05808 chore(atlasbot): automated image update 2026-02-02 21:04:06 +00:00
flux-bot
0c5ec895ee chore(atlasbot): automated image update 2026-02-02 20:22:02 +00:00
7c87e177e9 vault: add default k8s audience 2026-02-02 17:15:35 -03:00
flux-bot
5e6d2a938f chore(atlasbot): automated image update 2026-02-02 20:08:02 +00:00
flux-bot
09070c2cc6 chore(atlasbot): automated image update 2026-02-02 19:53:00 +00:00
flux-bot
5dd30d8802 chore(atlasbot): automated image update 2026-02-02 18:13:52 +00:00
flux-bot
f302cb2448 chore(atlasbot): automated image update 2026-02-02 18:04:51 +00:00
c0a231fd91 atlasbot: bump image to 0.1.0-133 2026-02-02 14:58:38 -03:00
flux-bot
87f8a6d2c0 chore(atlasbot): automated image update 2026-02-02 17:56:53 +00:00
flux-bot
78a0867215 chore(atlasbot): automated image update 2026-02-02 17:56:48 +00:00
b0da9080c7 atlasbot: bump image to 0.1.0-132 2026-02-02 14:56:24 -03:00
8e3feeeaac atlasbot: bump image to 0.1.0-131 2026-02-02 14:54:36 -03:00
6f2ecdb364 atlasbot: bump image to 0.1.0-130 2026-02-02 14:48:34 -03:00
a5e168e55f atlasbot: bump image to 0.1.0-129 2026-02-02 14:41:22 -03:00
flux-bot
87dc1209b1 chore(atlasbot): automated image update 2026-02-02 17:32:49 +00:00
f86845053e atlasbot: disable queue for testing 2026-02-02 14:24:09 -03:00
flux-bot
c04c5ab048 chore(atlasbot): automated image update 2026-02-02 17:13:47 +00:00
flux-bot
ec3bdb7225 chore(atlasbot): automated image update 2026-02-02 16:55:46 +00:00
flux-bot
4b68809bb9 chore(atlasbot): automated image update 2026-02-02 16:45:45 +00:00
flux-bot
661bc6ac7d chore(atlasbot): automated image update 2026-02-02 16:38:44 +00:00
a9ee943344 atlasbot: bump image to 0.1.0-123 2026-02-02 13:30:34 -03:00
826df7d960 atlasbot: bump image to 0.1.0-122 2026-02-02 13:21:28 -03:00
flux-bot
8dfe124212 chore(atlasbot): automated image update 2026-02-02 16:10:42 +00:00
flux-bot
a3bef857f9 chore(atlasbot): automated image update 2026-02-02 15:57:41 +00:00
flux-bot
ed766d7a02 chore(atlasbot): automated image update 2026-02-02 15:47:40 +00:00
4295913056 atlasbot: bump image to 0.1.0-118 2026-02-02 12:39:24 -03:00
flux-bot
e3dfa2c0ea chore(atlasbot): automated image update 2026-02-02 15:20:38 +00:00
flux-bot
6bf8181677 chore(atlasbot): automated image update 2026-02-02 15:17:37 +00:00
d67f3d6fca jenkins: reload jcasc for soteria 2026-02-02 12:11:07 -03:00
flux-bot
41a0363fbc chore(atlasbot): automated image update 2026-02-02 15:09:37 +00:00
a609e230f2 atlasbot: bump image to 0.1.0-114 2026-02-02 12:05:58 -03:00
flux-bot
37342bfe4a chore(atlasbot): automated image update 2026-02-02 15:01:36 +00:00
a509354067 atlasbot: bump image to 0.1.0-112 2026-02-02 11:52:59 -03:00
flux-bot
fb14516674 chore(atlasbot): automated image update 2026-02-02 14:49:35 +00:00
60c80cc86f atlasbot: bump image to 0.1.0-110 2026-02-02 11:42:03 -03:00
flux-bot
7b8ea36554 chore(atlasbot): automated image update 2026-02-02 14:36:35 +00:00
49224375a0 atlasbot: bump image to 0.1.0-108 2026-02-02 11:23:53 -03:00
7d7ddd52dc atlasbot: bump image to 0.1.0-107 2026-02-02 11:14:54 -03:00
cd7043c7f1 jenkins: add soteria pipeline job 2026-02-02 11:01:22 -03:00
fb82a038e9 atlasbot: bump image to 0.1.0-106 2026-02-02 11:00:18 -03:00
93bcea5893 add ai harbor regcred sync 2026-02-02 10:08:46 -03:00
0ba8578416 bump atlasbot image 2026-02-02 10:05:06 -03:00
86475b8bdf track atlasbot knowledge index 2026-02-02 09:48:40 -03:00
f19eaf3b6b move atlasbot to ai namespace 2026-02-02 09:46:50 -03:00
flux-bot
e537180f1f chore(comms): automated image update 2026-02-02 06:03:16 +00:00
flux-bot
8298ed5c16 chore(comms): automated image update 2026-02-02 05:59:16 +00:00
flux-bot
152a28bd09 chore(comms): automated image update 2026-02-02 05:59:04 +00:00
7e02cccbe8 comms: bump atlasbot to 0.1.0-103 2026-02-02 02:58:44 -03:00
flux-bot
e60b1594c0 chore(comms): automated image update 2026-02-02 05:49:15 +00:00
flux-bot
87b2b37918 chore(comms): automated image update 2026-02-02 05:46:15 +00:00
flux-bot
a1249b3e00 chore(comms): automated image update 2026-02-02 05:45:54 +00:00
5000d1f76b comms: bump atlasbot to 0.1.0-101 2026-02-02 02:45:33 -03:00
flux-bot
584625b893 chore(comms): automated image update 2026-02-02 05:39:14 +00:00
95f4ecc4e0 comms: bump atlasbot to 0.1.0-99 2026-02-02 02:16:31 -03:00
240e04f9a2 comms: bump atlasbot to 0.1.0-98 2026-02-02 02:09:57 -03:00
449b8fed64 comms: bump atlasbot to 0.1.0-97 2026-02-02 02:03:50 -03:00
flux-bot
f6d655bb0c chore(comms): automated image update 2026-02-02 05:02:11 +00:00
4fa1b6e84c comms: bump atlasbot to 0.1.0-96 2026-02-02 01:57:58 -03:00
168efd78f7 comms: bump atlasbot to 0.1.0-95 2026-02-02 01:54:41 -03:00
e0bd11fa57 comms: bump atlasbot to 0.1.0-94 2026-02-02 01:45:52 -03:00
3f43299c92 comms: bump atlasbot to 0.1.0-93 2026-02-02 01:38:59 -03:00
645790f404 comms: bump atlasbot to 0.1.0-92 2026-02-01 18:46:01 -03:00
f11f6a4e62 comms: bump atlasbot to 0.1.0-91 2026-02-01 18:42:00 -03:00
flux-bot
c559253a31 chore(comms): automated image update 2026-02-01 21:37:32 +00:00
flux-bot
a3619ce215 chore(comms): automated image update 2026-02-01 21:33:32 +00:00
flux-bot
398fb7b797 chore(comms): automated image update 2026-02-01 21:25:31 +00:00
b30e6af95d comms: bump atlasbot to 0.1.0-87 2026-02-01 18:05:00 -03:00
flux-bot
4fd79b4708 chore(comms): automated image update 2026-02-01 20:55:29 +00:00
f23da3aea5 comms: bump atlasbot to 0.1.0-85 2026-02-01 17:48:24 -03:00
flux-bot
d951ae5061 chore(comms): automated image update 2026-02-01 20:43:28 +00:00
dfe9916e91 comms: bump atlasbot to 0.1.0-83 2026-02-01 14:45:58 -03:00
flux-bot
036c758547 chore(comms): automated image update 2026-02-01 17:39:12 +00:00
382a6e49ee comms: bump atlasbot to 0.1.0-81 2026-02-01 14:34:43 -03:00
93e7449509 comms: bump atlasbot to 0.1.0-80 2026-02-01 14:28:34 -03:00
58d1c168ff comms: bump atlasbot to 0.1.0-79 2026-02-01 14:07:57 -03:00
flux-bot
889400cdbf chore(comms): automated image update 2026-02-01 15:41:02 +00:00
flux-bot
e06066a327 chore(comms): automated image update 2026-02-01 15:36:01 +00:00
138f8c4407 comms: bump atlasbot image 2026-02-01 12:25:31 -03:00
33569aff99 vault: fix k8s auth env indent 2026-02-01 12:20:04 -03:00
3e2f56da7d vault: set kubernetes issuer 2026-02-01 12:18:57 -03:00
flux-bot
0914ba3509 chore(comms): automated image update 2026-02-01 15:01:58 +00:00
flux-bot
865a979424 chore(comms): automated image update 2026-02-01 14:55:58 +00:00
flux-bot
5dfc3ed259 chore(comms): automated image update 2026-02-01 14:55:52 +00:00
b479364017 comms: bump atlasbot image 2026-02-01 11:55:26 -03:00
flux-bot
00d8f852a3 chore(comms): automated image update 2026-02-01 14:47:57 +00:00
flux-bot
2d7f744284 chore(comms): automated image update 2026-02-01 14:18:55 +00:00
5f1b1a6cd0 vault: set k8s auth audiences 2026-02-01 11:17:02 -03:00
flux-bot
e966961dbe chore(comms): automated image update 2026-02-01 13:58:53 +00:00
7ffb0aba5d atlasbot: bump to 0.1.0-70 2026-02-01 10:37:29 -03:00
flux-bot
e80a439725 chore(comms): automated image update 2026-02-01 08:40:26 +00:00
flux-bot
8a22825796 chore(comms): automated image update 2026-02-01 08:40:09 +00:00
1fabd4ce2f atlasbot: bump to 0.1.0-69 2026-02-01 05:39:54 -03:00
759ac5ef90 comms: bump atlasbot image 2026-02-01 05:31:07 -03:00
flux-bot
bc971cce92 chore(comms): automated image update 2026-02-01 08:23:24 +00:00
flux-bot
069f6b4983 chore(comms): automated image update 2026-02-01 08:18:24 +00:00
64cfd5180d comms: bump atlasbot image 2026-02-01 05:12:59 -03:00
flux-bot
8a087fb16d chore(comms): automated image update 2026-02-01 08:10:23 +00:00
flux-bot
652c3a28a3 chore(comms): automated image update 2026-02-01 07:55:22 +00:00
flux-bot
141c54ccf3 chore(comms): automated image update 2026-02-01 07:49:21 +00:00
flux-bot
0f8529c7c5 chore(comms): automated image update 2026-02-01 07:46:21 +00:00
flux-bot
dafba36768 chore(comms): automated image update 2026-02-01 07:38:20 +00:00
4d5e9552e3 comms: bump atlasbot to 0.1.0-59 2026-02-01 04:32:01 -03:00
ddf1d41fd3 comms: bump atlasbot to 0.1.0-58 2026-02-01 04:25:12 -03:00
flux-bot
49e630f7fd chore(comms): automated image update 2026-02-01 07:17:18 +00:00
flux-bot
b7a81d28d1 chore(comms): automated image update 2026-02-01 06:39:16 +00:00
109c00bc3c comms: bump atlasbot to 0.1.0-55 2026-02-01 02:08:54 -03:00
flux-bot
c9ad055b4c chore(comms): automated image update 2026-02-01 05:07:08 +00:00
10498c659b comms: bump atlasbot to 0.1.0-54 2026-02-01 01:51:26 -03:00
flux-bot
978bd8e595 chore(comms): automated image update 2026-02-01 04:51:06 +00:00
259552ac28 comms: bump atlasbot to 0.1.0-53 2026-02-01 01:39:09 -03:00
flux-bot
7f2ded5244 chore(comms): automated image update 2026-02-01 04:39:05 +00:00
e4c370b983 comms: bump atlasbot to 0.1.0-52 2026-02-01 01:29:30 -03:00
flux-bot
7dfc98b6d6 chore(comms): automated image update 2026-02-01 04:29:04 +00:00
cb60c64bce comms: bump atlasbot to 0.1.0-51 2026-02-01 01:15:18 -03:00
flux-bot
091f095893 chore(comms): automated image update 2026-02-01 04:15:03 +00:00
5b389d12df comms(atlasbot): bump image to 0.1.0-50 2026-01-31 22:30:04 -03:00
flux-bot
ae88bc8484 chore(comms): automated image update 2026-02-01 01:28:49 +00:00
529576e082 comms: bump atlasbot image 2026-01-31 21:40:11 -03:00
flux-bot
a7ffaa3213 chore(maintenance): automated image update 2026-02-01 00:39:49 +00:00
flux-bot
e478f1c74d chore(comms): automated image update 2026-02-01 00:39:45 +00:00
2480b6cecc comms: disable atlasbot queue for tests 2026-01-31 18:21:39 -03:00
bbe27f963d comms: bump atlasbot to 0.1.0-48 2026-01-31 18:14:55 -03:00
flux-bot
c5da854cef chore(comms): automated image update 2026-01-31 21:14:27 +00:00
0319707fff atlasbot: make node counts explicit 2026-01-31 16:44:50 -03:00
4f8d8f1f25 atlasbot: prioritize high-priority subquestions 2026-01-31 16:38:54 -03:00
5448ff3f55 atlasbot: expand chunk summaries 2026-01-31 16:35:02 -03:00
b6c2d1416e atlasbot: enable debug pipeline logging 2026-01-31 16:30:05 -03:00
flux-bot
152e1d88f4 chore(comms): automated image update 2026-01-31 19:29:18 +00:00
86e9dc289f atlasbot: bump to 0.1.0-43 2026-01-31 14:24:13 -03:00
flux-bot
c4b7198c46 chore(comms): automated image update 2026-01-31 17:21:08 +00:00
f8a12be2ec atlasbot: bump image to 0.1.0-42 2026-01-31 14:15:41 -03:00
flux-bot
c9ec5126cd chore(comms): automated image update 2026-01-31 17:15:07 +00:00
flux-bot
c66db7c18f chore(maintenance): automated image update 2026-01-31 16:42:06 +00:00
flux-bot
de47ab76a5 chore(maintenance): automated image update 2026-01-31 16:39:06 +00:00
c788512d59 atlasbot: bump image to 0.1.0-41 2026-01-31 13:26:44 -03:00
flux-bot
ae25ccb6f2 chore(comms): automated image update 2026-01-31 16:25:03 +00:00
flux-bot
e27f4cfc68 chore(comms): automated image update 2026-01-31 11:08:36 +00:00
50e06b4a13 atlasbot: bump image to 0.1.0-40 2026-01-31 08:08:21 -03:00
934d6e7a3b comms: fix atlasbot image indentation 2026-01-31 07:17:58 -03:00
flux-bot
25654a731e chore(comms): automated image update 2026-01-31 10:12:32 +00:00
4aecadb3de atlasbot: bump image to 0.1.0-39 2026-01-31 07:11:56 -03:00
3b79a82c71 atlasbot: bump image to 0.1.0-38 2026-01-31 06:18:58 -03:00
flux-bot
04b263dc2d chore(comms): automated image update 2026-01-31 09:18:28 +00:00
93841d9de7 maintenance: add soteria service 2026-01-31 03:35:39 -03:00
bb294c6d21 atlasbot: bump image to 0.1.0-37 2026-01-31 03:20:44 -03:00
flux-bot
64962f8863 chore(comms): automated image update 2026-01-31 06:20:12 +00:00
bcb4c05b14 ariadne: add alertmanager url 2026-01-30 21:57:05 -03:00
flux-bot
d00a09fb58 chore(maintenance): automated image update 2026-01-31 00:54:47 +00:00
flux-bot
a22ff047f7 chore(maintenance): automated image update 2026-01-31 00:40:46 +00:00
flux-bot
fef5d7d26a chore(maintenance): automated image update 2026-01-30 23:54:41 +00:00
fa60fa124c comms: suspend mas-local-users-ensure 2026-01-30 17:46:46 -03:00
30c1192978 comms: bump mas-local-users-ensure job 2026-01-30 17:44:42 -03:00
644be2c575 comms: bump comms-secrets-ensure job 2026-01-30 17:42:28 -03:00
29d1bf9f4e comms: run mas-local-users-ensure job (retry) 2026-01-30 17:37:42 -03:00
9bdab331b6 comms: suspend mas-local-users-ensure job 2026-01-30 17:33:55 -03:00
8f49ac2d63 comms: run mas-local-users-ensure job 2026-01-30 17:29:29 -03:00
flux-bot
43b9cd27ed chore(maintenance): automated image update 2026-01-30 20:18:24 +00:00
580ac4950b comms: add atlas-genius bot 2026-01-30 17:17:59 -03:00
flux-bot
d677e83423 chore(comms): automated image update 2026-01-30 20:07:20 +00:00
flux-bot
bff55a6dc7 chore(bstein-dev-home): automated image update 2026-01-30 20:05:30 +00:00
flux-bot
0465658ba7 chore(bstein-dev-home): automated image update 2026-01-30 20:02:30 +00:00
flux-bot
3e484ba726 chore(comms): automated image update 2026-01-30 19:53:19 +00:00
flux-bot
088bb3b435 chore(comms): automated image update 2026-01-30 19:42:22 +00:00
flux-bot
e81bad9d47 chore(maintenance): automated image update 2026-01-30 13:21:48 +00:00
3f11a065a3 atlasbot: support quick/smart Matrix accounts 2026-01-30 10:21:07 -03:00
flux-bot
ec6375f31d chore(maintenance): automated image update 2026-01-30 05:19:07 +00:00
flux-bot
5a8360ed97 chore(maintenance): automated image update 2026-01-30 03:15:56 +00:00
flux-bot
9e75f82d43 chore(comms): automated image update 2026-01-29 23:54:42 +00:00
flux-bot
7ac26eb0dd chore(maintenance): automated image update 2026-01-29 19:56:19 +00:00
00d2f6a61f comms: bump atlasbot to 0.1.0-32 2026-01-29 16:51:43 -03:00
flux-bot
687ca2c22d chore(comms): automated image update 2026-01-29 19:50:22 +00:00
52281ca2ec comms: bump atlasbot to 0.1.0-31 2026-01-29 16:09:15 -03:00
flux-bot
8850e9fdf1 chore(comms): automated image update 2026-01-29 19:08:18 +00:00
a253993451 comms: bump atlasbot to 0.1.0-30 2026-01-29 14:56:59 -03:00
flux-bot
aeff2bbe73 chore(comms): automated image update 2026-01-29 17:55:12 +00:00
39616b2435 comms: bump atlasbot 0.1.0-29 2026-01-29 14:18:51 -03:00
flux-bot
b3d8674499 chore(maintenance): automated image update 2026-01-29 16:43:04 +00:00
3ca0fb352d sso: suspend execute-actions email test job 2026-01-29 13:41:41 -03:00
f7ea7d57e9 sso: send execute-actions email to robotuser 2026-01-29 13:40:45 -03:00
flux-bot
a418844f61 chore(maintenance): automated image update 2026-01-29 16:35:03 +00:00
96d914d02c comms: bump atlasbot to 0.1.0-28 2026-01-29 13:33:39 -03:00
e6c031829a sso: suspend keycloak oneoff jobs 2026-01-29 13:30:10 -03:00
ebfb19c34e sso: rerun execute-actions email test 2026-01-29 13:28:32 -03:00
4fedec3999 sso: set keycloak smtp to postmark 2026-01-29 13:27:28 -03:00
55f78f2eb7 sso: rerun execute-actions email test 2026-01-29 13:23:59 -03:00
ab5ef933d8 sso: run keycloak execute-actions email test 2026-01-29 13:21:40 -03:00
3e23109229 sso: suspend realm settings job 2026-01-29 13:20:11 -03:00
d18c06ad31 sso: rerun keycloak realm settings 2026-01-29 13:10:31 -03:00
292a6b7e04 monitoring: stabilize alert queries 2026-01-29 13:07:55 -03:00
flux-bot
d7fd5682f3 chore(maintenance): automated image update 2026-01-29 16:07:01 +00:00
bedab04b22 atlasbot: bump to 0.1.0-27 2026-01-29 13:06:37 -03:00
6d7a32ce11 atlasbot: align to installed qwen model 2026-01-29 10:25:57 -03:00
87ded58aca atlasbot: align models and bump image 2026-01-29 10:17:38 -03:00
flux-bot
5f30ab73bf chore(comms): automated image update 2026-01-29 13:16:50 +00:00
flux-bot
3f2d2e5fdb chore(maintenance): automated image update 2026-01-29 13:16:46 +00:00
flux-bot
f55e9a6043 chore(comms): automated image update 2026-01-29 12:23:45 +00:00
flux-bot
7de15db57a chore(comms): automated image update 2026-01-29 11:47:42 +00:00
flux-bot
265f809f8f chore(maintenance): automated image update 2026-01-29 11:43:38 +00:00
flux-bot
e4d19fc5b4 chore(comms): automated image update 2026-01-29 11:42:41 +00:00
flux-bot
d10eace338 chore(maintenance): automated image update 2026-01-29 10:45:37 +00:00
78afc97db2 atlasbot: bump image and allow longhorn read 2026-01-29 07:45:24 -03:00
flux-bot
3c0d4d0f4f chore(comms): automated image update 2026-01-29 10:44:37 +00:00
flux-bot
d73d6d7c01 chore(comms): automated image update 2026-01-29 09:21:30 +00:00
flux-bot
af02ee7abf chore(comms): automated image update 2026-01-29 09:16:59 +00:00
630a596cb6 atlasbot: bump image tag 2026-01-29 06:16:43 -03:00
flux-bot
d2729138b6 chore(maintenance): automated image update 2026-01-29 09:12:26 +00:00
a6fbcc8669 maintenance(ariadne): allow apps/events, bump image tag 2026-01-29 06:09:36 -03:00
flux-bot
d91d632496 chore(maintenance): automated image update 2026-01-29 09:01:41 +00:00
flux-bot
3a9949a24d chore(comms): automated image update 2026-01-29 08:01:25 +00:00
b045506516 vault: allow kubernetes auth login 2026-01-29 02:22:51 -03:00
flux-bot
3f24de03d1 chore(maintenance): automated image update 2026-01-29 04:58:20 +00:00
flux-bot
a3ffcb2ea1 chore(comms): automated image update 2026-01-29 04:58:10 +00:00
flux-bot
314a922109 chore(comms): automated image update 2026-01-29 04:56:21 +00:00
flux-bot
2ed4762fab chore(maintenance): automated image update 2026-01-29 04:56:05 +00:00
1c6d572559 images: bump ariadne and atlasbot 2026-01-29 01:55:07 -03:00
flux-bot
58cc15a7e0 chore(comms): automated image update 2026-01-29 01:35:52 +00:00
flux-bot
3da28531fd chore(maintenance): automated image update 2026-01-29 01:35:03 +00:00
flux-bot
58f818cebc chore(maintenance): automated image update 2026-01-28 23:47:54 +00:00
flux-bot
cff7ec922e chore(comms): automated image update 2026-01-28 23:46:43 +00:00
flux-bot
a49f0580da chore(maintenance): automated image update 2026-01-28 23:43:54 +00:00
flux-bot
10d4f015b2 chore(maintenance): automated image update 2026-01-28 23:36:54 +00:00
flux-bot
669849b883 chore(maintenance): automated image update 2026-01-28 23:31:53 +00:00
flux-bot
9ce9470677 chore(comms): automated image update 2026-01-28 22:59:41 +00:00
c3555d59f7 monitoring: fix GPU share attribution 2026-01-28 19:08:53 -03:00
28af553498 monitoring: de-dupe ariadne schedule alert 2026-01-28 18:45:15 -03:00
d42385de3e comms: suspend synapse admin ensure job 2026-01-28 18:39:28 -03:00
6104035474 maintenance: restart ariadne after synapse token update 2026-01-28 18:37:49 -03:00
dabf043ce6 comms: force admin token to use othrys-seeder 2026-01-28 18:35:28 -03:00
9b8ef436c8 comms: fix vault_put indentation 2026-01-28 18:31:48 -03:00
8cf24a6c96 comms: source admin token from seeder access tokens 2026-01-28 18:29:49 -03:00
2797464b45 comms: mint synapse admin token with syt_ prefix 2026-01-28 18:20:37 -03:00
320cf901ba comms: rerun synapse admin ensure with device 2026-01-28 18:17:24 -03:00
5bb0fc126e comms: ensure synapse device for admin token 2026-01-28 18:10:55 -03:00
1b8271ed61 maintenance: restart ariadne after synapse token 2026-01-28 17:59:25 -03:00
fab030e9c0 comms: rotate invalid synapse admin token 2026-01-28 17:57:39 -03:00
be6b65cedb comms: rerun synapse admin ensure job 2026-01-28 17:54:53 -03:00
cbed39bd64 comms: run synapse admin ensure job 2026-01-28 17:50:01 -03:00
445622e936 comms: use bundled synapse admin ensure image 2026-01-28 17:47:58 -03:00
17e28d2891 maintenance: restart ariadne to reload secrets 2026-01-28 17:31:25 -03:00
8325827c41 comms: suspend synapse admin ensure job 2026-01-28 17:29:07 -03:00
7c7ed38ead comms: fix synapse admin ensure vault login 2026-01-28 17:27:39 -03:00
5d2fb32ff8 comms: rebuild synapse admin ensure job 2026-01-28 17:25:34 -03:00
flux-bot
b62a5ba3fb chore(maintenance): automated image update 2026-01-28 20:21:37 +00:00
359445ab43 comms: run synapse admin ensure job 2026-01-28 17:19:55 -03:00
4d1382cfc9 maintenance: track ariadne latest image 2026-01-28 14:04:58 -03:00
b66c7de5fd monitoring: avoid ariadne alert title conflict 2026-01-28 14:02:12 -03:00
3d4e5bdde1 monitoring: disable legacy cron alert 2026-01-28 13:58:28 -03:00
f37baf2447 monitoring: restart grafana to reload alerts 2026-01-28 13:53:33 -03:00
ad3d8d75c9 monitoring: reuse maint-cron uid for ariadne alert 2026-01-28 13:52:12 -03:00
4ecfdcef7c monitoring: restart grafana for ariadne alerts 2026-01-28 13:49:41 -03:00
flux-bot
63ae3e3f6f chore(comms): automated image update 2026-01-28 16:49:09 +00:00
eab2ce50b1 monitoring: alert on ariadne schedules 2026-01-28 13:47:54 -03:00
flux-bot
523db13be0 chore(maintenance): automated image update 2026-01-28 16:47:19 +00:00
6a3f8cffe1 comms: fix MAS job indentation 2026-01-28 13:25:51 -03:00
80a0f424cd comms: tolerate MAS login rate limits 2026-01-28 13:23:25 -03:00
8e9d85ccd7 comms: stop seeding atlas bots in synapse job 2026-01-28 13:18:44 -03:00
85abd589d4 comms: inject quick/smart bot creds for MAS job 2026-01-28 13:12:02 -03:00
flux-bot
bfbd707293 chore(bstein-dev-home): automated image update 2026-01-28 16:07:02 +00:00
flux-bot
526a895775 chore(bstein-dev-home): automated image update 2026-01-28 16:06:02 +00:00
38e1eba112 comms: add atlas quick/smart bots 2026-01-28 13:01:09 -03:00
flux-bot
f9e6cabe6d chore(comms): automated image update 2026-01-28 15:59:05 +00:00
36bb695c15 monitoring: fix grafana pod annotation indent 2026-01-28 12:37:42 -03:00
flux-bot
b449b65244 chore(comms): automated image update 2026-01-28 15:35:02 +00:00
1a9651914e monitoring: restart grafana after alert fix 2026-01-28 12:32:56 -03:00
flux-bot
9e5be20983 chore(comms): automated image update 2026-01-28 15:32:23 +00:00
d55bc98bbe monitoring: fix postmark alert metrics 2026-01-28 12:31:33 -03:00
flux-bot
46d677f5e7 chore(comms): automated image update 2026-01-28 15:22:49 +00:00
ef63b0f9f3 feat: add nats platform kustomization 2026-01-28 12:15:39 -03:00
111ae84255 chore: move flux sync to feature/atlasbot 2026-01-28 12:12:23 -03:00
d78a3c2550 comms: allow atlasbot to pull harbor images 2026-01-28 11:54:11 -03:00
fb89158622 atlasbot: move to service image and add nats queue infra 2026-01-28 11:52:37 -03:00
94 changed files with 8133 additions and 383 deletions

1
.gitignore vendored
View File

@ -2,6 +2,7 @@
!README.md !README.md
!knowledge/**/*.md !knowledge/**/*.md
!services/comms/knowledge/**/*.md !services/comms/knowledge/**/*.md
!services/atlasbot/knowledge/**/*.md
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
.pytest_cache .pytest_cache

View File

@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/atlasbot

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: atlasbot
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/atlasbot
targetNamespace: ai
timeout: 2m
dependsOn:
- name: ai-llm

View File

@ -13,14 +13,14 @@ spec:
git: git:
checkout: checkout:
ref: ref:
branch: feature/ariadne branch: feature/atlasbot
commit: commit:
author: author:
email: ops@bstein.dev email: ops@bstein.dev
name: flux-bot name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update" messageTemplate: "chore(bstein-dev-home): automated image update"
push: push:
branch: feature/ariadne branch: feature/atlasbot
update: update:
strategy: Setters strategy: Setters
path: services/bstein-dev-home path: services/bstein-dev-home

View File

@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/comms/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: comms
namespace: comms
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(comms): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/comms

View File

@ -6,6 +6,9 @@ resources:
- vault/kustomization.yaml - vault/kustomization.yaml
- vaultwarden/kustomization.yaml - vaultwarden/kustomization.yaml
- comms/kustomization.yaml - comms/kustomization.yaml
- comms/image-automation.yaml
- atlasbot/kustomization.yaml
- atlasbot/image-automation.yaml
- crypto/kustomization.yaml - crypto/kustomization.yaml
- monerod/kustomization.yaml - monerod/kustomization.yaml
- pegasus/kustomization.yaml - pegasus/kustomization.yaml

View File

@ -9,7 +9,7 @@ metadata:
spec: spec:
interval: 1m0s interval: 1m0s
ref: ref:
branch: feature/ariadne branch: feature/atlasbot
secretRef: secretRef:
name: flux-system-gitea name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -16,5 +16,6 @@ resources:
- longhorn/kustomization.yaml - longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml - longhorn-ui/kustomization.yaml
- postgres/kustomization.yaml - postgres/kustomization.yaml
- nats/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml - ../platform/vault-csi/kustomization.yaml
- ../platform/vault-injector/kustomization.yaml - ../platform/vault-injector/kustomization.yaml

View File

@ -13,14 +13,14 @@ spec:
git: git:
checkout: checkout:
ref: ref:
branch: feature/ariadne branch: feature/atlasbot
commit: commit:
author: author:
email: ops@bstein.dev email: ops@bstein.dev
name: flux-bot name: flux-bot
messageTemplate: "chore(maintenance): automated image update" messageTemplate: "chore(maintenance): automated image update"
push: push:
branch: feature/ariadne branch: feature/atlasbot
update: update:
strategy: Setters strategy: Setters
path: services/maintenance path: services/maintenance

View File

@ -0,0 +1,21 @@
# clusters/atlas/flux-system/platform/nats/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: nats
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/nats
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: nats
healthChecks:
- apiVersion: apps/v1
kind: StatefulSet
name: nats
namespace: nats
wait: true

View File

@ -0,0 +1,3 @@
FROM python:3.11-slim
RUN pip install --no-cache-dir psycopg2-binary bcrypt

View File

@ -6,6 +6,7 @@ resources:
- ../modules/profiles/atlas-ha - ../modules/profiles/atlas-ha
- coredns-custom.yaml - coredns-custom.yaml
- coredns-deployment.yaml - coredns-deployment.yaml
- longhorn-node-taints.yaml
- ntp-sync-daemonset.yaml - ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml - ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml - ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -0,0 +1,40 @@
# infrastructure/core/longhorn-node-taints.yaml
apiVersion: v1
kind: Node
metadata:
name: titan-13
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-15
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-17
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-19
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule

View File

@ -0,0 +1,10 @@
# infrastructure/longhorn/core/backup-target.yaml
apiVersion: longhorn.io/v1beta2
kind: BackupTarget
metadata:
name: default
namespace: longhorn-system
spec:
backupTargetURL: "s3://atlas-soteria@us-west-004/"
credentialSecret: longhorn-backup-b2
pollInterval: 5m0s

View File

@ -6,6 +6,39 @@ metadata:
namespace: longhorn-system namespace: longhorn-system
spec: spec:
interval: 30m interval: 30m
postRenderers:
- kustomize:
patches:
- target:
kind: Service
name: longhorn-conversion-webhook
namespace: longhorn-system
patch: |
- op: add
path: /spec/publishNotReadyAddresses
value: true
- target:
kind: Service
name: longhorn-admission-webhook
namespace: longhorn-system
patch: |
- op: add
path: /spec/publishNotReadyAddresses
value: true
- target:
kind: DaemonSet
name: longhorn-manager
namespace: longhorn-system
patch: |
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/path
value: /v1/healthz
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/port
value: 9500
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/scheme
value: HTTP
chart: chart:
spec: spec:
chart: longhorn chart: longhorn
@ -34,7 +67,7 @@ spec:
createSecret: false createSecret: false
registrySecret: longhorn-registry registrySecret: longhorn-registry
image: image:
pullPolicy: Always pullPolicy: IfNotPresent
longhorn: longhorn:
engine: engine:
repository: registry.bstein.dev/infra/longhorn-engine repository: registry.bstein.dev/infra/longhorn-engine
@ -77,4 +110,4 @@ spec:
repository: registry.bstein.dev/infra/longhorn-livenessprobe repository: registry.bstein.dev/infra/longhorn-livenessprobe
tag: v2.16.0 tag: v2.16.0
defaultSettings: defaultSettings:
systemManagedPodsImagePullPolicy: Always systemManagedPodsImagePullPolicy: IfNotPresent

View File

@ -6,6 +6,7 @@ resources:
- vault-serviceaccount.yaml - vault-serviceaccount.yaml
- secretproviderclass.yaml - secretproviderclass.yaml
- vault-sync-deployment.yaml - vault-sync-deployment.yaml
- backup-target.yaml
- helmrelease.yaml - helmrelease.yaml
- longhorn-settings-ensure-job.yaml - longhorn-settings-ensure-job.yaml

View File

@ -13,9 +13,27 @@ spec:
- objectName: "harbor-pull__dockerconfigjson" - objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull" secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson" secretKey: "dockerconfigjson"
- objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ACCESS_KEY_ID"
- objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_SECRET_ACCESS_KEY"
- objectName: "longhorn_backup__AWS_ENDPOINTS"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ENDPOINTS"
secretObjects: secretObjects:
- secretName: longhorn-registry - secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson type: kubernetes.io/dockerconfigjson
data: data:
- objectName: harbor-pull__dockerconfigjson - objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson key: .dockerconfigjson
- secretName: longhorn-backup-b2
type: Opaque
data:
- objectName: longhorn_backup__AWS_ACCESS_KEY_ID
key: AWS_ACCESS_KEY_ID
- objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
- objectName: longhorn_backup__AWS_ENDPOINTS
key: AWS_ENDPOINTS

View File

@ -0,0 +1,17 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: nats-config
namespace: nats
labels:
app: nats
component: config
annotations:
description: "NATS JetStream configuration"
data:
nats.conf: |
jetstream {
store_dir: /data
max_mem_store: 128MB
max_file_store: 1GB
}

View File

@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- configmap.yaml
- service.yaml
- statefulset.yaml

View File

@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: nats

View File

@ -0,0 +1,17 @@
apiVersion: v1
kind: Service
metadata:
name: nats
namespace: nats
labels:
app: nats
spec:
selector:
app: nats
ports:
- name: client
port: 4222
targetPort: 4222
- name: monitoring
port: 8222
targetPort: 8222

View File

@ -0,0 +1,54 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: nats
namespace: nats
labels:
app: nats
spec:
serviceName: nats
replicas: 1
selector:
matchLabels:
app: nats
template:
metadata:
labels:
app: nats
spec:
containers:
- name: nats
image: nats:2.10.18
args:
- "-c"
- "/etc/nats/nats.conf"
ports:
- name: client
containerPort: 4222
- name: monitoring
containerPort: 8222
volumeMounts:
- name: config
mountPath: /etc/nats
- name: data
mountPath: /data
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: config
configMap:
name: nats-config
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 2Gi

View File

@ -47,6 +47,7 @@ PERCENT_THRESHOLDS = {
} }
NAMESPACE_CPU_WINDOW = "1m" NAMESPACE_CPU_WINDOW = "1m"
GPU_RESOURCE_REGEX = r"nvidia[.]com/gpu.*|nvidia_com_gpu.*"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Cluster metadata # Cluster metadata
@ -235,13 +236,16 @@ def gpu_util_by_hostname():
def gpu_node_labels(): def gpu_node_labels():
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}' return (
f'(max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0))'
' or kube_node_labels{label_jetson="true"}'
)
def gpu_requests_by_namespace_node(scope_var): def gpu_requests_by_namespace_node(scope_var):
return ( return (
"sum by (namespace,node) (" "sum by (namespace,node) ("
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
"* on(namespace,pod) group_left(node) kube_pod_info " "* on(namespace,pod) group_left(node) kube_pod_info "
f"* on(node) group_left() ({gpu_node_labels()})" f"* on(node) group_left() ({gpu_node_labels()})"
")" ")"
@ -253,7 +257,7 @@ def gpu_usage_by_namespace(scope_var):
total_by_node = f"sum by (node) ({requests_by_ns})" total_by_node = f"sum by (node) ({requests_by_ns})"
return ( return (
"sum by (namespace) (" "sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
f"* on(node) group_left() ({gpu_util_by_node()})" f"* on(node) group_left() ({gpu_util_by_node()})"
")" ")"
) )
@ -419,16 +423,17 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
) )
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
ARIADNE_TEST_SUCCESS_RATE = ( TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
TEST_SUCCESS_RATE = (
"100 * " "100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) ' f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
"/ clamp_min(" "/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)' f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
) )
ARIADNE_TEST_FAILURES_24H = ( TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
) )
POSTGRES_CONN_USED = ( POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
@ -1290,23 +1295,25 @@ def build_overview():
}, },
} }
) )
panels.append( test_success = timeseries_panel(
timeseries_panel(
42, 42,
"Ariadne Test Success Rate", "Platform Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE, TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14}, {"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent", unit="percent",
max_value=100, max_value=100,
legend=None, legend=None,
legend_display="list", legend_display="list",
) )
test_success["description"] = (
"Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. "
"Add new test series there first so they roll up here."
) )
panels.append( panels.append(test_success)
bargauge_panel( test_failures = bargauge_panel(
43, 43,
"Tests with Failures (24h)", "Platform Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H, TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14}, {"h": 6, "w": 6, "x": 18, "y": 14},
unit="none", unit="none",
instant=True, instant=True,
@ -1331,7 +1338,10 @@ def build_overview():
], ],
}, },
) )
test_failures["description"] = (
"This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
) )
panels.append(test_failures)
cpu_scope = "$namespace_scope_cpu" cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu" gpu_scope = "$namespace_scope_gpu"
@ -2649,29 +2659,31 @@ def build_jobs_dashboard():
legend="{{status}}", legend="{{status}}",
) )
) )
panels.append( coverage_panel = stat_panel(
stat_panel(
17, 17,
"Ariadne CI Coverage (%)", "Platform CI Coverage (%)",
ARIADNE_CI_COVERAGE, TEST_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11}, {"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent", unit="percent",
decimals=1, decimals=1,
instant=True, instant=True,
legend="{{branch}}", legend="{{branch}}",
) )
) coverage_panel["description"] = "Internal source panel for Atlas Overview automation test rollups."
panels.append( panels.append(coverage_panel)
table_panel( tests_panel = table_panel(
18, 18,
"Ariadne CI Tests (latest)", "Platform CI Tests (latest)",
ARIADNE_CI_TESTS, TEST_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11}, {"h": 6, "w": 12, "x": 12, "y": 11},
unit="none", unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
instant=True, instant=True,
) )
tests_panel["description"] = (
"Atlas Overview test panels depend on these internal repo-tagged CI series."
) )
panels.append(tests_panel)
return { return {
"uid": "atlas-jobs", "uid": "atlas-jobs",

View File

@ -539,9 +539,9 @@ def main() -> int:
help="Write generated files (otherwise just print a summary).", help="Write generated files (otherwise just print a summary).",
) )
ap.add_argument( ap.add_argument(
"--sync-comms", "--sync-atlasbot",
action="store_true", action="store_true",
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.", help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
) )
args = ap.parse_args() args = ap.parse_args()
@ -632,10 +632,10 @@ def main() -> int:
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
if args.sync_comms: if args.sync_atlasbot:
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
_sync_tree(out_dir, comms_dir) _sync_tree(out_dir, atlasbot_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}") print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
return 0 return 0

View File

@ -3,7 +3,7 @@ apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
name: atlasbot name: atlasbot
namespace: comms namespace: ai
labels: labels:
app: atlasbot app: atlasbot
spec: spec:
@ -18,7 +18,7 @@ spec:
annotations: annotations:
checksum/atlasbot-configmap: manual-atlasbot-101 checksum/atlasbot-configmap: manual-atlasbot-101
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms" vault.hashicorp.com/role: "ai"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
vault.hashicorp.com/agent-inject-template-turn-secret: | vault.hashicorp.com/agent-inject-template-turn-secret: |
{{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}} {{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
@ -28,6 +28,15 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: | vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}} {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: | vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}} {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -58,17 +67,17 @@ spec:
hardware: rpi5 hardware: rpi5
containers: containers:
- name: atlasbot - name: atlasbot
image: python:3.11-slim image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
command: ["/bin/sh","-c"] command: ["/bin/sh","-c"]
args: args:
- | - |
. /vault/scripts/comms_vault_env.sh . /vault/scripts/atlasbot_vault_env.sh
exec python /app/bot.py exec python -m atlasbot.main
env: env:
- name: MATRIX_BASE - name: MATRIX_BASE
value: http://othrys-synapse-matrix-synapse:8008 value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
- name: AUTH_BASE - name: AUTH_BASE
value: http://matrix-authentication-service:8080 value: http://matrix-authentication-service.comms.svc.cluster.local:8080
- name: KB_DIR - name: KB_DIR
value: /kb value: /kb
- name: VM_URL - name: VM_URL
@ -76,27 +85,69 @@ spec:
- name: ARIADNE_STATE_URL - name: ARIADNE_STATE_URL
value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
- name: BOT_USER - name: BOT_USER
value: atlasbot value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
- name: BOT_MENTIONS - name: BOT_MENTIONS
value: atlasbot,aatlasbot,atlas_quick,atlas_smart value: atlas-quick,atlas-smart,atlas-genius
- name: OLLAMA_URL - name: OLLAMA_URL
value: http://ollama.ai.svc.cluster.local:11434 value: http://ollama.ai.svc.cluster.local:11434
- name: OLLAMA_MODEL - name: OLLAMA_MODEL
value: qwen2.5:14b-instruct
- name: ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct-q4_0 value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_DEEP - name: ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct value: qwen2.5-coder:7b-instruct-q4_0
- name: ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_GENIUS
value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_FALLBACK_MODEL - name: OLLAMA_FALLBACK_MODEL
value: qwen2.5:14b-instruct-q4_0 value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_TIMEOUT_SEC - name: OLLAMA_TIMEOUT_SEC
value: "600" value: "600"
- name: OLLAMA_RETRIES
value: "0"
- name: ATLASBOT_THINKING_INTERVAL_SEC - name: ATLASBOT_THINKING_INTERVAL_SEC
value: "120" value: "30"
- name: ATLASBOT_QUICK_TIME_BUDGET_SEC
value: "15"
- name: ATLASBOT_SMART_TIME_BUDGET_SEC
value: "45"
- name: ATLASBOT_GENIUS_TIME_BUDGET_SEC
value: "180"
- name: ATLASBOT_SNAPSHOT_TTL_SEC - name: ATLASBOT_SNAPSHOT_TTL_SEC
value: "30" value: "30"
- name: ATLASBOT_HTTP_PORT - name: ATLASBOT_HTTP_PORT
value: "8090" value: "8090"
- name: ATLASBOT_STATE_DB
value: /data/atlasbot_state.db
- name: ATLASBOT_QUEUE_ENABLED
value: "false"
- name: ATLASBOT_DEBUG_PIPELINE
value: "true"
- name: ATLASBOT_NATS_URL
value: nats://nats.nats.svc.cluster.local:4222
- name: ATLASBOT_NATS_STREAM
value: atlasbot
- name: ATLASBOT_NATS_SUBJECT
value: atlasbot.requests
- name: ATLASBOT_FAST_MAX_ANGLES
value: "2"
- name: ATLASBOT_SMART_MAX_ANGLES
value: "5"
- name: ATLASBOT_FAST_MAX_CANDIDATES
value: "2"
- name: ATLASBOT_SMART_MAX_CANDIDATES
value: "6"
- name: ATLASBOT_FAST_LLM_CALLS_MAX
value: "8"
- name: ATLASBOT_SMART_LLM_CALLS_MAX
value: "24"
- name: ATLASBOT_GENIUS_LLM_CALLS_MAX
value: "72"
ports: ports:
- name: http - name: http
containerPort: 8090 containerPort: 8090
@ -108,19 +159,15 @@ spec:
cpu: 500m cpu: 500m
memory: 512Mi memory: 512Mi
volumeMounts: volumeMounts:
- name: code
mountPath: /app/bot.py
subPath: bot.py
- name: kb - name: kb
mountPath: /kb mountPath: /kb
readOnly: true readOnly: true
- name: vault-scripts - name: vault-scripts
mountPath: /vault/scripts mountPath: /vault/scripts
readOnly: true readOnly: true
- name: atlasbot-state
mountPath: /data
volumes: volumes:
- name: code
configMap:
name: atlasbot
- name: kb - name: kb
configMap: configMap:
name: atlas-kb name: atlas-kb
@ -139,5 +186,7 @@ spec:
path: diagrams/atlas-http.mmd path: diagrams/atlas-http.mmd
- name: vault-scripts - name: vault-scripts
configMap: configMap:
name: comms-vault-env name: atlasbot-vault-env
defaultMode: 0555 defaultMode: 0555
- name: atlasbot-state
emptyDir: {}

View File

@ -3,7 +3,9 @@ apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
name: atlasbot name: atlasbot
namespace: comms namespace: ai
imagePullSecrets:
- name: harbor-regcred
--- ---
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
@ -43,5 +45,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: atlasbot name: atlasbot
namespace: comms namespace: ai

View File

@ -2,7 +2,7 @@ apiVersion: v1
kind: Service kind: Service
metadata: metadata:
name: atlasbot name: atlasbot
namespace: comms namespace: ai
labels: labels:
app: atlasbot app: atlasbot
spec: spec:

View File

@ -0,0 +1,26 @@
# services/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
name: flux-bot
email: ops@bstein.dev
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
path: services/atlasbot
strategy: Setters

View File

@ -0,0 +1,23 @@
# services/comms/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: atlasbot
namespace: ai
spec:
image: registry.bstein.dev/bstein/atlasbot
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: atlasbot
namespace: ai
spec:
imageRepositoryRef:
name: atlasbot
policy:
semver:
range: ">=0.1.0-0"

View File

@ -0,0 +1,22 @@
Atlas Knowledge Base (KB)
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
- Accurate (grounded in GitOps + read-only cluster tools)
- Maintainable (small docs + deterministic generators)
- Safe (no secrets; refer to Secret/Vault paths by name only)
Layout
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
Regeneration
- Update manifests/docs, then regenerate generated artifacts:
- `python scripts/knowledge_render_atlas.py --write`
Authoring rules
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
- Keep each runbook small; one topic per file; use headings.
- When in doubt, link to the exact file path in this repo that configures the behavior.

View File

@ -0,0 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,234 @@
flowchart LR
host_auth_bstein_dev["auth.bstein.dev"]
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
host_auth_bstein_dev --> svc_sso_oauth2_proxy
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
host_bstein_dev["bstein.dev"]
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
host_bstein_dev --> svc_comms_matrix_wellknown
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
wl_comms_element_call["comms/element-call (Deployment)"]
svc_comms_element_call --> wl_comms_element_call
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
host_ci_bstein_dev["ci.bstein.dev"]
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
host_ci_bstein_dev --> svc_jenkins_jenkins
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
svc_jenkins_jenkins --> wl_jenkins_jenkins
host_cloud_bstein_dev["cloud.bstein.dev"]
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
svc_comms_livekit["comms/livekit (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
host_mail_bstein_dev["mail.bstein.dev"]
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
svc_nextcloud_collabora --> wl_nextcloud_collabora
host_pegasus_bstein_dev["pegasus.bstein.dev"]
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
host_scm_bstein_dev["scm.bstein.dev"]
svc_gitea_gitea["gitea/gitea (Service)"]
host_scm_bstein_dev --> svc_gitea_gitea
wl_gitea_gitea["gitea/gitea (Deployment)"]
svc_gitea_gitea --> wl_gitea_gitea
host_secret_bstein_dev["secret.bstein.dev"]
svc_vault_vault["vault/vault (Service)"]
host_secret_bstein_dev --> svc_vault_vault
wl_vault_vault["vault/vault (StatefulSet)"]
svc_vault_vault --> wl_vault_vault
host_sso_bstein_dev["sso.bstein.dev"]
svc_sso_keycloak["sso/keycloak (Service)"]
host_sso_bstein_dev --> svc_sso_keycloak
wl_sso_keycloak["sso/keycloak (Deployment)"]
svc_sso_keycloak --> wl_sso_keycloak
host_stream_bstein_dev["stream.bstein.dev"]
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
subgraph bstein_dev_home[bstein-dev-home]
svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend
svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend
svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway
end
subgraph comms[comms]
svc_comms_matrix_wellknown
wl_comms_matrix_wellknown
svc_comms_element_call
wl_comms_element_call
svc_comms_livekit_token_service
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
svc_jellyfin_jellyfin
wl_jellyfin_jellyfin
end
subgraph jenkins[jenkins]
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
end
subgraph mailu_mailserver[mailu-mailserver]
svc_mailu_mailserver_mailu_front
end
subgraph nextcloud[nextcloud]
svc_nextcloud_nextcloud
wl_nextcloud_nextcloud
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy
svc_sso_keycloak
wl_sso_keycloak
end
subgraph vault[vault]
svc_vault_vault
wl_vault_vault
end
subgraph vaultwarden[vaultwarden]
svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden
end

View File

@ -0,0 +1,29 @@
# services/atlasbot/kustomization.yaml
# Kustomize entry point for the atlasbot service; everything lands in the
# `ai` namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
  - atlasbot-deployment.yaml
  - atlasbot-service.yaml
  - atlasbot-rbac.yaml
  - secretproviderclass.yaml
  - vault-sync-deployment.yaml
  - image.yaml              # Flux ImageRepository + ImagePolicy
  - image-automation.yaml   # Flux ImageUpdateAutomation (writes newTag below)
images:
  # The tag is rewritten automatically by Flux image automation via the
  # $imagepolicy marker comment; do not edit it by hand.
  - name: registry.bstein.dev/bstein/atlasbot
    newTag: 0.1.2-106 # {"$imagepolicy": "ai:atlasbot:tag"}
configMapGenerator:
  # Vault env bootstrap script (scripts/atlasbot_vault_env.sh) mounted into
  # the bot pod.
  - name: atlasbot-vault-env
    files:
      - atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
    options:
      # Stable name so the Deployment can reference it without a hash suffix.
      disableNameSuffixHash: true
  # Knowledge-base bundle: generated catalog/runbooks/diagram plus the index
  # (see knowledge/ README for the regeneration workflow).
  - name: atlas-kb
    files:
      - INDEX.md=knowledge/INDEX.md
      - atlas.json=knowledge/catalog/atlas.json
      - atlas-summary.json=knowledge/catalog/atlas-summary.json
      - metrics.json=knowledge/catalog/metrics.json
      - runbooks.json=knowledge/catalog/runbooks.json
      - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -0,0 +1,44 @@
#!/usr/bin/env sh
# Resolve atlasbot runtime credentials from the Vault-agent/CSI rendered files
# under /vault/secrets and export them as environment variables for the bot.
# Never echoes secret values.
set -eu

vault_dir="/vault/secrets"

# Print a *required* secret with all CR/LF characters stripped.
# If the file is missing, tr fails and (via set -e) the script aborts —
# see the NOTE below about keeping assignment separate from export.
read_secret() {
  tr -d '\r\n' < "${vault_dir}/$1"
}

# Print an *optional* secret (CR/LF stripped), or the empty string when the
# file does not exist.
read_optional() {
  if [ -f "${vault_dir}/$1" ]; then
    tr -d '\r\n' < "${vault_dir}/$1"
  else
    printf ''
  fi
}

# NOTE: required secrets are assigned first and exported on a separate line.
# `export VAR="$(cmd)"` would return export's own (zero) status and mask a
# failing command substitution (ShellCheck SC2155), so under `set -e` a
# missing required secret would silently export an empty value instead of
# aborting. Plain `VAR="$(cmd)"` propagates the substitution's exit status.
TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_STATIC_AUTH_SECRET
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"

LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_API_SECRET
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"

BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS
# Per-mode bot passwords are optional; smart falls back to the base password
# and genius falls back to smart. Quick intentionally stays empty when unset
# (downstream treats an empty quick password as "quick mode not provisioned").
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
  export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
  export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi

SEEDER_PASS="$(read_secret seeder-pass)"
export SEEDER_PASS
CHAT_API_KEY="$(read_secret chat-matrix)"
export CHAT_API_KEY
CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"
export CHAT_API_HOMEPAGE

# MAS admin secret is consumed as a file path, not a value.
export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"

PGPASSWORD="$(read_secret synapse-db-pass)"
export PGPASSWORD
MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
export MAS_DB_PASSWORD
MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
export MATRIX_SHARED_SECRET
KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"
export KEYCLOAK_CLIENT_SECRET

View File

@ -0,0 +1,21 @@
# services/atlasbot/secretproviderclass.yaml
# Secrets Store CSI SecretProviderClass: fetches the Harbor pull credential
# (dockerconfigjson) from Vault KV and mirrors it into the harbor-regcred
# Kubernetes Secret consumed by image pulls and the Flux ImageRepository.
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
  name: atlasbot-vault
  namespace: ai
spec:
  provider: vault
  parameters:
    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
    roleName: "ai"  # Vault Kubernetes-auth role used by the mounting pod's SA
    objects: |
      - objectName: "harbor-pull__dockerconfigjson"
        secretPath: "kv/data/atlas/shared/harbor-pull"
        secretKey: "dockerconfigjson"
  # Sync the fetched object into a real Secret of type dockerconfigjson so
  # it can be referenced as an imagePullSecret / secretRef.
  secretObjects:
    - secretName: harbor-regcred
      type: kubernetes.io/dockerconfigjson
      data:
        - objectName: harbor-pull__dockerconfigjson
          key: .dockerconfigjson

View File

@ -0,0 +1,34 @@
# services/atlasbot/vault-sync-deployment.yaml
# Keeper deployment: its only job is to mount the atlasbot-vault
# SecretProviderClass so the secrets-store CSI driver materializes and keeps
# the synced harbor-regcred Secret alive (secretObjects are only created
# while at least one pod mounts the volume). The container just sleeps.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: atlasbot-vault-sync
  namespace: ai
spec:
  replicas: 1
  selector:
    matchLabels:
      app: atlasbot-vault-sync
  template:
    metadata:
      labels:
        app: atlasbot-vault-sync
    spec:
      # Uses the atlasbot SA, which maps to the Vault "ai" role.
      serviceAccountName: atlasbot
      containers:
        - name: sync
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - "sleep infinity"  # idle; the CSI mount below does the work
          volumeMounts:
            - name: vault-secrets
              mountPath: /vault/secrets
              readOnly: true
      volumes:
        - name: vault-secrets
          csi:
            driver: secrets-store.csi.k8s.io
            readOnly: true
            volumeAttributes:
              secretProviderClass: atlasbot-vault

View File

@ -68,7 +68,13 @@ spec:
- name: AI_CHAT_TIMEOUT_SEC - name: AI_CHAT_TIMEOUT_SEC
value: "480" value: "480"
- name: AI_ATLASBOT_ENDPOINT - name: AI_ATLASBOT_ENDPOINT
value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer
- name: AI_ATLASBOT_MODEL_FAST
value: qwen2.5-coder:7b-instruct-q4_0
- name: AI_ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct
- name: AI_ATLASBOT_MODEL_GENIUS
value: qwen2.5:14b-instruct
- name: AI_ATLASBOT_TIMEOUT_SEC - name: AI_ATLASBOT_TIMEOUT_SEC
value: "30" value: "30"
- name: AI_NODE_NAME - name: AI_NODE_NAME

View File

@ -20,9 +20,9 @@ resources:
- ingress.yaml - ingress.yaml
images: images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend - name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator: configMapGenerator:
- name: chat-ai-gateway - name: chat-ai-gateway
namespace: bstein-dev-home namespace: bstein-dev-home

View File

@ -13,10 +13,7 @@ resources:
- element-call-deployment.yaml - element-call-deployment.yaml
- guest-register-deployment.yaml - guest-register-deployment.yaml
- guest-register-service.yaml - guest-register-service.yaml
- atlasbot-deployment.yaml
- atlasbot-service.yaml
- wellknown.yaml - wellknown.yaml
- atlasbot-rbac.yaml
- mas-secrets-ensure-rbac.yaml - mas-secrets-ensure-rbac.yaml
- comms-secrets-ensure-rbac.yaml - comms-secrets-ensure-rbac.yaml
- mas-db-ensure-rbac.yaml - mas-db-ensure-rbac.yaml
@ -43,7 +40,6 @@ resources:
- livekit-ingress.yaml - livekit-ingress.yaml
- livekit-middlewares.yaml - livekit-middlewares.yaml
- matrix-ingress.yaml - matrix-ingress.yaml
configMapGenerator: configMapGenerator:
- name: comms-vault-env - name: comms-vault-env
files: files:
@ -60,21 +56,8 @@ configMapGenerator:
- server.py=scripts/guest-register/server.py - server.py=scripts/guest-register/server.py
options: options:
disableNameSuffixHash: true disableNameSuffixHash: true
- name: atlasbot
files:
- bot.py=scripts/atlasbot/bot.py
options:
disableNameSuffixHash: true
- name: othrys-element-host-config - name: othrys-element-host-config
files: files:
- 20-host-config.sh=scripts/element-host-config.sh - 20-host-config.sh=scripts/element-host-config.sh
options: options:
disableNameSuffixHash: true disableNameSuffixHash: true
- name: atlas-kb
files:
- INDEX.md=knowledge/INDEX.md
- atlas.json=knowledge/catalog/atlas.json
- atlas-summary.json=knowledge/catalog/atlas-summary.json
- metrics.json=knowledge/catalog/metrics.json
- runbooks.json=knowledge/catalog/runbooks.json
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -7,6 +7,7 @@ metadata:
kubernetes.io/ingress.class: traefik kubernetes.io/ingress.class: traefik
traefik.ingress.kubernetes.io/router.entrypoints: websecure traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true" traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.priority: "120"
cert-manager.io/cluster-issuer: letsencrypt cert-manager.io/cluster-issuer: letsencrypt
spec: spec:
ingressClassName: traefik ingressClassName: traefik
@ -43,6 +44,13 @@ spec:
name: matrix-authentication-service name: matrix-authentication-service
port: port:
number: 8080 number: 8080
- path: /_matrix/client/r0/login
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /_matrix/client/v3/logout - path: /_matrix/client/v3/logout
pathType: Exact pathType: Exact
backend: backend:
@ -57,6 +65,41 @@ spec:
name: matrix-authentication-service name: matrix-authentication-service
port: port:
number: 8080 number: 8080
- path: /account
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /authorize
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /oauth2
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /.well-known/openid-configuration
pathType: Exact
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /.well-known/oauth-authorization-server
pathType: Exact
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /_matrix - path: /_matrix
pathType: Prefix pathType: Prefix
backend: backend:
@ -102,6 +145,13 @@ spec:
name: matrix-authentication-service name: matrix-authentication-service
port: port:
number: 8080 number: 8080
- path: /_matrix/client/r0/login
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /_matrix/client/v3/logout - path: /_matrix/client/v3/logout
pathType: Exact pathType: Exact
backend: backend:

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/comms-secrets-ensure-job.yaml # services/comms/oneoffs/comms-secrets-ensure-job.yaml
# One-off job for comms/comms-secrets-ensure-7. # One-off job for comms/comms-secrets-ensure-8.
# Purpose: comms secrets ensure 7 (see container args/env in this file). # Purpose: comms secrets ensure 8 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: comms-secrets-ensure-7 name: comms-secrets-ensure-8
namespace: comms namespace: comms
spec: spec:
suspend: true suspend: true
@ -87,6 +87,9 @@ spec:
ensure_key "comms/synapse-redis" "redis-password" >/dev/null ensure_key "comms/synapse-redis" "redis-password" >/dev/null
ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-quick-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-smart-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-genius-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null
SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")" SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")"

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/mas-local-users-ensure-job.yaml # services/comms/oneoffs/mas-local-users-ensure-job.yaml
# One-off job for comms/mas-local-users-ensure-18. # One-off job for comms/mas-local-users-ensure-19.
# Purpose: mas local users ensure 18 (see container args/env in this file). # Purpose: mas local users ensure 18 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: mas-local-users-ensure-18 name: mas-local-users-ensure-19
namespace: comms namespace: comms
spec: spec:
suspend: true suspend: true
@ -27,6 +27,12 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: | vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}} {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: | vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}} {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -92,7 +98,13 @@ spec:
- name: SEEDER_USER - name: SEEDER_USER
value: othrys-seeder value: othrys-seeder
- name: BOT_USER - name: BOT_USER
value: atlasbot value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
command: command:
- /bin/sh - /bin/sh
- -c - -c
@ -225,11 +237,27 @@ spec:
}, },
timeout=30, timeout=30,
) )
if r.status_code == 429:
return False
if r.status_code != 200: if r.status_code != 200:
raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}") raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
return True
wait_for_service(MAS_ADMIN_API_BASE) wait_for_service(MAS_ADMIN_API_BASE)
token = admin_token() token = admin_token()
bot_quick = os.environ.get("BOT_USER_QUICK", "")
bot_smart = os.environ.get("BOT_USER_SMART", "")
bot_genius = os.environ.get("BOT_USER_GENIUS", "")
bot_quick_pass = os.environ.get("BOT_PASS_QUICK", "")
bot_smart_pass = os.environ.get("BOT_PASS_SMART", "")
bot_genius_pass = os.environ.get("BOT_PASS_GENIUS", "") or bot_smart_pass
ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"]) ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"]) ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
if bot_quick and bot_quick_pass:
ensure_user(token, bot_quick, bot_quick_pass)
if bot_smart and bot_smart_pass:
ensure_user(token, bot_smart, bot_smart_pass)
if bot_genius and bot_genius_pass:
ensure_user(token, bot_genius, bot_genius_pass)
PY PY

View File

@ -1,15 +1,15 @@
# services/comms/oneoffs/synapse-admin-ensure-job.yaml # services/comms/oneoffs/synapse-admin-ensure-job.yaml
# One-off job for comms/synapse-admin-ensure-3. # One-off job for comms/synapse-admin-ensure-15.
# Purpose: synapse admin ensure 3 (see container args/env in this file). # Purpose: synapse admin ensure 15 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: synapse-admin-ensure-3 name: synapse-admin-ensure-15
namespace: comms namespace: comms
spec: spec:
suspend: true suspend: false
backoffLimit: 0 backoffLimit: 0
ttlSecondsAfterFinished: 3600 ttlSecondsAfterFinished: 3600
template: template:
@ -32,7 +32,8 @@ spec:
values: ["arm64"] values: ["arm64"]
containers: containers:
- name: ensure - name: ensure
image: python:3.11-slim image: python:3.12-slim
imagePullPolicy: Always
env: env:
- name: VAULT_ADDR - name: VAULT_ADDR
value: http://vault.vault.svc.cluster.local:8200 value: http://vault.vault.svc.cluster.local:8200
@ -45,22 +46,20 @@ spec:
- -c - -c
- | - |
set -euo pipefail set -euo pipefail
pip install --no-cache-dir psycopg2-binary bcrypt python -m pip install --no-cache-dir psycopg2-binary
python - <<'PY' python - <<'PY'
import json import json
import os import os
import secrets
import string
import time
import urllib.error import urllib.error
import urllib.parse
import urllib.request import urllib.request
import bcrypt
import psycopg2 import psycopg2
VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/") VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets") VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
SYNAPSE_ADMIN_URL = os.environ.get("SYNAPSE_ADMIN_URL", "").rstrip("/")
PGHOST = "postgres-service.postgres.svc.cluster.local" PGHOST = "postgres-service.postgres.svc.cluster.local"
PGPORT = 5432 PGPORT = 5432
PGDATABASE = "synapse" PGDATABASE = "synapse"
@ -113,48 +112,15 @@ spec:
with urllib.request.urlopen(req, timeout=30) as resp: with urllib.request.urlopen(req, timeout=30) as resp:
resp.read() resp.read()
def random_password(length: int = 32) -> str:
alphabet = string.ascii_letters + string.digits
return "".join(secrets.choice(alphabet) for _ in range(length))
def ensure_admin_creds(token: str) -> dict: def ensure_admin_creds(token: str) -> dict:
data = vault_get(token, "comms/synapse-admin") data = vault_get(token, "comms/synapse-admin")
username = (data.get("username") or "").strip() or "synapse-admin" username = "othrys-seeder"
password = (data.get("password") or "").strip() if data.get("username") != username:
if not password:
password = random_password()
data["username"] = username data["username"] = username
data["password"] = password data.pop("access_token", None)
vault_put(token, "comms/synapse-admin", data) vault_put(token, "comms/synapse-admin", data)
return data return data
def ensure_user(cur, cols, user_id, password, admin):
now_ms = int(time.time() * 1000)
values = {
"name": user_id,
"password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
"creation_ts": now_ms,
}
def add_flag(name, flag):
if name not in cols:
return
if cols[name]["type"] in ("smallint", "integer"):
values[name] = int(flag)
else:
values[name] = bool(flag)
add_flag("admin", admin)
add_flag("deactivated", False)
add_flag("shadow_banned", False)
add_flag("is_guest", False)
columns = list(values.keys())
placeholders = ", ".join(["%s"] * len(columns))
updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
cur.execute(query, [values[c] for c in columns])
def get_cols(cur): def get_cols(cur):
cur.execute( cur.execute(
""" """
@ -172,30 +138,40 @@ spec:
} }
return cols return cols
def ensure_access_token(cur, user_id, token_value): def admin_token_valid(token: str, user_id: str) -> bool:
cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens") if not token or not SYNAPSE_ADMIN_URL:
token_id = cur.fetchone()[0] return False
cur.execute( encoded = urllib.parse.quote(user_id, safe="")
""" url = f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v2/users/{encoded}"
INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms) req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
VALUES (%s, %s, %s, %s, NULL) try:
ON CONFLICT (token) DO NOTHING with urllib.request.urlopen(req, timeout=30) as resp:
""", resp.read()
(token_id, user_id, token_value, "ariadne-admin"), return True
) except urllib.error.HTTPError as exc:
if exc.code == 404:
return True
if exc.code in (401, 403):
return False
raise
vault_token = vault_login() vault_token = vault_login()
admin_data = ensure_admin_creds(vault_token) admin_data = ensure_admin_creds(vault_token)
if admin_data.get("access_token"): user_id = f"@{admin_data['username']}:live.bstein.dev"
log("synapse admin token already present") existing_token = admin_data.get("access_token")
if existing_token and admin_token_valid(existing_token, user_id):
log("synapse admin token already present and valid")
raise SystemExit(0) raise SystemExit(0)
if existing_token:
log("synapse admin token invalid; rotating")
admin_data.pop("access_token", None)
vault_put(vault_token, "comms/synapse-admin", admin_data)
synapse_db = vault_get(vault_token, "comms/synapse-db") synapse_db = vault_get(vault_token, "comms/synapse-db")
pg_password = synapse_db.get("POSTGRES_PASSWORD") pg_password = synapse_db.get("POSTGRES_PASSWORD")
if not pg_password: if not pg_password:
raise RuntimeError("synapse db password missing") raise RuntimeError("synapse db password missing")
user_id = f"@{admin_data['username']}:live.bstein.dev"
conn = psycopg2.connect( conn = psycopg2.connect(
host=PGHOST, host=PGHOST,
port=PGPORT, port=PGPORT,
@ -203,17 +179,34 @@ spec:
user=PGUSER, user=PGUSER,
password=pg_password, password=pg_password,
) )
token_value = secrets.token_urlsafe(32)
try: try:
with conn: with conn:
with conn.cursor() as cur: with conn.cursor() as cur:
cols = get_cols(cur) cols = get_cols(cur)
ensure_user(cur, cols, user_id, admin_data["password"], True) if "admin" not in cols:
ensure_access_token(cur, user_id, token_value) raise RuntimeError("users.admin column missing")
cur.execute(
"UPDATE users SET admin = TRUE WHERE name = %s",
(user_id,),
)
cur.execute(
"""
SELECT token FROM access_tokens
WHERE user_id = %s AND valid_until_ms IS NULL
ORDER BY id DESC LIMIT 1
""",
(user_id,),
)
row = cur.fetchone()
if not row:
raise RuntimeError(f"no access token found for {user_id}")
token_value = row[0]
finally: finally:
conn.close() conn.close()
admin_data["access_token"] = token_value admin_data["access_token"] = token_value
vault_put(vault_token, "comms/synapse-admin", admin_data) vault_put(vault_token, "comms/synapse-admin", admin_data)
if not admin_token_valid(token_value, user_id):
raise RuntimeError("synapse admin token validation failed")
log("synapse admin token stored") log("synapse admin token stored")
PY PY

View File

@ -82,8 +82,6 @@ spec:
value: synapse value: synapse
- name: SEEDER_USER - name: SEEDER_USER
value: othrys-seeder value: othrys-seeder
- name: BOT_USER
value: atlasbot
command: command:
- /bin/sh - /bin/sh
- -c - -c
@ -141,10 +139,8 @@ spec:
cur.execute(query, [values[c] for c in columns]) cur.execute(query, [values[c] for c in columns])
seeder_user = os.environ["SEEDER_USER"] seeder_user = os.environ["SEEDER_USER"]
bot_user = os.environ["BOT_USER"]
server = "live.bstein.dev" server = "live.bstein.dev"
seeder_id = f"@{seeder_user}:{server}" seeder_id = f"@{seeder_user}:{server}"
bot_id = f"@{bot_user}:{server}"
conn = psycopg2.connect( conn = psycopg2.connect(
host=os.environ["PGHOST"], host=os.environ["PGHOST"],
@ -158,7 +154,6 @@ spec:
with conn.cursor() as cur: with conn.cursor() as cur:
cols = get_cols(cur) cols = get_cols(cur)
upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True) upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True)
upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False)
finally: finally:
conn.close() conn.close()
PY PY

View File

@ -76,7 +76,7 @@ spec:
- name: SEEDER_USER - name: SEEDER_USER
value: othrys-seeder value: othrys-seeder
- name: BOT_USER - name: BOT_USER
value: atlasbot value: atlas-smart
command: command:
- /bin/sh - /bin/sh
- -c - -c

View File

@ -11,14 +11,21 @@ from urllib import error, parse, request
BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008") BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080") AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
USER = os.environ["BOT_USER"] BOT_USER = os.environ["BOT_USER"]
PASSWORD = os.environ["BOT_PASS"] BOT_PASS = os.environ["BOT_PASS"]
BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
BOT_USER_GENIUS = os.environ.get("BOT_USER_GENIUS", "").strip()
BOT_PASS_GENIUS = os.environ.get("BOT_PASS_GENIUS", "").strip()
ROOM_ALIAS = "#othrys:live.bstein.dev" ROOM_ALIAS = "#othrys:live.bstein.dev"
OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct")
MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "") MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "")
MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "") MODEL_SMART = os.environ.get("ATLASBOT_MODEL_SMART", os.environ.get("ATLASBOT_MODEL_DEEP", "")).strip()
MODEL_GENIUS = os.environ.get("ATLASBOT_MODEL_GENIUS", MODEL_SMART).strip()
FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "")
API_KEY = os.environ.get("CHAT_API_KEY", "") API_KEY = os.environ.get("CHAT_API_KEY", "")
OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
@ -31,7 +38,7 @@ VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitor
ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "") ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "") ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas") BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
@ -39,6 +46,9 @@ MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000")) MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000")) MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000"))
THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
QUICK_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_QUICK_TIME_BUDGET_SEC", "15"))
SMART_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_SMART_TIME_BUDGET_SEC", "45"))
GENIUS_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_GENIUS_TIME_BUDGET_SEC", "180"))
OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2")) OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false" OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"
@ -380,27 +390,104 @@ def _strip_bot_mention(text: str) -> str:
return cleaned or text.strip() return cleaned or text.strip()
def _detect_mode_from_body(body: str, *, default: str = "deep") -> str: def _detect_mode_from_body(body: str, *, default: str = "smart") -> str:
lower = normalize_query(body or "") lower = normalize_query(body or "")
if "atlas_quick" in lower or "atlas-quick" in lower: if "atlas_quick" in lower or "atlas-quick" in lower:
return "fast" return "fast"
if "atlas_smart" in lower or "atlas-smart" in lower: if "atlas_smart" in lower or "atlas-smart" in lower:
return "deep" return "smart"
if "atlas_genius" in lower or "atlas-genius" in lower:
return "genius"
if lower.startswith("quick ") or lower.startswith("fast "): if lower.startswith("quick ") or lower.startswith("fast "):
return "fast" return "fast"
if lower.startswith("smart ") or lower.startswith("deep "): if lower.startswith("smart "):
return "deep" return "smart"
if lower.startswith("genius ") or lower.startswith("deep "):
return "genius"
return default return default
def _detect_mode(
content: dict[str, Any],
body: str,
*,
default: str = "smart",
account_user: str = "",
) -> str:
mode = _detect_mode_from_body(body, default=default)
mentions = content.get("m.mentions", {})
user_ids = mentions.get("user_ids", [])
if isinstance(user_ids, list):
normalized = {normalize_user_id(uid).lower() for uid in user_ids if isinstance(uid, str)}
if BOT_USER_QUICK and normalize_user_id(BOT_USER_QUICK).lower() in normalized:
return "fast"
if BOT_USER_SMART and normalize_user_id(BOT_USER_SMART).lower() in normalized:
return "smart"
if BOT_USER_GENIUS and normalize_user_id(BOT_USER_GENIUS).lower() in normalized:
return "genius"
if BOT_USER and normalize_user_id(BOT_USER).lower() in normalized:
return "smart"
if account_user and BOT_USER_QUICK and normalize_user_id(account_user) == normalize_user_id(BOT_USER_QUICK):
return "fast"
if account_user and BOT_USER_SMART and normalize_user_id(account_user) == normalize_user_id(BOT_USER_SMART):
return "smart"
if account_user and BOT_USER_GENIUS and normalize_user_id(account_user) == normalize_user_id(BOT_USER_GENIUS):
return "genius"
return mode
def _model_for_mode(mode: str) -> str: def _model_for_mode(mode: str) -> str:
if mode == "fast" and MODEL_FAST: if mode == "fast" and MODEL_FAST:
return MODEL_FAST return MODEL_FAST
if mode == "deep" and MODEL_DEEP: if mode == "smart" and MODEL_SMART:
return MODEL_DEEP return MODEL_SMART
if mode == "genius" and MODEL_GENIUS:
return MODEL_GENIUS
if mode == "deep" and MODEL_SMART:
return MODEL_SMART
return MODEL return MODEL
def _normalize_mode(mode: str) -> str:
normalized = (mode or "").strip().lower()
if normalized in {"quick", "fast"}:
return "fast"
if normalized in {"smart"}:
return "smart"
if normalized in {"genius", "deep"}:
return "genius"
return "smart"
def _mode_time_budget_sec(mode: str) -> float:
normalized = _normalize_mode(mode)
if normalized == "fast":
return max(1.0, QUICK_TIME_BUDGET_SEC)
if normalized == "smart":
return max(1.0, SMART_TIME_BUDGET_SEC)
if normalized == "genius":
return max(1.0, GENIUS_TIME_BUDGET_SEC)
return max(1.0, SMART_TIME_BUDGET_SEC)
def _mode_ollama_timeout_sec(mode: str) -> float:
normalized = _normalize_mode(mode)
budget = _mode_time_budget_sec(normalized)
if normalized == "fast":
return max(6.0, min(budget - 2.0, OLLAMA_TIMEOUT_SEC))
if normalized == "smart":
return max(12.0, min(budget - 5.0, OLLAMA_TIMEOUT_SEC))
if normalized == "genius":
return max(20.0, min(budget - 10.0, OLLAMA_TIMEOUT_SEC))
return max(12.0, min(budget - 5.0, OLLAMA_TIMEOUT_SEC))
def _mode_heartbeat_sec(mode: str) -> int:
normalized = _normalize_mode(mode)
budget = _mode_time_budget_sec(normalized)
return max(5, min(THINKING_INTERVAL_SEC, int(max(5.0, budget / 3.0))))
# Matrix HTTP helper. # Matrix HTTP helper.
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
url = (base or BASE) + path url = (base or BASE) + path
@ -416,12 +503,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
raw = resp.read() raw = resp.read()
return json.loads(raw.decode()) if raw else {} return json.loads(raw.decode()) if raw else {}
def login() -> str: def login(user: str, password: str) -> str:
login_user = normalize_user_id(USER) login_user = normalize_user_id(user)
payload = { payload = {
"type": "m.login.password", "type": "m.login.password",
"identifier": {"type": "m.id.user", "user": login_user}, "identifier": {"type": "m.id.user", "user": login_user},
"password": PASSWORD, "password": password,
} }
res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE) res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
return res["access_token"] return res["access_token"]
@ -2628,6 +2715,11 @@ def _append_history_context(context: str, history_lines: list[str]) -> str:
return combined return combined
def _merge_context_blocks(*blocks: str) -> str:
parts = [block.strip() for block in blocks if isinstance(block, str) and block.strip()]
return "\n\n".join(parts)
class ThoughtState: class ThoughtState:
def __init__(self, total_steps: int = 0): def __init__(self, total_steps: int = 0):
self._lock = threading.Lock() self._lock = threading.Lock()
@ -2985,6 +3077,7 @@ def _ollama_call_safe(
fallback: str, fallback: str,
system_override: str | None = None, system_override: str | None = None,
model: str | None = None, model: str | None = None,
timeout: float | None = None,
) -> str: ) -> str:
try: try:
return _ollama_call( return _ollama_call(
@ -2994,6 +3087,7 @@ def _ollama_call_safe(
use_history=False, use_history=False,
system_override=system_override, system_override=system_override,
model=model, model=model,
timeout=timeout,
) )
except Exception: except Exception:
return fallback return fallback
@ -3813,9 +3907,12 @@ def _open_ended_multi(
def _open_ended_total_steps(mode: str) -> int: def _open_ended_total_steps(mode: str) -> int:
if mode == "fast": normalized = _normalize_mode(mode)
if normalized == "fast":
return 2 return 2
return 9 if normalized == "smart":
return 3
return 4
def _fast_fact_lines( def _fast_fact_lines(
@ -4136,6 +4233,7 @@ def _open_ended_fast_single(
prompt: str, prompt: str,
*, *,
context: str, context: str,
fallback_context: str | None = None,
history_lines: list[str] | None = None, history_lines: list[str] | None = None,
state: ThoughtState | None = None, state: ThoughtState | None = None,
model: str, model: str,
@ -4143,24 +4241,26 @@ def _open_ended_fast_single(
if state: if state:
state.update("drafting", step=1, note="summarizing") state.update("drafting", step=1, note="summarizing")
working_context = _append_history_context(context, history_lines or []) if history_lines else context working_context = _append_history_context(context, history_lines or []) if history_lines else context
reply = _ollama_call( reply = _ollama_call_safe(
("atlasbot_fast", "atlasbot_fast"), ("atlasbot_fast", "atlasbot_fast"),
prompt, prompt,
context=working_context, context=working_context,
use_history=False, fallback="",
system_override=_open_ended_system(), system_override=_open_ended_system(),
model=model, model=model,
timeout=_mode_ollama_timeout_sec("fast"),
) )
if not _has_body_lines(reply): if not _has_body_lines(reply):
reply = _ollama_call( reply = _ollama_call_safe(
("atlasbot_fast", "atlasbot_fast"), ("atlasbot_fast", "atlasbot_fast"),
prompt + " Provide one clear sentence before the score lines.", prompt + " Provide one clear sentence before the score lines.",
context=working_context, context=working_context,
use_history=False, fallback="",
system_override=_open_ended_system(), system_override=_open_ended_system(),
model=model, model=model,
timeout=_mode_ollama_timeout_sec("fast"),
) )
fallback = _fallback_fact_answer(prompt, context) fallback = _fallback_fact_answer(prompt, fallback_context or context)
if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)): if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
reply = fallback reply = fallback
if not _has_body_lines(reply): if not _has_body_lines(reply):
@ -4177,6 +4277,7 @@ def _open_ended_fast(
fact_lines: list[str], fact_lines: list[str],
fact_meta: dict[str, dict[str, Any]], fact_meta: dict[str, dict[str, Any]],
history_lines: list[str], history_lines: list[str],
extra_context: str = "",
state: ThoughtState | None = None, state: ThoughtState | None = None,
) -> str: ) -> str:
model = _model_for_mode("fast") model = _model_for_mode("fast")
@ -4197,6 +4298,7 @@ def _open_ended_fast(
selected_pack = _fact_pack_text(selected_lines, selected_meta) selected_pack = _fact_pack_text(selected_lines, selected_meta)
if _needs_full_fact_pack(prompt) or not selected_lines: if _needs_full_fact_pack(prompt) or not selected_lines:
selected_pack = fact_pack selected_pack = fact_pack
model_context = _merge_context_blocks(selected_pack, extra_context)
if not subjective and _needs_full_fact_pack(prompt): if not subjective and _needs_full_fact_pack(prompt):
fallback = _fallback_fact_answer(prompt, fact_pack) fallback = _fallback_fact_answer(prompt, fact_pack)
if fallback: if fallback:
@ -4205,7 +4307,8 @@ def _open_ended_fast(
state.total_steps = _open_ended_total_steps("fast") state.total_steps = _open_ended_total_steps("fast")
return _open_ended_fast_single( return _open_ended_fast_single(
prompt, prompt,
context=selected_pack, context=model_context,
fallback_context=selected_pack,
history_lines=history_lines, history_lines=history_lines,
state=state, state=state,
model=model, model=model,
@ -4219,16 +4322,55 @@ def _open_ended_deep(
fact_lines: list[str], fact_lines: list[str],
fact_meta: dict[str, dict[str, Any]], fact_meta: dict[str, dict[str, Any]],
history_lines: list[str], history_lines: list[str],
mode: str,
extra_context: str = "",
state: ThoughtState | None = None, state: ThoughtState | None = None,
) -> str: ) -> str:
return _open_ended_multi( normalized = _normalize_mode(mode)
prompt, model = _model_for_mode(normalized)
fact_pack=fact_pack, subjective = _is_subjective_query(prompt)
fact_lines=fact_lines, primary_tags = _primary_tags_for_prompt(prompt)
fact_meta=fact_meta, focus_tags = _preferred_tags_for_prompt(prompt)
history_lines=history_lines, if not focus_tags and subjective:
state=state, focus_tags = set(_ALLOWED_INSIGHT_TAGS)
avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set()
limit = 12 if normalized == "smart" else 18
selected_lines = _fast_fact_lines(
fact_lines,
fact_meta,
focus_tags=focus_tags,
avoid_tags=avoid_tags,
primary_tags=primary_tags,
limit=limit,
) )
selected_meta = _fact_pack_meta(selected_lines)
selected_pack = _fact_pack_text(selected_lines, selected_meta)
if _needs_full_fact_pack(prompt) or not selected_lines or normalized == "genius":
selected_pack = fact_pack
fallback = _fallback_fact_answer(prompt, selected_pack)
model_context = _merge_context_blocks(selected_pack, extra_context)
if not subjective and fallback:
if state:
state.update("done", step=_open_ended_total_steps(normalized))
return _ensure_scores(fallback)
if state:
state.update("drafting", step=1, note="synthesizing")
reply = _ollama_call_safe(
("atlasbot_deep", "atlasbot_deep"),
prompt,
context=_append_history_context(model_context, history_lines),
fallback="",
system_override=_open_ended_system(),
model=model,
timeout=_mode_ollama_timeout_sec(normalized),
)
if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
reply = fallback
if not _has_body_lines(reply):
reply = "I don't have enough data in the current snapshot to answer that."
if state:
state.update("done", step=_open_ended_total_steps(normalized))
return _ensure_scores(reply)
def open_ended_answer( def open_ended_answer(
@ -4240,6 +4382,7 @@ def open_ended_answer(
history_lines: list[str], history_lines: list[str],
mode: str, mode: str,
allow_tools: bool, allow_tools: bool,
context: str = "",
state: ThoughtState | None = None, state: ThoughtState | None = None,
) -> str: ) -> str:
lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
@ -4256,13 +4399,15 @@ def open_ended_answer(
return _ensure_scores("I don't have enough data to answer that.") return _ensure_scores("I don't have enough data to answer that.")
fact_meta = _fact_pack_meta(lines) fact_meta = _fact_pack_meta(lines)
fact_pack = _fact_pack_text(lines, fact_meta) fact_pack = _fact_pack_text(lines, fact_meta)
if mode == "fast": normalized = _normalize_mode(mode)
if normalized == "fast":
return _open_ended_fast( return _open_ended_fast(
prompt, prompt,
fact_pack=fact_pack, fact_pack=fact_pack,
fact_lines=lines, fact_lines=lines,
fact_meta=fact_meta, fact_meta=fact_meta,
history_lines=history_lines, history_lines=history_lines,
extra_context=context,
state=state, state=state,
) )
return _open_ended_deep( return _open_ended_deep(
@ -4271,6 +4416,8 @@ def open_ended_answer(
fact_lines=lines, fact_lines=lines,
fact_meta=fact_meta, fact_meta=fact_meta,
history_lines=history_lines, history_lines=history_lines,
extra_context=context,
mode=normalized,
state=state, state=state,
) )
@ -4292,6 +4439,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s
use_history=False, use_history=False,
system_override=system, system_override=system,
model=model, model=model,
timeout=_mode_ollama_timeout_sec(mode),
) )
reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip() reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip()
return _ensure_scores(reply) return _ensure_scores(reply)
@ -4343,13 +4491,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
self._write_json(400, {"error": "missing_prompt"}) self._write_json(400, {"error": "missing_prompt"})
return return
cleaned = _strip_bot_mention(prompt) cleaned = _strip_bot_mention(prompt)
mode = str(payload.get("mode") or "deep").lower() mode = _normalize_mode(str(payload.get("mode") or "smart"))
if mode in ("quick", "fast"):
mode = "fast"
elif mode in ("smart", "deep"):
mode = "deep"
else:
mode = "deep"
snapshot = _snapshot_state() snapshot = _snapshot_state()
inventory = _snapshot_inventory(snapshot) or node_inventory_live() inventory = _snapshot_inventory(snapshot) or node_inventory_live()
workloads = _snapshot_workloads(snapshot) workloads = _snapshot_workloads(snapshot)
@ -4386,6 +4528,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
history_lines=history_lines, history_lines=history_lines,
mode=mode, mode=mode,
allow_tools=True, allow_tools=True,
context=context,
state=None, state=None,
) )
else: else:
@ -4640,6 +4783,7 @@ def _ollama_call(
use_history: bool = True, use_history: bool = True,
system_override: str | None = None, system_override: str | None = None,
model: str | None = None, model: str | None = None,
timeout: float | None = None,
) -> str: ) -> str:
system = system_override or ( system = system_override or (
"System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
@ -4673,6 +4817,7 @@ def _ollama_call(
messages.append({"role": "user", "content": prompt}) messages.append({"role": "user", "content": prompt})
model_name = model or MODEL model_name = model or MODEL
request_timeout = timeout if timeout is not None else OLLAMA_TIMEOUT_SEC
payload = {"model": model_name, "messages": messages, "stream": False} payload = {"model": model_name, "messages": messages, "stream": False}
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
if API_KEY: if API_KEY:
@ -4683,13 +4828,13 @@ def _ollama_call(
lock.acquire() lock.acquire()
try: try:
try: try:
with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: with request.urlopen(r, timeout=request_timeout) as resp:
data = json.loads(resp.read().decode()) data = json.loads(resp.read().decode())
except error.HTTPError as exc: except error.HTTPError as exc:
if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]: if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]:
payload["model"] = FALLBACK_MODEL payload["model"] = FALLBACK_MODEL
r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: with request.urlopen(r, timeout=request_timeout) as resp:
data = json.loads(resp.read().decode()) data = json.loads(resp.read().decode())
else: else:
raise raise
@ -4714,6 +4859,7 @@ def ollama_reply(
fallback: str = "", fallback: str = "",
use_history: bool = True, use_history: bool = True,
model: str | None = None, model: str | None = None,
timeout: float | None = None,
) -> str: ) -> str:
last_error = None last_error = None
for attempt in range(max(1, OLLAMA_RETRIES + 1)): for attempt in range(max(1, OLLAMA_RETRIES + 1)):
@ -4724,6 +4870,7 @@ def ollama_reply(
context=context, context=context,
use_history=use_history, use_history=use_history,
model=model, model=model,
timeout=timeout,
) )
except Exception as exc: # noqa: BLE001 except Exception as exc: # noqa: BLE001
last_error = exc last_error = exc
@ -4744,11 +4891,13 @@ def ollama_reply_with_thinking(
fallback: str, fallback: str,
use_history: bool = True, use_history: bool = True,
model: str | None = None, model: str | None = None,
timeout: float | None = None,
) -> str: ) -> str:
result: dict[str, str] = {"reply": ""} result: dict[str, str] = {"reply": ""}
done = threading.Event() done = threading.Event()
def worker(): def worker():
try:
result["reply"] = ollama_reply( result["reply"] = ollama_reply(
hist_key, hist_key,
prompt, prompt,
@ -4756,7 +4905,9 @@ def ollama_reply_with_thinking(
fallback=fallback, fallback=fallback,
use_history=use_history, use_history=use_history,
model=model, model=model,
timeout=timeout,
) )
finally:
done.set() done.set()
thread = threading.Thread(target=worker, daemon=True) thread = threading.Thread(target=worker, daemon=True)
@ -4789,6 +4940,7 @@ def open_ended_with_thinking(
history_lines: list[str], history_lines: list[str],
mode: str, mode: str,
allow_tools: bool, allow_tools: bool,
context: str = "",
) -> str: ) -> str:
result: dict[str, str] = {"reply": ""} result: dict[str, str] = {"reply": ""}
done = threading.Event() done = threading.Event()
@ -4796,6 +4948,7 @@ def open_ended_with_thinking(
state = ThoughtState(total_steps=total_steps) state = ThoughtState(total_steps=total_steps)
def worker(): def worker():
try:
result["reply"] = open_ended_answer( result["reply"] = open_ended_answer(
prompt, prompt,
inventory=inventory, inventory=inventory,
@ -4804,15 +4957,17 @@ def open_ended_with_thinking(
history_lines=history_lines, history_lines=history_lines,
mode=mode, mode=mode,
allow_tools=allow_tools, allow_tools=allow_tools,
context=context,
state=state, state=state,
) )
finally:
done.set() done.set()
thread = threading.Thread(target=worker, daemon=True) thread = threading.Thread(target=worker, daemon=True)
thread.start() thread.start()
if not done.wait(2.0): if not done.wait(2.0):
send_msg(token, room, "Thinking…") send_msg(token, room, "Thinking…")
heartbeat = max(10, THINKING_INTERVAL_SEC) heartbeat = _mode_heartbeat_sec(mode)
next_heartbeat = time.monotonic() + heartbeat next_heartbeat = time.monotonic() + heartbeat
while not done.wait(max(0, next_heartbeat - time.monotonic())): while not done.wait(max(0, next_heartbeat - time.monotonic())):
send_msg(token, room, state.status_line()) send_msg(token, room, state.status_line())
@ -4820,7 +4975,7 @@ def open_ended_with_thinking(
thread.join(timeout=1) thread.join(timeout=1)
return result["reply"] or "Model backend is busy. Try again in a moment." return result["reply"] or "Model backend is busy. Try again in a moment."
def sync_loop(token: str, room_id: str): def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
since = None since = None
try: try:
res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10) res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
@ -4861,7 +5016,7 @@ def sync_loop(token: str, room_id: str):
if not body: if not body:
continue continue
sender = ev.get("sender", "") sender = ev.get("sender", "")
if sender == f"@{USER}:live.bstein.dev": if account_user and sender == normalize_user_id(account_user):
continue continue
mentioned = is_mentioned(content, body) mentioned = is_mentioned(content, body)
@ -4874,7 +5029,12 @@ def sync_loop(token: str, room_id: str):
cleaned_body = _strip_bot_mention(body) cleaned_body = _strip_bot_mention(body)
lower_body = cleaned_body.lower() lower_body = cleaned_body.lower()
mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep") mode = _detect_mode(
content,
body,
default=_normalize_mode(default_mode),
account_user=account_user,
)
# Only do live cluster introspection in DMs. # Only do live cluster introspection in DMs.
allow_tools = is_dm allow_tools = is_dm
@ -4938,39 +5098,81 @@ def sync_loop(token: str, room_id: str):
snapshot=snapshot, snapshot=snapshot,
workloads=workloads, workloads=workloads,
history_lines=history[hist_key], history_lines=history[hist_key],
mode=mode if mode in ("fast", "deep") else "deep", mode=_normalize_mode(mode),
allow_tools=allow_tools, allow_tools=allow_tools,
context=context,
) )
else: else:
reply = _non_cluster_reply( reply = _non_cluster_reply(
cleaned_body, cleaned_body,
history_lines=history[hist_key], history_lines=history[hist_key],
mode=mode if mode in ("fast", "deep") else "deep", mode=_normalize_mode(mode),
) )
send_msg(token, rid, reply) send_msg(token, rid, reply)
history[hist_key].append(f"Atlas: {reply}") history[hist_key].append(f"Atlas: {reply}")
history[hist_key] = history[hist_key][-80:] history[hist_key] = history[hist_key][-80:]
def login_with_retry(): def login_with_retry(user: str, password: str):
last_err = None last_err = None
for attempt in range(10): for attempt in range(10):
try: try:
return login() return login(user, password)
except Exception as exc: # noqa: BLE001 except Exception as exc: # noqa: BLE001
last_err = exc last_err = exc
time.sleep(min(30, 2 ** attempt)) time.sleep(min(30, 2 ** attempt))
raise last_err raise last_err
def _bot_accounts() -> list[dict[str, str]]:
accounts: list[dict[str, str]] = []
def add(user: str, password: str, mode: str):
if not user or not password:
return
accounts.append({"user": user, "password": password, "mode": mode})
add(BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "smart")
if BOT_USER_QUICK and BOT_PASS_QUICK:
add(BOT_USER_QUICK, BOT_PASS_QUICK, "fast")
if BOT_USER_GENIUS and BOT_PASS_GENIUS:
add(BOT_USER_GENIUS, BOT_PASS_GENIUS, "genius")
if BOT_USER and BOT_PASS and all(acc["user"] != BOT_USER for acc in accounts):
add(BOT_USER, BOT_PASS, "smart")
seen: set[str] = set()
unique: list[dict[str, str]] = []
for acc in accounts:
uid = normalize_user_id(acc["user"]).lower()
if uid in seen:
continue
seen.add(uid)
unique.append(acc)
return unique
def main(): def main():
load_kb() load_kb()
_start_http_server() _start_http_server()
token = login_with_retry() accounts = _bot_accounts()
threads: list[threading.Thread] = []
for acc in accounts:
token = login_with_retry(acc["user"], acc["password"])
try: try:
room_id = resolve_alias(token, ROOM_ALIAS) room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id) join_room(token, room_id)
except Exception: except Exception:
room_id = None room_id = None
sync_loop(token, room_id) thread = threading.Thread(
target=sync_loop,
args=(token, room_id),
kwargs={
"account_user": acc["user"],
"default_mode": acc["mode"],
},
daemon=True,
)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -7,6 +7,14 @@ read_secret() {
tr -d '\r\n' < "${vault_dir}/$1" tr -d '\r\n' < "${vault_dir}/$1"
} }
read_optional() {
if [ -f "${vault_dir}/$1" ]; then
tr -d '\r\n' < "${vault_dir}/$1"
else
printf ''
fi
}
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)" export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}" export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
@ -14,6 +22,15 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}" export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
export BOT_PASS="$(read_secret bot-pass)" export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)" export SEEDER_PASS="$(read_secret seeder-pass)"
export CHAT_API_KEY="$(read_secret chat-matrix)" export CHAT_API_KEY="$(read_secret chat-matrix)"

View File

@ -0,0 +1,164 @@
from __future__ import annotations
import importlib.util
import os
from pathlib import Path
from unittest import TestCase, mock
BOT_PATH = Path(__file__).resolve().parents[1] / "atlasbot" / "bot.py"
def load_bot_module():
env = {
"BOT_USER": "atlas-smart",
"BOT_PASS": "smart-pass",
"BOT_USER_QUICK": "atlas-quick",
"BOT_PASS_QUICK": "quick-pass",
"BOT_USER_SMART": "atlas-smart",
"BOT_PASS_SMART": "smart-pass",
"BOT_USER_GENIUS": "atlas-genius",
"BOT_PASS_GENIUS": "genius-pass",
"OLLAMA_URL": "http://ollama.invalid",
"OLLAMA_MODEL": "base-model",
"ATLASBOT_MODEL_FAST": "fast-model",
"ATLASBOT_MODEL_SMART": "smart-model",
"ATLASBOT_MODEL_GENIUS": "genius-model",
"ATLASBOT_QUICK_TIME_BUDGET_SEC": "15",
"ATLASBOT_SMART_TIME_BUDGET_SEC": "45",
"ATLASBOT_GENIUS_TIME_BUDGET_SEC": "180",
"KB_DIR": "",
"VM_URL": "http://vm.invalid",
"ARIADNE_STATE_URL": "",
"ARIADNE_STATE_TOKEN": "",
}
with mock.patch.dict(os.environ, env, clear=False):
spec = importlib.util.spec_from_file_location("atlasbot_bot", BOT_PATH)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
class AtlasbotModeTests(TestCase):
    """Tests for atlasbot's fast/smart/genius mode wiring.

    Each test re-imports bot.py via load_bot_module() so module-level state
    derived from environment variables is fresh and deterministic.
    """

    def setUp(self):
        # Fresh module per test; env is patched during import by the helper.
        self.bot = load_bot_module()

    def test_bot_accounts_include_genius_mode(self):
        # Every configured Matrix account must map to its reasoning mode.
        accounts = self.bot._bot_accounts()
        by_user = {account["user"]: account["mode"] for account in accounts}
        self.assertEqual(by_user["atlas-quick"], "fast")
        self.assertEqual(by_user["atlas-smart"], "smart")
        self.assertEqual(by_user["atlas-genius"], "genius")

    def test_objective_cluster_question_uses_fact_pack_without_llm(self):
        # An objective metric question must be answered from the fact pack
        # alone; _ollama_call raising AssertionError proves the LLM path is
        # never taken.
        fact_lines = [
            "hottest_cpu: longhorn-system (6.69)",
            "hottest_ram: longhorn-system (36.05 GB)",
        ]
        with (
            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
            mock.patch.object(self.bot, "_ollama_call", side_effect=AssertionError("LLM should not be called")),
        ):
            reply = self.bot.open_ended_answer(
                "what is the hottest cpu node in titan lab currently?",
                inventory=[],
                snapshot=None,
                workloads=[],
                history_lines=[],
                mode="smart",
                allow_tools=True,
            )
        self.assertIn("longhorn-system", reply)
        self.assertIn("Confidence:", reply)

    def test_subjective_genius_answer_uses_genius_model(self):
        # Subjective questions in genius mode must route to the genius model,
        # stay within the 180s budget, and pass the injected context through.
        fact_lines = [
            "hottest_cpu: longhorn-system (6.69)",
            "worker_nodes: titan-01, titan-02, titan-03",
        ]
        captured: dict[str, object] = {}

        def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
            # Record the call parameters so the assertions below can inspect
            # which model/timeout/context the bot selected.
            captured["model"] = model
            captured["timeout"] = timeout
            captured["context"] = context
            return "The worker spread stands out because Titan keeps meaningful capacity on the same cluster. Confidence: high"

        with (
            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
            mock.patch.object(self.bot, "_ollama_call", side_effect=fake_ollama_call),
        ):
            reply = self.bot.open_ended_answer(
                "what stands out about titan lab?",
                inventory=[],
                snapshot=None,
                workloads=[],
                history_lines=[],
                mode="genius",
                allow_tools=True,
                context='Cluster snapshot (JSON): {"injected":true}',
            )
        self.assertIn("The worker spread stands out", reply)
        self.assertEqual(captured["model"], "genius-model")
        self.assertLessEqual(float(captured["timeout"]), 180.0)
        self.assertIn('Cluster snapshot (JSON): {"injected":true}', str(captured["context"]))

    def test_mode_timeouts_stay_within_budgets(self):
        # All three modes must use their configured model and respect the
        # per-mode time budgets patched in by load_bot_module().
        fact_lines = [
            "hottest_cpu: longhorn-system (6.69)",
            "worker_nodes: titan-01, titan-02, titan-03",
        ]
        seen: list[tuple[str, float]] = []

        def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
            # Collect (model, timeout) per invocation, in call order.
            seen.append((str(model), float(timeout or 0)))
            return "Atlas has a clear standout because the worker spread is healthy. Confidence: high"

        with (
            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
            mock.patch.object(self.bot, "_ollama_call", side_effect=fake_ollama_call),
        ):
            for mode in ("fast", "smart", "genius"):
                reply = self.bot.open_ended_answer(
                    "what stands out about titan lab?",
                    inventory=[],
                    snapshot=None,
                    workloads=[],
                    history_lines=[],
                    mode=mode,
                    allow_tools=True,
                )
                self.assertIn("Confidence:", reply)
        self.assertEqual([model for model, _ in seen], ["fast-model", "smart-model", "genius-model"])
        self.assertLessEqual(seen[0][1], 15.0)
        self.assertLessEqual(seen[1][1], 45.0)
        self.assertLessEqual(seen[2][1], 180.0)

    def test_llm_timeout_still_returns_a_conclusion(self):
        # Even when the LLM call times out, the bot must fall back to a
        # fact-based answer that still carries a confidence marker.
        fact_lines = [
            "worker_nodes: titan-01, titan-02, titan-03",
            "hottest_cpu: longhorn-system (6.69)",
        ]
        with (
            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
            mock.patch.object(self.bot, "_ollama_call", side_effect=TimeoutError("simulated timeout")),
        ):
            reply = self.bot.open_ended_answer(
                "what stands out about the worker nodes?",
                inventory=[],
                snapshot=None,
                workloads=[],
                history_lines=[],
                mode="genius",
                allow_tools=True,
            )
        self.assertIn("worker nodes", reply.lower())
        self.assertIn("Confidence:", reply)

View File

@ -66,7 +66,7 @@ spec:
- name: SEEDER_USER - name: SEEDER_USER
value: othrys-seeder value: othrys-seeder
- name: BOT_USER - name: BOT_USER
value: atlasbot value: atlas-smart
command: command:
- /bin/sh - /bin/sh
- -c - -c

View File

@ -29,12 +29,18 @@ spec:
operator: In operator: In
values: ["rpi4","rpi5"] values: ["rpi4","rpi5"]
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
- weight: 50 - weight: 80
preference: preference:
matchExpressions: matchExpressions:
- key: hardware - key: hardware
operator: In operator: In
values: ["rpi4"] values: ["rpi5"]
- weight: 60
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
containers: containers:
- name: monerod - name: monerod
image: registry.bstein.dev/crypto/monerod:0.18.4.1 image: registry.bstein.dev/crypto/monerod:0.18.4.1

View File

@ -23,7 +23,7 @@ spec:
- matchExpressions: - matchExpressions:
- key: hardware - key: hardware
operator: In operator: In
values: ["rpi4","rpi5"] values: ["rpi5"]
containers: containers:
- name: xmrig - name: xmrig
image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9

View File

@ -123,13 +123,22 @@ spec:
- key: hardware - key: hardware
operator: In operator: In
values: ["rpi4","rpi5"] values: ["rpi4","rpi5"]
- key: longhorn
operator: NotIn
values: ["true"]
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13","titan-15","titan-17","titan-19"]
- weight: 50 - weight: 50
preference: preference:
matchExpressions: matchExpressions:
- key: hardware - key: hardware
operator: In operator: In
values: ["rpi4"] values: ["rpi5"]
containers: containers:
- name: gitea - name: gitea
image: gitea/gitea:1.23 image: gitea/gitea:1.23

View File

@ -245,6 +245,17 @@ spec:
image: image:
repository: registry.bstein.dev/infra/harbor-registry repository: registry.bstein.dev/infra/harbor-registry
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"} tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
extraEnvVars:
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
controller: controller:
image: image:
repository: registry.bstein.dev/infra/harbor-registryctl repository: registry.bstein.dev/infra/harbor-registryctl
@ -263,6 +274,10 @@ spec:
export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}" export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}" export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
{{ end }} {{ end }}
{{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
{{ end }}
vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry" vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: | vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }} {{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -397,10 +412,10 @@ spec:
patch: |- patch: |-
- op: replace - op: replace
path: /spec/rules/0/http/paths/2/backend/service/name path: /spec/rules/0/http/paths/2/backend/service/name
value: harbor-registry value: harbor-core
- op: replace - op: replace
path: /spec/rules/0/http/paths/2/backend/service/port/number path: /spec/rules/0/http/paths/2/backend/service/port/number
value: 5000 value: 80
- target: - target:
kind: Deployment kind: Deployment
name: harbor-jobservice name: harbor-jobservice
@ -422,8 +437,7 @@ spec:
- $patch: replace - $patch: replace
- name: VAULT_ENV_FILE - name: VAULT_ENV_FILE
value: /vault/secrets/harbor-jobservice-env.sh value: /vault/secrets/harbor-jobservice-env.sh
envFrom: envFrom: []
- $patch: replace
- configMapRef: - configMapRef:
name: harbor-jobservice-env name: harbor-jobservice-env
volumeMounts: volumeMounts:
@ -464,6 +478,16 @@ spec:
value: /vault/secrets/harbor-registry-env.sh value: /vault/secrets/harbor-registry-env.sh
- name: VAULT_COPY_FILES - name: VAULT_COPY_FILES
value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
envFrom: envFrom:
- $patch: replace - $patch: replace
volumeMounts: volumeMounts:

View File

@ -67,7 +67,7 @@ data:
url('https://scm.bstein.dev/bstein/harbor-arm-build.git') url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
credentials('gitea-pat') credentials('gitea-pat')
} }
branches('*/master') branches('*/main')
} }
} }
} }
@ -108,7 +108,7 @@ data:
url('https://scm.bstein.dev/bstein/ci-demo.git') url('https://scm.bstein.dev/bstein/ci-demo.git')
credentials('gitea-pat') credentials('gitea-pat')
} }
branches('*/master') branches('*/main')
} }
} }
scriptPath('Jenkinsfile') scriptPath('Jenkinsfile')
@ -167,6 +167,110 @@ data:
} }
} }
} }
// CI pipeline for the metis repository.
// FIX: this block previously declared pipelineJob('metis') twice (once with
// an H/2 poll spec, once with H/5). The Job DSL plugin refuses to generate
// the same job name twice in one seed run, so the duplicate is collapsed
// into a single definition keeping the tighter H/2 polling cadence.
pipelineJob('metis') {
  properties {
    pipelineTriggers {
      triggers {
        // Poll SCM roughly every 2 minutes; 'H' spreads load across jobs.
        scmTrigger {
          scmpoll_spec('H/2 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/metis.git')
            credentials('gitea-pat')
          }
          // NOTE(review): sibling jobs (atlasbot, Soteria) track */main —
          // confirm whether the metis repo still uses a master branch.
          branches('*/master')
        }
      }
      scriptPath('Jenkinsfile')
    }
  }
}
// CI pipeline for the atlasbot repository.
pipelineJob('atlasbot') {
  properties {
    pipelineTriggers {
      triggers {
        // Poll SCM roughly every 2 minutes; 'H' spreads load across jobs.
        scmTrigger {
          scmpoll_spec('H/2 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/atlasbot.git')
            credentials('gitea-pat')
          }
          branches('*/main')
        }
      }
      // Pipeline definition lives in the repo root Jenkinsfile.
      scriptPath('Jenkinsfile')
    }
  }
}
// CI pipeline for the soteria repository.
// NOTE(review): job name 'Soteria' is capitalized unlike the other jobs
// (metis, atlasbot, data-prepper); renaming would drop build history, so
// the inconsistency is only flagged here.
pipelineJob('Soteria') {
  properties {
    pipelineTriggers {
      triggers {
        // Poll SCM roughly every 5 minutes.
        scmTrigger {
          scmpoll_spec('H/5 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/soteria.git')
            credentials('gitea-pat')
          }
          branches('*/main')
        }
      }
      // Pipeline definition lives in the repo root Jenkinsfile.
      scriptPath('Jenkinsfile')
    }
  }
}
pipelineJob('data-prepper') { pipelineJob('data-prepper') {
properties { properties {
pipelineTriggers { pipelineTriggers {

View File

@ -48,7 +48,7 @@ spec:
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }} GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
{{ end }} {{ end }}
bstein.dev/restarted-at: "2026-01-20T14:52:41Z" bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
spec: spec:
serviceAccountName: jenkins serviceAccountName: jenkins
nodeSelector: nodeSelector:

View File

@ -0,0 +1,13 @@
# services/jenkins/dind-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jenkins-dind-cache
namespace: jenkins
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 30Gi
storageClassName: astreae

View File

@ -8,6 +8,7 @@ resources:
- vault-serviceaccount.yaml - vault-serviceaccount.yaml
- pvc.yaml - pvc.yaml
- cache-pvc.yaml - cache-pvc.yaml
- dind-pvc.yaml
- plugins-pvc.yaml - plugins-pvc.yaml
- configmap-jcasc.yaml - configmap-jcasc.yaml
- configmap-plugins.yaml - configmap-plugins.yaml

View File

@ -1,12 +1,12 @@
# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml # services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14. # One-off job for sso/keycloak-portal-e2e-execute-actions-email-18.
# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file). # Purpose: keycloak portal e2e execute actions email 18 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: keycloak-portal-e2e-execute-actions-email-14 name: keycloak-portal-e2e-execute-actions-email-18
namespace: sso namespace: sso
spec: spec:
suspend: true suspend: true
@ -70,7 +70,7 @@ spec:
- name: E2E_PROBE_USERNAME - name: E2E_PROBE_USERNAME
value: robotuser value: robotuser
- name: E2E_PROBE_EMAIL - name: E2E_PROBE_EMAIL
value: robotuser@bstein.dev value: brad.stein+robot@gmail.com
- name: EXECUTE_ACTIONS_CLIENT_ID - name: EXECUTE_ACTIONS_CLIENT_ID
value: bstein-dev-home value: bstein-dev-home
- name: EXECUTE_ACTIONS_REDIRECT_URI - name: EXECUTE_ACTIONS_REDIRECT_URI

View File

@ -1,12 +1,12 @@
# services/keycloak/oneoffs/realm-settings-job.yaml # services/keycloak/oneoffs/realm-settings-job.yaml
# One-off job for sso/keycloak-realm-settings-36. # One-off job for sso/keycloak-realm-settings-38.
# Purpose: keycloak realm settings 36 (see container args/env in this file). # Purpose: keycloak realm settings 38 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: keycloak-realm-settings-36 name: keycloak-realm-settings-38
namespace: sso namespace: sso
spec: spec:
suspend: true suspend: true
@ -64,7 +64,7 @@ spec:
- name: KEYCLOAK_REALM - name: KEYCLOAK_REALM
value: atlas value: atlas
- name: KEYCLOAK_SMTP_HOST - name: KEYCLOAK_SMTP_HOST
value: mail.bstein.dev value: smtp.postmarkapp.com
- name: KEYCLOAK_SMTP_PORT - name: KEYCLOAK_SMTP_PORT
value: "587" value: "587"
- name: KEYCLOAK_SMTP_FROM - name: KEYCLOAK_SMTP_FROM

View File

@ -18,6 +18,7 @@ spec:
prometheus.io/scrape: "true" prometheus.io/scrape: "true"
prometheus.io/port: "8080" prometheus.io/port: "8080"
prometheus.io/path: "/metrics" prometheus.io/path: "/metrics"
maintenance.bstein.dev/restart-rev: "20260207-2"
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "maintenance" vault.hashicorp.com/role: "maintenance"
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
@ -105,7 +106,7 @@ spec:
node-role.kubernetes.io/worker: "true" node-role.kubernetes.io/worker: "true"
containers: containers:
- name: ariadne - name: ariadne
image: registry.bstein.dev/bstein/ariadne:0.1.0-0 image: registry.bstein.dev/bstein/ariadne:latest
imagePullPolicy: Always imagePullPolicy: Always
command: ["/bin/sh", "-c"] command: ["/bin/sh", "-c"]
args: args:
@ -285,7 +286,7 @@ spec:
- name: ARIADNE_SCHEDULE_MAILU_SYNC - name: ARIADNE_SCHEDULE_MAILU_SYNC
value: "30 4 * * *" value: "30 4 * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
value: "0 5 * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
value: "*/5 * * * *" value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@ -293,23 +294,23 @@ spec:
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
value: "0 * * * *" value: "0 * * * *"
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
value: "0 5 * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_WGER_ADMIN - name: ARIADNE_SCHEDULE_WGER_ADMIN
value: "15 3 * * *" value: "15 3 * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
value: "0 6 * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_CRON - name: ARIADNE_SCHEDULE_FIREFLY_CRON
value: "0 3 * * *" value: "0 3 * * *"
- name: ARIADNE_SCHEDULE_POD_CLEANER - name: ARIADNE_SCHEDULE_POD_CLEANER
value: "0 * * * *" value: "*/30 * * * *"
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
value: "23 3 * * *" value: "23 3 * * *"
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
value: "30 4 * * 0" value: "0 */4 * * *"
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
value: "0 * * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_VAULT_OIDC - name: ARIADNE_SCHEDULE_VAULT_OIDC
value: "0 * * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
value: "*/5 * * * *" value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@ -319,9 +320,9 @@ spec:
- name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
value: "*/10 * * * *" value: "*/10 * * * *"
- name: ARIADNE_SCHEDULE_CLUSTER_STATE - name: ARIADNE_SCHEDULE_CLUSTER_STATE
value: "*/15 * * * *" value: "*/10 * * * *"
- name: ARIADNE_CLUSTER_STATE_KEEP - name: ARIADNE_CLUSTER_STATE_KEEP
value: "168" value: "720"
- name: WELCOME_EMAIL_ENABLED - name: WELCOME_EMAIL_ENABLED
value: "true" value: "true"
- name: K8S_API_TIMEOUT_SEC - name: K8S_API_TIMEOUT_SEC
@ -330,12 +331,20 @@ spec:
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
value: "5" value: "5"
- name: ARIADNE_ALERTMANAGER_URL
value: http://alertmanager.monitoring.svc.cluster.local
- name: OPENSEARCH_URL - name: OPENSEARCH_URL
value: http://opensearch-master.logging.svc.cluster.local:9200 value: http://opensearch-master.logging.svc.cluster.local:9200
- name: OPENSEARCH_LIMIT_BYTES - name: OPENSEARCH_LIMIT_BYTES
value: "1099511627776" value: "1099511627776"
- name: OPENSEARCH_INDEX_PATTERNS - name: OPENSEARCH_INDEX_PATTERNS
value: kube-*,journald-*,trace-analytics-* value: kube-*,journald-*,trace-analytics-*
- name: METIS_BASE_URL
value: http://metis.maintenance.svc.cluster.local
- name: METIS_TIMEOUT_SEC
value: "15"
- name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH
value: "*/30 * * * *"
- name: METRICS_PATH - name: METRICS_PATH
value: "/metrics" value: "/metrics"
resources: resources:

View File

@ -29,6 +29,29 @@ rules:
- get - get
- list - list
- watch - watch
- apiGroups: ["apps"]
resources:
- deployments
- statefulsets
- daemonsets
verbs:
- get
- list
- watch
- apiGroups: ["longhorn.io"]
resources:
- volumes
verbs:
- get
- list
- watch
- apiGroups: [""]
resources:
- events
verbs:
- get
- list
- watch
- apiGroups: [""] - apiGroups: [""]
resources: resources:
- pods/exec - pods/exec
@ -56,3 +79,17 @@ roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: ClusterRole kind: ClusterRole
name: ariadne-job-spawner name: ariadne-job-spawner
---
# Bind ariadne to the built-in system:auth-delegator ClusterRole.
# NOTE(review): auth-delegator conventionally enables TokenReview /
# SubjectAccessReview delegation for services that authenticate callers —
# confirm which ariadne endpoint (e.g. the state API) requires this.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: ariadne-auth-delegator
subjects:
  - kind: ServiceAccount
    name: ariadne
    namespace: maintenance
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:auth-delegator

View File

@ -21,3 +21,72 @@ spec:
policy: policy:
semver: semver:
range: ">=0.1.0-0" range: ">=0.1.0-0"
---
# Flux image automation: scan the Harbor registry for new metis tags.
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
  name: metis
  namespace: maintenance
spec:
  image: registry.bstein.dev/bstein/metis
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
# Select the newest semver tag, including 0.x pre-releases (e.g. 0.1.0-12).
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
  name: metis
  namespace: maintenance
spec:
  imageRepositoryRef:
    name: metis
  policy:
    semver:
      range: ">=0.1.0-0"
---
# Same pattern for the per-node sentinel image.
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
  name: metis-sentinel
  namespace: maintenance
spec:
  image: registry.bstein.dev/bstein/metis-sentinel
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
  name: metis-sentinel
  namespace: maintenance
spec:
  imageRepositoryRef:
    name: metis-sentinel
  policy:
    semver:
      range: ">=0.1.0-0"
---
# Same pattern for soteria.
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
  name: soteria
  namespace: maintenance
spec:
  image: registry.bstein.dev/bstein/soteria
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
  name: soteria
  namespace: maintenance
spec:
  imageRepositoryRef:
    name: soteria
  policy:
    semver:
      range: ">=0.1.0-0"

View File

@ -5,28 +5,50 @@ resources:
- namespace.yaml - namespace.yaml
- image.yaml - image.yaml
- secretproviderclass.yaml - secretproviderclass.yaml
- soteria-configmap.yaml
- metis-configmap.yaml
- metis-data-pvc.yaml
- vault-serviceaccount.yaml - vault-serviceaccount.yaml
- vault-sync-deployment.yaml - vault-sync-deployment.yaml
- ariadne-serviceaccount.yaml - ariadne-serviceaccount.yaml
- ariadne-rbac.yaml - ariadne-rbac.yaml
- disable-k3s-traefik-serviceaccount.yaml - disable-k3s-traefik-serviceaccount.yaml
- k3s-traefik-cleanup-rbac.yaml - k3s-traefik-cleanup-rbac.yaml
- metis-serviceaccount.yaml
- metis-rbac.yaml
- metis-token-sync-serviceaccount.yaml
- metis-token-sync-rbac.yaml
- node-nofile-serviceaccount.yaml - node-nofile-serviceaccount.yaml
- pod-cleaner-rbac.yaml - pod-cleaner-rbac.yaml
- soteria-serviceaccount.yaml
- soteria-rbac.yaml
- ariadne-deployment.yaml - ariadne-deployment.yaml
- metis-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml - oneoffs/ariadne-migrate-job.yaml
- ariadne-service.yaml - ariadne-service.yaml
- soteria-deployment.yaml
- disable-k3s-traefik-daemonset.yaml - disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml - oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml - node-nofile-daemonset.yaml
- metis-sentinel-daemonset.yaml
- metis-k3s-token-sync-cronjob.yaml
- k3s-agent-restart-daemonset.yaml - k3s-agent-restart-daemonset.yaml
- pod-cleaner-cronjob.yaml - pod-cleaner-cronjob.yaml
- node-image-sweeper-serviceaccount.yaml - node-image-sweeper-serviceaccount.yaml
- node-image-sweeper-daemonset.yaml - node-image-sweeper-daemonset.yaml
- image-sweeper-cronjob.yaml - image-sweeper-cronjob.yaml
- metis-service.yaml
- metis-ingress.yaml
- soteria-service.yaml
images: images:
- name: registry.bstein.dev/bstein/ariadne - name: registry.bstein.dev/bstein/ariadne
newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"} newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/metis
newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis:tag"}
- name: registry.bstein.dev/bstein/metis-sentinel
newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis-sentinel:tag"}
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
configMapGenerator: configMapGenerator:
- name: disable-k3s-traefik-script - name: disable-k3s-traefik-script
namespace: maintenance namespace: maintenance

View File

@ -0,0 +1,20 @@
# services/maintenance/metis-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: metis
namespace: maintenance
data:
METIS_BIND_ADDR: :8080
METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml
METIS_DATA_DIR: /var/lib/metis
METIS_DEFAULT_FLASH_HOST: titan-22
METIS_FLASH_HOSTS: titan-22
METIS_LOCAL_HOST: titan-22
METIS_ALLOWED_GROUPS: admin,maintainer
METIS_MAX_DEVICE_BYTES: "300000000000"
METIS_SENTINEL_PUSH_URL: http://metis.maintenance.svc.cluster.local/internal/sentinel/snapshot
METIS_SENTINEL_INTERVAL_SEC: "1800"
METIS_SENTINEL_NSENTER: "1"
METIS_IMAGE_RPI4_ARMBIAN_LONGHORN: https://armbian.chi.auroradev.org/dl/rpi4b/archive/Armbian_26.2.1_Rpi4b_noble_current_6.18.9_minimal.img.xz
METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256: sha256:c450687adf4cc6a59725c43aefd58baf42ec71bdd379227d403cdde281768e46

View File

@ -0,0 +1,13 @@
# services/maintenance/metis-data-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: metis-data
namespace: maintenance
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 40Gi
storageClassName: local-path

View File

@ -0,0 +1,47 @@
# services/maintenance/metis-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: metis
namespace: maintenance
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: metis
template:
metadata:
labels:
app: metis
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
serviceAccountName: metis
nodeSelector:
kubernetes.io/hostname: titan-22
kubernetes.io/arch: amd64
node-role.kubernetes.io/worker: "true"
containers:
- name: metis
image: registry.bstein.dev/bstein/metis:latest
imagePullPolicy: Always
envFrom:
- configMapRef:
name: metis
ports:
- name: http
containerPort: 8080
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]

View File

@ -0,0 +1,27 @@
# services/maintenance/metis-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: metis
namespace: maintenance
annotations:
kubernetes.io/ingress.class: traefik
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.middlewares: sso-oauth2-proxy-forward-auth@kubernetescrd
spec:
tls:
- hosts: ["metis.bstein.dev"]
secretName: metis-tls
rules:
- host: metis.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: metis
port:
number: 80

View File

@ -0,0 +1,51 @@
# services/maintenance/metis-k3s-token-sync-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: metis-k3s-token-sync
namespace: maintenance
spec:
schedule: "11 */6 * * *"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 2
jobTemplate:
spec:
template:
spec:
serviceAccountName: metis-token-sync
restartPolicy: OnFailure
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
containers:
- name: sync
image: registry.bstein.dev/bstein/kubectl:1.35.0
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
args:
- |
set -euo pipefail
token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/node-token)"
kubectl -n maintenance create secret generic metis-runtime \
--from-literal=k3s_token="${token}" \
--dry-run=client -o yaml | kubectl apply -f -
securityContext:
runAsUser: 0
volumeMounts:
- name: k3s-server
mountPath: /host/var/lib/rancher/k3s/server
readOnly: true
volumes:
- name: k3s-server
hostPath:
path: /var/lib/rancher/k3s/server

View File

@ -0,0 +1,27 @@
# services/maintenance/metis-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: metis-node-manager
rules:
- apiGroups: [""]
resources:
- nodes
verbs:
- get
- list
- watch
- delete
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: metis-node-manager
subjects:
- kind: ServiceAccount
name: metis
namespace: maintenance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: metis-node-manager

View File

@ -0,0 +1,133 @@
# services/maintenance/metis-sentinel-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: metis-sentinel
namespace: maintenance
spec:
selector:
matchLabels:
app: metis-sentinel
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
app: metis-sentinel
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
serviceAccountName: metis
nodeSelector:
kubernetes.io/os: linux
node-role.kubernetes.io/worker: "true"
containers:
- name: metis-sentinel
image: registry.bstein.dev/bstein/metis-sentinel:latest
imagePullPolicy: Always
command:
- /bin/sh
- -c
args:
- |
set -eu
out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
interval="${METIS_SENTINEL_INTERVAL_SEC:-120}"
mkdir -p "${out_dir}"
while true; do
ts="$(date -u +%Y%m%dT%H%M%SZ)"
node="${METIS_SENTINEL_NODE:-unknown}"
tmp="${out_dir}/${node}-${ts}.json.tmp"
out="${out_dir}/${node}-${ts}.json"
if metis-sentinel > "${tmp}"; then
mv "${tmp}" "${out}"
else
rm -f "${tmp}" || true
fi
sleep "${interval}"
done
envFrom:
- configMapRef:
name: metis
env:
- name: METIS_SENTINEL_NODE
valueFrom:
fieldRef:
fieldPath: spec.nodeName
ports:
- name: http
containerPort: 8080
volumeMounts:
- name: sentinel-output
mountPath: /var/run/metis-sentinel
resources:
requests:
cpu: 25m
memory: 64Mi
limits:
cpu: 250m
memory: 256Mi
securityContext:
allowPrivilegeEscalation: false
runAsUser: 0
capabilities:
drop: ["ALL"]
- name: sentinel-pusher
image: curlimages/curl:8.12.1
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
args:
- |
set -eu
out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
push_url="${METIS_SENTINEL_PUSH_URL:-}"
interval="${METIS_SENTINEL_PUSH_INTERVAL_SEC:-120}"
timeout="${METIS_SENTINEL_PUSH_TIMEOUT_SEC:-10}"
mkdir -p "${out_dir}"
while true; do
for snapshot in "${out_dir}"/*.json; do
[ -f "${snapshot}" ] || continue
if [ -z "${push_url}" ]; then
break
fi
if curl -fsS --connect-timeout "${timeout}" --max-time "${timeout}" \
-X POST \
-H "Content-Type: application/json" \
-H "X-Metis-Node: ${METIS_SENTINEL_NODE:-unknown}" \
--data-binary "@${snapshot}" \
"${push_url}"; then
rm -f "${snapshot}"
fi
done
sleep "${interval}"
done
envFrom:
- configMapRef:
name: metis
env:
- name: METIS_SENTINEL_NODE
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- name: sentinel-output
mountPath: /var/run/metis-sentinel
resources:
requests:
cpu: 10m
memory: 32Mi
limits:
cpu: 100m
memory: 128Mi
securityContext:
allowPrivilegeEscalation: false
runAsUser: 0
capabilities:
drop: ["ALL"]
volumes:
- name: sentinel-output
emptyDir: {}

View File

@ -0,0 +1,18 @@
# services/maintenance/metis-service.yaml
apiVersion: v1
kind: Service
metadata:
name: metis
namespace: maintenance
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "80"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
selector:
app: metis
ports:
- name: http
port: 80
targetPort: http

View File

@ -0,0 +1,6 @@
# services/maintenance/metis-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: metis
namespace: maintenance

View File

@ -0,0 +1,30 @@
# services/maintenance/metis-token-sync-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: metis-token-sync
namespace: maintenance
rules:
- apiGroups: [""]
resources:
- secrets
verbs:
- get
- list
- create
- update
- patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: metis-token-sync
namespace: maintenance
subjects:
- kind: ServiceAccount
name: metis-token-sync
namespace: maintenance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: metis-token-sync

View File

@ -0,0 +1,6 @@
# services/maintenance/metis-token-sync-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: metis-token-sync
namespace: maintenance

View File

@ -10,6 +10,8 @@ spec:
app: node-image-sweeper app: node-image-sweeper
updateStrategy: updateStrategy:
type: RollingUpdate type: RollingUpdate
rollingUpdate:
maxUnavailable: 100%
template: template:
metadata: metadata:
labels: labels:
@ -29,6 +31,21 @@ spec:
- name: node-image-sweeper - name: node-image-sweeper
image: python:3.12.9-alpine3.20 image: python:3.12.9-alpine3.20
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
env:
- name: SWEEP_INTERVAL_SEC
value: "21600"
- name: HIGH_USAGE_PERCENT
value: "70"
- name: EMERGENCY_USAGE_PERCENT
value: "80"
- name: BASE_THRESHOLD_DAYS
value: "14"
- name: HIGH_USAGE_THRESHOLD_DAYS
value: "3"
- name: LOG_RETENTION_DAYS
value: "7"
- name: JOURNAL_MAX_SIZE
value: "200M"
securityContext: securityContext:
privileged: true privileged: true
runAsUser: 0 runAsUser: 0

View File

@ -2,26 +2,39 @@
set -eu set -eu
ONE_SHOT=${ONE_SHOT:-false} ONE_SHOT=${ONE_SHOT:-false}
THRESHOLD_DAYS=14 SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage="" sweep_once() {
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
THRESHOLD_DAYS=3 threshold_days="${BASE_THRESHOLD_DAYS}"
fi if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
fi
cutoff=$(python3 - <<'PY' cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
import time, os import os
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400) import time
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
print(int(time.time()) - days * 86400)
PY PY
) )
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ') RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}') IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause" prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import json
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY' import os
import json, os, sys, time import sys
import time
try: try:
data = json.load(sys.stdin) data = json.load(sys.stdin)
@ -74,19 +87,33 @@ for p in prune:
PY PY
) )
if [ -n "${prune_list}" ]; then if [ -n "${prune_list}" ]; then
printf "%s" "${prune_list}" | while read -r image_id; do printf "%s" "${prune_list}" | while read -r image_id; do
if [ -n "${image_id}" ]; then if [ -n "${image_id}" ]; then
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
fi fi
done done
fi fi
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
# Emergency pass for rootfs pressure on SD-backed nodes.
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
fi
}
sweep_once
if [ "${ONE_SHOT}" = "true" ]; then if [ "${ONE_SHOT}" = "true" ]; then
exit 0 exit 0
fi fi
sleep infinity while true; do
sleep "${SWEEP_INTERVAL_SEC}"
sweep_once
done

View File

@ -0,0 +1,10 @@
# services/maintenance/soteria-configmap.yaml
# Runtime configuration for soteria, injected via envFrom in its Deployment.
# Points the backup driver at the in-cluster Longhorn backend service.
apiVersion: v1
kind: ConfigMap
metadata:
  name: soteria
  namespace: maintenance
data:
  SOTERIA_BACKUP_DRIVER: "longhorn"
  SOTERIA_LONGHORN_URL: "http://longhorn-backend.longhorn-system.svc:9500"
  SOTERIA_LONGHORN_BACKUP_MODE: "incremental"

View File

@ -0,0 +1,73 @@
# services/maintenance/soteria-deployment.yaml
# Single-replica soteria deployment. Scheduling is pinned to arm64 worker
# nodes, preferring rpi5 over rpi4 hardware. Config comes from the "soteria"
# ConfigMap; the container runs non-root with all capabilities dropped.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: soteria
  namespace: maintenance
spec:
  replicas: 1
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      app: soteria
  template:
    metadata:
      labels:
        app: soteria
    spec:
      serviceAccountName: soteria
      nodeSelector:
        kubernetes.io/arch: arm64
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          # Soft preference only: falls back to any matching worker if no
          # rpi5/rpi4-labelled node is schedulable.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 90
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi5"]
            - weight: 50
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi4"]
      containers:
        - name: soteria
          # :latest with Always pull means rollouts pick up new pushes;
          # pull auth comes from the ServiceAccount's imagePullSecrets.
          image: registry.bstein.dev/bstein/soteria:latest
          imagePullPolicy: Always
          ports:
            - name: http
              containerPort: 8080
          envFrom:
            - configMapRef:
                name: soteria
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 2
          readinessProbe:
            httpGet:
              path: /readyz
              port: http
            initialDelaySeconds: 2
            periodSeconds: 5
            timeoutSeconds: 2
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 65532
            capabilities:
              drop: ["ALL"]

View File

@ -0,0 +1,22 @@
# services/maintenance/soteria-rbac.yaml
# Cluster-wide read-only access to PVCs/PVs for soteria (backup inventory);
# no write verbs are granted.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: soteria
rules:
  - apiGroups: [""]
    resources: ["persistentvolumeclaims", "persistentvolumes"]
    verbs: ["get", "list"]
---
# Binds the read-only ClusterRole to the soteria ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: soteria
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: soteria
subjects:
  - kind: ServiceAccount
    name: soteria
    namespace: maintenance

View File

@ -0,0 +1,14 @@
# services/maintenance/soteria-service.yaml
# ClusterIP Service for soteria; port 80 forwards to the container port
# named "http" (8080 in the soteria Deployment).
apiVersion: v1
kind: Service
metadata:
  name: soteria
  namespace: maintenance
spec:
  type: ClusterIP
  selector:
    app: soteria
  ports:
    - name: http
      port: 80
      targetPort: http

View File

@ -0,0 +1,8 @@
# services/maintenance/soteria-serviceaccount.yaml
# Identity for the soteria workload. imagePullSecrets supplies the Harbor
# registry credential used when pulling registry.bstein.dev images.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: soteria
  namespace: maintenance
imagePullSecrets:
  - name: harbor-regcred

View File

@ -20,7 +20,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -89,7 +89,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) 
or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -1125,7 +1125,7 @@
{ {
"id": 17, "id": 17,
"type": "stat", "type": "stat",
"title": "Ariadne CI Coverage (%)", "title": "Platform CI Coverage (%)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1138,7 +1138,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"legendFormat": "{{branch}}", "legendFormat": "{{branch}}",
"instant": true "instant": true
@ -1183,12 +1183,13 @@
"values": false "values": false
}, },
"textMode": "value" "textMode": "value"
} },
"description": "Internal source panel for Atlas Overview automation test rollups."
}, },
{ {
"id": 18, "id": 18,
"type": "table", "type": "table",
"title": "Ariadne CI Tests (latest)", "title": "Platform CI Tests (latest)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1201,7 +1202,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -1233,7 +1234,8 @@
"order": "desc" "order": "desc"
} }
} }
] ],
"description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
} }
], ],
"time": { "time": {

View File

@ -1677,7 +1677,7 @@
{ {
"id": 42, "id": 42,
"type": "timeseries", "type": "timeseries",
"title": "Ariadne Test Success Rate", "title": "Platform Test Success Rate",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1690,7 +1690,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A" "refId": "A"
} }
], ],
@ -1709,12 +1709,13 @@
"tooltip": { "tooltip": {
"mode": "multi" "mode": "multi"
} }
} },
"description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
}, },
{ {
"id": 43, "id": 43,
"type": "bargauge", "type": "bargauge",
"title": "Tests with Failures (24h)", "title": "Platform Tests with Failures (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1727,7 +1728,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"refId": "A", "refId": "A",
"legendFormat": "{{result}}", "legendFormat": "{{result}}",
"instant": true "instant": true
@ -1814,7 +1815,8 @@
"order": "desc" "order": "desc"
} }
} }
] ],
"description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
}, },
{ {
"id": 11, "id": 11,
@ -1901,7 +1903,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -22,7 +22,24 @@ data:
- orgId: 1 - orgId: 1
receiver: email-admins receiver: email-admins
group_by: group_by:
- grafana_folder
- alertname - alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: | rules.yaml: |
apiVersion: 1 apiVersion: 1
groups: groups:
@ -32,7 +49,7 @@ data:
interval: 1m interval: 1m
rules: rules:
- uid: disk-pressure-root - uid: disk-pressure-root
title: "Node rootfs high (>80%)" title: "Node rootfs high (>85%)"
condition: C condition: C
for: "10m" for: "10m"
data: data:
@ -66,7 +83,7 @@ data:
type: threshold type: threshold
conditions: conditions:
- evaluator: - evaluator:
params: [80] params: [85]
type: gt type: gt
operator: operator:
type: and type: and
@ -76,7 +93,7 @@ data:
noDataState: NoData noDataState: NoData
execErrState: Error execErrState: Error
annotations: annotations:
summary: "{{ $labels.node }} rootfs >80% for 10m" summary: "{{ $labels.node }} rootfs >85% for 10m"
labels: labels:
severity: warning severity: warning
- uid: disk-growth-1h - uid: disk-growth-1h
@ -145,7 +162,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\") expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{instance}}' legendFormat: '{{instance}}'
datasource: datasource:
type: prometheus type: prometheus
@ -286,8 +303,8 @@ data:
summary: "node-image-sweeper not fully ready" summary: "node-image-sweeper not fully ready"
labels: labels:
severity: warning severity: warning
- uid: maint-cron-stale - uid: maint-ariadne-image-sweeper-stale
title: "Maintenance CronJobs stale (>3h since success)" title: "Ariadne image sweeper stale (schedule >8d)"
condition: C condition: C
for: "5m" for: "5m"
data: data:
@ -297,10 +314,10 @@ data:
to: 0 to: 0
datasourceUid: atlas-vm datasourceUid: atlas-vm
model: model:
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0) expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
legendFormat: '{{cronjob}}' legendFormat: '{{task}}'
datasource: datasource:
type: prometheus type: prometheus
uid: atlas-vm uid: atlas-vm
@ -321,17 +338,166 @@ data:
type: threshold type: threshold
conditions: conditions:
- evaluator: - evaluator:
params: [10800] params: [691200]
type: gt type: gt
operator: operator:
type: and type: and
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Maintenance cronjob stale >3h since last success" summary: "Ariadne image sweeper stale >8d since last success"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (legacy disabled)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: legacy
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: OK
annotations:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne schedule failed ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne scheduler behind for {{ $labels.task }}"
labels: labels:
severity: warning severity: warning
- orgId: 1 - orgId: 1
@ -352,7 +518,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"} expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
legendFormat: bounce 1d legendFormat: bounce 1d
datasource: datasource:
type: prometheus type: prometheus
@ -381,7 +547,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark 1d bounce rate >5%" summary: "Postmark 1d bounce rate >5%"
@ -400,7 +566,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: POSTMARK_API_UP expr: max(postmark_api_up) or on() vector(0)
legendFormat: api up legendFormat: api up
datasource: datasource:
type: prometheus type: prometheus
@ -429,7 +595,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark exporter reports API down" summary: "Postmark exporter reports API down"

View File

@ -29,7 +29,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -98,7 +98,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) 
or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -1134,7 +1134,7 @@ data:
{ {
"id": 17, "id": 17,
"type": "stat", "type": "stat",
"title": "Ariadne CI Coverage (%)", "title": "Platform CI Coverage (%)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1147,7 +1147,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"legendFormat": "{{branch}}", "legendFormat": "{{branch}}",
"instant": true "instant": true
@ -1192,12 +1192,13 @@ data:
"values": false "values": false
}, },
"textMode": "value" "textMode": "value"
} },
"description": "Internal source panel for Atlas Overview automation test rollups."
}, },
{ {
"id": 18, "id": 18,
"type": "table", "type": "table",
"title": "Ariadne CI Tests (latest)", "title": "Platform CI Tests (latest)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1210,7 +1211,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -1242,7 +1243,8 @@ data:
"order": "desc" "order": "desc"
} }
} }
] ],
"description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
} }
], ],
"time": { "time": {

View File

@ -1686,7 +1686,7 @@ data:
{ {
"id": 42, "id": 42,
"type": "timeseries", "type": "timeseries",
"title": "Ariadne Test Success Rate", "title": "Platform Test Success Rate",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1699,7 +1699,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A" "refId": "A"
} }
], ],
@ -1718,12 +1718,13 @@ data:
"tooltip": { "tooltip": {
"mode": "multi" "mode": "multi"
} }
} },
"description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
}, },
{ {
"id": 43, "id": 43,
"type": "bargauge", "type": "bargauge",
"title": "Tests with Failures (24h)", "title": "Platform Tests with Failures (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1736,7 +1737,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"refId": "A", "refId": "A",
"legendFormat": "{{result}}", "legendFormat": "{{result}}",
"instant": true "instant": true
@ -1823,7 +1824,8 @@ data:
"order": "desc" "order": "desc"
} }
} }
] ],
"description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
}, },
{ {
"id": 11, "id": 11,
@ -1910,7 +1912,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -286,7 +286,7 @@ spec:
podAnnotations: podAnnotations:
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring" vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "1" monitoring.bstein.dev/restart-rev: "6"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: | vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }} {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}

View File

@ -43,6 +43,12 @@ spec:
value: /var/run/secrets/vault-token-reviewer/token value: /var/run/secrets/vault-token-reviewer/token
- name: VAULT_K8S_ROLE_TTL - name: VAULT_K8S_ROLE_TTL
value: 1h value: 1h
- name: VAULT_K8S_BOUND_AUDIENCES
value: "https://kubernetes.default.svc,https://kubernetes.default.svc.cluster.local,k3s"
- name: VAULT_K8S_ISSUER
value: https://kubernetes.default.svc.cluster.local
- name: VAULT_K8S_DISABLE_ISS_VALIDATION
value: "false"
volumeMounts: volumeMounts:
- name: k8s-auth-config-script - name: k8s-auth-config-script
mountPath: /scripts mountPath: /scripts

View File

@ -53,6 +53,8 @@ ensure_token
k8s_host="https://${KUBERNETES_SERVICE_HOST}:443" k8s_host="https://${KUBERNETES_SERVICE_HOST}:443"
k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)" k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)"
k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
k8s_issuer="${VAULT_K8S_ISSUER:-}"
disable_iss_validation="${VAULT_K8S_DISABLE_ISS_VALIDATION:-true}"
role_ttl="${VAULT_K8S_ROLE_TTL:-1h}" role_ttl="${VAULT_K8S_ROLE_TTL:-1h}"
token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}" token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}"
@ -68,11 +70,36 @@ if ! vault_cmd auth list -format=json | grep -q '"kubernetes/"'; then
vault_cmd auth enable kubernetes vault_cmd auth enable kubernetes
fi fi
ensure_default_policy_login() {
default_policy="$(vault_cmd policy read default)"
if printf '%s' "${default_policy}" | grep -q 'auth/kubernetes/login'; then
return
fi
log "updating default policy to allow kubernetes login"
default_policy="${default_policy}
path \"auth/kubernetes/login\" {
capabilities = [\"create\", \"update\"]
}
"
printf '%s\n' "${default_policy}" | vault_cmd policy write default -
}
log "configuring kubernetes auth" log "configuring kubernetes auth"
vault_cmd write auth/kubernetes/config \ if [ -n "${k8s_issuer}" ]; then
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}" \
issuer="${k8s_issuer}" \
disable_iss_validation="${disable_iss_validation}"
else
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \ token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \ kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}" kubernetes_ca_cert="${k8s_ca}"
fi
ensure_default_policy_login
write_raw_policy() { write_raw_policy() {
name="$1" name="$1"
@ -87,6 +114,7 @@ write_policy_and_role() {
service_accounts="$3" service_accounts="$3"
read_paths="$4" read_paths="$4"
write_paths="$5" write_paths="$5"
audiences="${VAULT_K8S_BOUND_AUDIENCES:-}"
policy_body="" policy_body=""
for path in ${read_paths}; do for path in ${read_paths}; do
@ -109,11 +137,42 @@ path \"kv/metadata/atlas/${path}\" {
} }
" "
done done
if [ "${role}" = "maintenance" ]; then
policy_body="${policy_body}
path \"sys/auth\" {
capabilities = [\"read\"]
}
path \"sys/auth/*\" {
capabilities = [\"create\", \"update\", \"read\", \"sudo\"]
}
path \"auth/kubernetes/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"auth/oidc/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"sys/policies/acl\" {
capabilities = [\"list\"]
}
path \"sys/policies/acl/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
"
fi
log "writing policy ${role}" log "writing policy ${role}"
printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" - printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" -
log "writing role ${role}" log "writing role ${role}"
if [ -n "${audiences}" ]; then
vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_audiences="${audiences}" \
bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \
policies="${role}" \
ttl="${role_ttl}"
return
fi
vault_cmd write "auth/kubernetes/role/${role}" \ vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_names="${service_accounts}" \ bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \ bound_service_account_namespaces="${namespace}" \
@ -218,6 +277,8 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
"nextcloud/* shared/keycloak-admin shared/postmark-relay" "" "nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "ai" "ai" "atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \ write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
"jenkins/* shared/harbor-pull" "" "jenkins/* shared/harbor-pull" ""
write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
@ -231,7 +292,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
write_policy_and_role "health" "health" "health-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \
"health/*" "" "health/*" ""
write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
"maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" "" "maintenance/ariadne-db maintenance/soteria-restic portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
write_policy_and_role "finance" "finance" "finance-vault" \ write_policy_and_role "finance" "finance" "finance-vault" \
"finance/* shared/postmark-relay" "" "finance/* shared/postmark-relay" ""
write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \