Compare commits


383 Commits

Author SHA1 Message Date
fb5064fa17 maintenance: grant ariadne auth-delegator 2026-02-08 09:55:20 -03:00
e29b31ff42 ariadne: use vault-admin role for vault config 2026-02-07 22:34:10 -03:00
ab33af3401 ariadne: run image sweeper daily 2026-02-07 11:11:41 -03:00
f275764b15 gitea: prefer rpi5 nodes 2026-02-07 11:07:02 -03:00
flux-bot d082bee3bc chore(maintenance): automated image update 2026-02-07 13:56:49 +00:00
b1ecc3eef8 maintenance: align vault role env 2026-02-07 10:51:20 -03:00
flux-bot 6e6beb071b chore(atlasbot): automated image update 2026-02-07 13:50:38 +00:00
c9a9c801ec infra: bias gitea/monerod placement, bump synapse ensure job 2026-02-07 10:48:48 -03:00
4edc888246 comms: run synapse admin ensure (admin flag) 2026-02-07 10:30:34 -03:00
917ee077ad comms: ensure synapse admin flag; ariadne vault role 2026-02-07 10:28:55 -03:00
8c931f6a58 ariadne: accelerate schedules for alert clearing 2026-02-07 03:23:42 -03:00
7ec1b812d6 comms: re-suspend synapse admin job 2026-02-07 03:19:42 -03:00
62376138dd comms: run synapse admin ensure 2026-02-07 03:16:44 -03:00
0bab0deedf vault: allow maintenance auth sync 2026-02-07 03:13:53 -03:00
411ad0e4ba crypto: run xmrig only on rpi5 2026-02-06 23:34:31 -03:00
7c419748b7 comms: suspend synapse admin ensure 2026-02-06 20:21:01 -03:00
c901a0a0cb comms: run synapse admin ensure with python image 2026-02-06 20:13:02 -03:00
6df05c9adc comms: run synapse admin ensure 2026-02-06 20:01:38 -03:00
578ccd97e5 jenkins: add dind cache pvc 2026-02-06 20:00:01 -03:00
5e0c5b200c gitea: avoid longhorn nodes 2026-02-06 19:33:55 -03:00
28278d6c67 maintenance: pivot soteria to longhorn 2026-02-06 18:38:29 -03:00
e3ab256336 longhorn: add b2 backup target 2026-02-06 18:28:37 -03:00
flux-bot dd02a49626 chore(maintenance): automated image update 2026-02-06 21:27:42 +00:00
d430a480f0 maintenance: restore soteria job node selector 2026-02-06 04:19:36 -03:00
0d09492984 maintenance: pin soteria jobs to titan-24 for backup 2026-02-06 04:15:58 -03:00
45b2c79c72 maintenance: pin soteria jobs to arm64 workers 2026-02-06 04:10:55 -03:00
flux-bot aad8e11b37 chore(maintenance): automated image update 2026-02-06 07:10:04 +00:00
cfebff5f08 maintenance: remove restic init job 2026-02-06 03:50:30 -03:00
e463674ca9 maintenance: add restic init job 2026-02-06 03:48:45 -03:00
flux-bot a6ceaa4cf1 chore(maintenance): automated image update 2026-02-05 18:56:27 +00:00
flux-bot 64b70bf391 chore(maintenance): automated image update 2026-02-05 18:45:20 +00:00
543880d06f maintenance: schedule soteria on rpi workers 2026-02-05 15:30:09 -03:00
flux-bot 873f392b88 chore(bstein-dev-home): automated image update 2026-02-05 18:24:54 +00:00
flux-bot 286925857a chore(maintenance): automated image update 2026-02-05 18:24:44 +00:00
36311b877b harbor: route registry traffic via core 2026-02-05 15:23:42 -03:00
cd2e2dff17 harbor: wire registryctl notification auth 2026-02-05 15:17:54 -03:00
b5a357d477 harbor: fix registry notification URL 2026-02-05 15:00:43 -03:00
bda5871035 harbor: restore registry notifications env 2026-02-05 14:50:53 -03:00
89490d5aa5 harbor: enable registry notifications 2026-02-05 14:44:09 -03:00
flux-bot 97de9b6d18 chore(atlasbot): automated image update 2026-02-05 17:38:26 +00:00
flux-bot e5ceb234c3 chore(maintenance): automated image update 2026-02-05 17:04:24 +00:00
flux-bot a714c9994a chore(maintenance): automated image update 2026-02-05 16:32:49 +00:00
ced6d511ff maintenance: move soteria image to bstein 2026-02-05 13:12:03 -03:00
flux-bot e0600baa4b chore(atlasbot): automated image update 2026-02-05 15:58:19 +00:00
flux-bot 557ccb7bbd chore(atlasbot): automated image update 2026-02-05 01:26:05 +00:00
c16113088e atlasbot: set genius model env 2026-02-04 19:39:43 -03:00
flux-bot 18524a0065 chore(atlasbot): automated image update 2026-02-04 22:15:47 +00:00
flux-bot 0a7e05a735 chore(atlasbot): automated image update 2026-02-04 21:45:45 +00:00
flux-bot 14d90298e8 chore(atlasbot): automated image update 2026-02-04 21:39:45 +00:00
flux-bot 2523ebee2a chore(atlasbot): automated image update 2026-02-04 19:08:32 +00:00
flux-bot 76f27b7eed chore(atlasbot): automated image update 2026-02-04 18:09:26 +00:00
flux-bot 4abf16687b chore(atlasbot): automated image update 2026-02-04 18:03:26 +00:00
flux-bot b20922b3ec chore(atlasbot): automated image update 2026-02-04 17:56:26 +00:00
flux-bot 6007050545 chore(atlasbot): automated image update 2026-02-04 17:49:23 +00:00
flux-bot 1b9c78166e chore(atlasbot): automated image update 2026-02-04 17:30:22 +00:00
flux-bot b8c5f547aa chore(atlasbot): automated image update 2026-02-04 17:23:23 +00:00
flux-bot 8bc999a7f2 chore(atlasbot): automated image update 2026-02-04 17:20:23 +00:00
flux-bot 93cb39cd23 chore(atlasbot): automated image update 2026-02-04 17:14:21 +00:00
flux-bot 6c84d63500 chore(atlasbot): automated image update 2026-02-04 17:07:21 +00:00
flux-bot 9b341a865d chore(atlasbot): automated image update 2026-02-04 17:00:21 +00:00
flux-bot 66541c29ca chore(atlasbot): automated image update 2026-02-04 16:53:20 +00:00
flux-bot 00c8be0dd8 chore(atlasbot): automated image update 2026-02-04 16:45:19 +00:00
flux-bot 57d672c264 chore(atlasbot): automated image update 2026-02-04 16:39:18 +00:00
flux-bot 3252409a7b chore(atlasbot): automated image update 2026-02-04 14:03:05 +00:00
flux-bot e656120be9 chore(atlasbot): automated image update 2026-02-04 03:27:09 +00:00
flux-bot 49151ad13e chore(atlasbot): automated image update 2026-02-04 03:01:07 +00:00
flux-bot 9b7778f193 chore(atlasbot): automated image update 2026-02-04 02:54:06 +00:00
flux-bot 56cd01f4d1 chore(atlasbot): automated image update 2026-02-04 02:42:05 +00:00
flux-bot c6c1ec9129 chore(atlasbot): automated image update 2026-02-04 02:30:04 +00:00
flux-bot 3a39e0972e chore(atlasbot): automated image update 2026-02-04 01:54:01 +00:00
flux-bot 5d87aefc4b chore(maintenance): automated image update 2026-02-04 01:51:59 +00:00
flux-bot cf36ed6279 chore(atlasbot): automated image update 2026-02-04 01:27:59 +00:00
flux-bot d88648bdf8 chore(atlasbot): automated image update 2026-02-04 01:09:57 +00:00
flux-bot d15779e6dc chore(atlasbot): automated image update 2026-02-04 00:55:56 +00:00
flux-bot 2992b8c581 chore(atlasbot): automated image update 2026-02-04 00:42:56 +00:00
flux-bot 0cc49081ff chore(atlasbot): automated image update 2026-02-04 00:37:55 +00:00
flux-bot e6b8e4d39e chore(atlasbot): automated image update 2026-02-04 00:34:55 +00:00
flux-bot cb42182358 chore(atlasbot): automated image update 2026-02-04 00:19:53 +00:00
flux-bot 12b81b2f0d chore(atlasbot): automated image update 2026-02-03 22:41:45 +00:00
flux-bot 454017d7ea chore(atlasbot): automated image update 2026-02-03 22:06:41 +00:00
flux-bot 4ebf2ad742 chore(atlasbot): automated image update 2026-02-03 20:18:32 +00:00
flux-bot 999be05fd9 chore(atlasbot): automated image update 2026-02-03 19:56:31 +00:00
flux-bot cdb94ee7a4 chore(atlasbot): automated image update 2026-02-03 19:29:28 +00:00
flux-bot e259ab8a8d chore(atlasbot): automated image update 2026-02-03 18:04:21 +00:00
flux-bot 8630e626fe chore(atlasbot): automated image update 2026-02-03 17:53:20 +00:00
flux-bot 3d655dda4f chore(atlasbot): automated image update 2026-02-03 17:42:19 +00:00
flux-bot 0f935f7a78 chore(atlasbot): automated image update 2026-02-03 17:34:18 +00:00
flux-bot e4629ec198 chore(atlasbot): automated image update 2026-02-03 17:16:17 +00:00
flux-bot 30b024dfc1 chore(atlasbot): automated image update 2026-02-03 15:15:07 +00:00
flux-bot 194404619b chore(atlasbot): automated image update 2026-02-03 15:05:06 +00:00
flux-bot 77919cbf20 chore(atlasbot): automated image update 2026-02-03 14:57:06 +00:00
flux-bot 51db4e0612 chore(atlasbot): automated image update 2026-02-03 14:51:05 +00:00
flux-bot 42adbe98c0 chore(atlasbot): automated image update 2026-02-03 14:40:04 +00:00
flux-bot 65e3947f5a chore(atlasbot): automated image update 2026-02-03 14:15:01 +00:00
flux-bot 2b52d07f95 chore(atlasbot): automated image update 2026-02-03 14:07:01 +00:00
flux-bot ce020e06c0 chore(atlasbot): automated image update 2026-02-03 13:43:59 +00:00
flux-bot f5437db369 chore(atlasbot): automated image update 2026-02-03 13:22:57 +00:00
flux-bot 455a58b982 chore(atlasbot): automated image update 2026-02-03 13:08:56 +00:00
flux-bot 3e044ed3fc chore(atlasbot): automated image update 2026-02-03 13:04:56 +00:00
flux-bot 68d794c909 chore(atlasbot): automated image update 2026-02-03 12:56:55 +00:00
flux-bot 709ec5d039 chore(atlasbot): automated image update 2026-02-03 12:32:53 +00:00
flux-bot 33b9d678c1 chore(atlasbot): automated image update 2026-02-03 07:33:28 +00:00
flux-bot c2ffad3937 chore(atlasbot): automated image update 2026-02-03 06:31:22 +00:00
flux-bot ed73f69d60 chore(atlasbot): automated image update 2026-02-03 06:07:21 +00:00
flux-bot 773a9526dc chore(atlasbot): automated image update 2026-02-03 04:57:14 +00:00
80d7c585e1 atlasbot: raise llm call caps 2026-02-03 01:55:21 -03:00
flux-bot 1272357177 chore(atlasbot): automated image update 2026-02-03 03:29:07 +00:00
flux-bot 47e2d706c4 chore(atlasbot): automated image update 2026-02-03 03:26:07 +00:00
ca5393bf4c jenkins(atlasbot): set main branch 2026-02-02 23:12:13 -03:00
dc83ead648 jenkins(atlasbot): use main branch 2026-02-02 23:10:42 -03:00
72d3dffd1e jenkins(atlasbot): track main branch 2026-02-02 22:25:56 -03:00
a2833f3c26 ci(atlasbot): add Jenkins job and image automation 2026-02-02 20:25:47 -03:00
flux-bot 98c5981869 chore(atlasbot): automated image update 2026-02-02 21:04:06 +00:00
flux-bot 53088cc82d chore(atlasbot): automated image update 2026-02-02 20:22:02 +00:00
0ae534e387 vault: add default k8s audience 2026-02-02 17:15:35 -03:00
flux-bot e5fbc8f6ed chore(atlasbot): automated image update 2026-02-02 20:08:02 +00:00
flux-bot 791a14a9e5 chore(atlasbot): automated image update 2026-02-02 19:53:00 +00:00
flux-bot 9f29205201 chore(atlasbot): automated image update 2026-02-02 18:13:52 +00:00
flux-bot bae9b1bfc2 chore(atlasbot): automated image update 2026-02-02 18:04:51 +00:00
44f376b492 atlasbot: bump image to 0.1.0-133 2026-02-02 14:58:38 -03:00
flux-bot e0410cfa33 chore(atlasbot): automated image update 2026-02-02 17:56:53 +00:00
flux-bot 45b86a3478 chore(atlasbot): automated image update 2026-02-02 17:56:48 +00:00
8523f7bc91 atlasbot: bump image to 0.1.0-132 2026-02-02 14:56:24 -03:00
f2b8f79a7a atlasbot: bump image to 0.1.0-131 2026-02-02 14:54:36 -03:00
d6a4c7f888 atlasbot: bump image to 0.1.0-130 2026-02-02 14:48:34 -03:00
7bd069cb3b atlasbot: bump image to 0.1.0-129 2026-02-02 14:41:22 -03:00
flux-bot 80e94c7d67 chore(atlasbot): automated image update 2026-02-02 17:32:49 +00:00
157c93f2a9 atlasbot: disable queue for testing 2026-02-02 14:24:09 -03:00
flux-bot c259c5abe4 chore(atlasbot): automated image update 2026-02-02 17:13:47 +00:00
flux-bot f1d628682b chore(atlasbot): automated image update 2026-02-02 16:55:46 +00:00
flux-bot d375d8a680 chore(atlasbot): automated image update 2026-02-02 16:45:45 +00:00
flux-bot 9c297f6609 chore(atlasbot): automated image update 2026-02-02 16:38:44 +00:00
97b2385aa2 atlasbot: bump image to 0.1.0-123 2026-02-02 13:30:34 -03:00
3cdde19de0 atlasbot: bump image to 0.1.0-122 2026-02-02 13:21:28 -03:00
flux-bot af99e0e315 chore(atlasbot): automated image update 2026-02-02 16:10:42 +00:00
flux-bot 7905da3b9a chore(atlasbot): automated image update 2026-02-02 15:57:41 +00:00
flux-bot 0bff5b0835 chore(atlasbot): automated image update 2026-02-02 15:47:40 +00:00
69744225bb atlasbot: bump image to 0.1.0-118 2026-02-02 12:39:24 -03:00
flux-bot b15c9a6a63 chore(atlasbot): automated image update 2026-02-02 15:20:38 +00:00
flux-bot 23a67f0ddf chore(atlasbot): automated image update 2026-02-02 15:17:37 +00:00
f8c04770a3 jenkins: reload jcasc for soteria 2026-02-02 12:11:07 -03:00
flux-bot e489ffca7c chore(atlasbot): automated image update 2026-02-02 15:09:37 +00:00
80cb818fa2 atlasbot: bump image to 0.1.0-114 2026-02-02 12:05:58 -03:00
flux-bot d295eec276 chore(atlasbot): automated image update 2026-02-02 15:01:36 +00:00
896b4c4890 atlasbot: bump image to 0.1.0-112 2026-02-02 11:52:59 -03:00
flux-bot 82f9147c7f chore(atlasbot): automated image update 2026-02-02 14:49:35 +00:00
1f476b8541 atlasbot: bump image to 0.1.0-110 2026-02-02 11:42:03 -03:00
flux-bot d80e6bb6b6 chore(atlasbot): automated image update 2026-02-02 14:36:35 +00:00
9fe547aa09 atlasbot: bump image to 0.1.0-108 2026-02-02 11:23:53 -03:00
090a22a0b5 atlasbot: bump image to 0.1.0-107 2026-02-02 11:14:54 -03:00
05d6ee9d6e jenkins: add soteria pipeline job 2026-02-02 11:01:22 -03:00
a7c1774044 atlasbot: bump image to 0.1.0-106 2026-02-02 11:00:18 -03:00
b3b4cbecdd add ai harbor regcred sync 2026-02-02 10:08:46 -03:00
13f59fb5e7 bump atlasbot image 2026-02-02 10:05:06 -03:00
fa8777d056 track atlasbot knowledge index 2026-02-02 09:48:40 -03:00
4f2ae810a5 move atlasbot to ai namespace 2026-02-02 09:46:50 -03:00
flux-bot 382ccfe0f1 chore(comms): automated image update 2026-02-02 06:03:16 +00:00
flux-bot af61d2109d chore(comms): automated image update 2026-02-02 05:59:16 +00:00
flux-bot c98b69e368 chore(comms): automated image update 2026-02-02 05:59:04 +00:00
ccd92f6014 comms: bump atlasbot to 0.1.0-103 2026-02-02 02:58:44 -03:00
flux-bot ab672773dd chore(comms): automated image update 2026-02-02 05:49:15 +00:00
flux-bot d1a490a80a chore(comms): automated image update 2026-02-02 05:46:15 +00:00
flux-bot a9d235695a chore(comms): automated image update 2026-02-02 05:45:54 +00:00
1f19ae46f5 comms: bump atlasbot to 0.1.0-101 2026-02-02 02:45:33 -03:00
flux-bot f833b61a76 chore(comms): automated image update 2026-02-02 05:39:14 +00:00
b9d660fc9a comms: bump atlasbot to 0.1.0-99 2026-02-02 02:16:31 -03:00
300b13f995 comms: bump atlasbot to 0.1.0-98 2026-02-02 02:09:57 -03:00
17d8ca3b2a comms: bump atlasbot to 0.1.0-97 2026-02-02 02:03:50 -03:00
flux-bot 27c7aade1c chore(comms): automated image update 2026-02-02 05:02:11 +00:00
dae28077b5 comms: bump atlasbot to 0.1.0-96 2026-02-02 01:57:58 -03:00
b202dacfb1 comms: bump atlasbot to 0.1.0-95 2026-02-02 01:54:41 -03:00
d72d21268b comms: bump atlasbot to 0.1.0-94 2026-02-02 01:45:52 -03:00
58dc219452 comms: bump atlasbot to 0.1.0-93 2026-02-02 01:38:59 -03:00
7c6a731c7c comms: bump atlasbot to 0.1.0-92 2026-02-01 18:46:01 -03:00
f0259086fd comms: bump atlasbot to 0.1.0-91 2026-02-01 18:42:00 -03:00
flux-bot c3cf0e7900 chore(comms): automated image update 2026-02-01 21:37:32 +00:00
flux-bot a0e52401fc chore(comms): automated image update 2026-02-01 21:33:32 +00:00
flux-bot 51fc85587a chore(comms): automated image update 2026-02-01 21:25:31 +00:00
5d9526af73 comms: bump atlasbot to 0.1.0-87 2026-02-01 18:05:00 -03:00
flux-bot b537a7def8 chore(comms): automated image update 2026-02-01 20:55:29 +00:00
2f2a6f9132 comms: bump atlasbot to 0.1.0-85 2026-02-01 17:48:24 -03:00
flux-bot 01d0a16210 chore(comms): automated image update 2026-02-01 20:43:28 +00:00
d1ac654e99 comms: bump atlasbot to 0.1.0-83 2026-02-01 14:45:58 -03:00
flux-bot 42a8c48426 chore(comms): automated image update 2026-02-01 17:39:12 +00:00
0cc155cfcc comms: bump atlasbot to 0.1.0-81 2026-02-01 14:34:43 -03:00
c7189bff8c comms: bump atlasbot to 0.1.0-80 2026-02-01 14:28:34 -03:00
3ca0df9529 comms: bump atlasbot to 0.1.0-79 2026-02-01 14:07:57 -03:00
flux-bot 1c9259b5b4 chore(comms): automated image update 2026-02-01 15:41:02 +00:00
flux-bot 387bfadc76 chore(comms): automated image update 2026-02-01 15:36:01 +00:00
50bdd18c56 comms: bump atlasbot image 2026-02-01 12:25:31 -03:00
19d22abd0f vault: fix k8s auth env indent 2026-02-01 12:20:04 -03:00
fd8396730c vault: set kubernetes issuer 2026-02-01 12:18:57 -03:00
flux-bot dcabfb2ebb chore(comms): automated image update 2026-02-01 15:01:58 +00:00
flux-bot 5d5b5c031e chore(comms): automated image update 2026-02-01 14:55:58 +00:00
flux-bot d79cce72af chore(comms): automated image update 2026-02-01 14:55:52 +00:00
9147d4107f comms: bump atlasbot image 2026-02-01 11:55:26 -03:00
flux-bot a21e58ad2c chore(comms): automated image update 2026-02-01 14:47:57 +00:00
flux-bot 10db6b4973 chore(comms): automated image update 2026-02-01 14:18:55 +00:00
7c6a91d758 vault: set k8s auth audiences 2026-02-01 11:17:02 -03:00
flux-bot ced41aa633 chore(comms): automated image update 2026-02-01 13:58:53 +00:00
cb2026b511 atlasbot: bump to 0.1.0-70 2026-02-01 10:37:29 -03:00
flux-bot 38d826e4ea chore(comms): automated image update 2026-02-01 08:40:26 +00:00
flux-bot 4a30c5c706 chore(comms): automated image update 2026-02-01 08:40:09 +00:00
c43efe20dc atlasbot: bump to 0.1.0-69 2026-02-01 05:39:54 -03:00
a80047060f comms: bump atlasbot image 2026-02-01 05:31:07 -03:00
flux-bot 00b84f3b89 chore(comms): automated image update 2026-02-01 08:23:24 +00:00
flux-bot 06a559c7ea chore(comms): automated image update 2026-02-01 08:18:24 +00:00
eae548bdd1 comms: bump atlasbot image 2026-02-01 05:12:59 -03:00
flux-bot 8024db9d11 chore(comms): automated image update 2026-02-01 08:10:23 +00:00
flux-bot 5dcbef6be3 chore(comms): automated image update 2026-02-01 07:55:22 +00:00
flux-bot f3420b79bf chore(comms): automated image update 2026-02-01 07:49:21 +00:00
flux-bot e7588ac06f chore(comms): automated image update 2026-02-01 07:46:21 +00:00
flux-bot 9272dcf0e6 chore(comms): automated image update 2026-02-01 07:38:20 +00:00
76e44e949c comms: bump atlasbot to 0.1.0-59 2026-02-01 04:32:01 -03:00
0915ed2205 comms: bump atlasbot to 0.1.0-58 2026-02-01 04:25:12 -03:00
flux-bot 187c0c2c88 chore(comms): automated image update 2026-02-01 07:17:18 +00:00
flux-bot ae12b18d6f chore(comms): automated image update 2026-02-01 06:39:16 +00:00
080e583cf9 comms: bump atlasbot to 0.1.0-55 2026-02-01 02:08:54 -03:00
flux-bot a1b4c878c2 chore(comms): automated image update 2026-02-01 05:07:08 +00:00
22bfb15bd6 comms: bump atlasbot to 0.1.0-54 2026-02-01 01:51:26 -03:00
flux-bot 8948523474 chore(comms): automated image update 2026-02-01 04:51:06 +00:00
303defd745 comms: bump atlasbot to 0.1.0-53 2026-02-01 01:39:09 -03:00
flux-bot 5fb9e77634 chore(comms): automated image update 2026-02-01 04:39:05 +00:00
454a759cda comms: bump atlasbot to 0.1.0-52 2026-02-01 01:29:30 -03:00
flux-bot 26ff18e983 chore(comms): automated image update 2026-02-01 04:29:04 +00:00
0b77c3bc2c comms: bump atlasbot to 0.1.0-51 2026-02-01 01:15:18 -03:00
flux-bot 58eecc72bd chore(comms): automated image update 2026-02-01 04:15:03 +00:00
2e0ac03458 comms(atlasbot): bump image to 0.1.0-50 2026-01-31 22:30:04 -03:00
flux-bot 8e74a4c0b3 chore(comms): automated image update 2026-02-01 01:28:49 +00:00
0aed7fe3ee comms: bump atlasbot image 2026-01-31 21:40:11 -03:00
flux-bot 867172600f chore(maintenance): automated image update 2026-02-01 00:39:49 +00:00
flux-bot d915a2a9d7 chore(comms): automated image update 2026-02-01 00:39:45 +00:00
4e92168abf comms: disable atlasbot queue for tests 2026-01-31 18:21:39 -03:00
ddf51be7aa comms: bump atlasbot to 0.1.0-48 2026-01-31 18:14:55 -03:00
flux-bot 20447bb33b chore(comms): automated image update 2026-01-31 21:14:27 +00:00
5863e09ec8 atlasbot: make node counts explicit 2026-01-31 16:44:50 -03:00
0cf33797ae atlasbot: prioritize high-priority subquestions 2026-01-31 16:38:54 -03:00
71fbd8a6c1 atlasbot: expand chunk summaries 2026-01-31 16:35:02 -03:00
d1f06aeb1b atlasbot: enable debug pipeline logging 2026-01-31 16:30:05 -03:00
flux-bot ce39b3ae98 chore(comms): automated image update 2026-01-31 19:29:18 +00:00
6021f2f283 atlasbot: bump to 0.1.0-43 2026-01-31 14:24:13 -03:00
flux-bot 4629548f99 chore(comms): automated image update 2026-01-31 17:21:08 +00:00
1e0a2628b6 atlasbot: bump image to 0.1.0-42 2026-01-31 14:15:41 -03:00
flux-bot 8594b736ec chore(comms): automated image update 2026-01-31 17:15:07 +00:00
flux-bot 9df8ae2d38 chore(maintenance): automated image update 2026-01-31 16:42:06 +00:00
flux-bot d890b09dcd chore(maintenance): automated image update 2026-01-31 16:39:06 +00:00
080b94aa8b atlasbot: bump image to 0.1.0-41 2026-01-31 13:26:44 -03:00
flux-bot d7070647aa chore(comms): automated image update 2026-01-31 16:25:03 +00:00
flux-bot 3b321c5320 chore(comms): automated image update 2026-01-31 11:08:36 +00:00
3e2699758f atlasbot: bump image to 0.1.0-40 2026-01-31 08:08:21 -03:00
2c4ad486a5 comms: fix atlasbot image indentation 2026-01-31 07:17:58 -03:00
flux-bot 85f749a481 chore(comms): automated image update 2026-01-31 10:12:32 +00:00
75d3946276 atlasbot: bump image to 0.1.0-39 2026-01-31 07:11:56 -03:00
e95ea959bb atlasbot: bump image to 0.1.0-38 2026-01-31 06:18:58 -03:00
flux-bot 63bf109acb chore(comms): automated image update 2026-01-31 09:18:28 +00:00
982b401a8c maintenance: add soteria service 2026-01-31 03:35:39 -03:00
f4684092be atlasbot: bump image to 0.1.0-37 2026-01-31 03:20:44 -03:00
flux-bot 97f0f0b508 chore(comms): automated image update 2026-01-31 06:20:12 +00:00
f1f0543885 ariadne: add alertmanager url 2026-01-30 21:57:05 -03:00
flux-bot 241c305034 chore(maintenance): automated image update 2026-01-31 00:54:47 +00:00
flux-bot 444cd3a04b chore(maintenance): automated image update 2026-01-31 00:40:46 +00:00
flux-bot f11329ad02 chore(maintenance): automated image update 2026-01-30 23:54:41 +00:00
af055bacb4 comms: suspend mas-local-users-ensure 2026-01-30 17:46:46 -03:00
da08a2687d comms: bump mas-local-users-ensure job 2026-01-30 17:44:42 -03:00
e733ee64da comms: bump comms-secrets-ensure job 2026-01-30 17:42:28 -03:00
3ef7eed9a9 comms: run mas-local-users-ensure job (retry) 2026-01-30 17:37:42 -03:00
6a11b0fcfb comms: suspend mas-local-users-ensure job 2026-01-30 17:33:55 -03:00
c4f404df23 comms: run mas-local-users-ensure job 2026-01-30 17:29:29 -03:00
flux-bot ae2f7cadeb chore(maintenance): automated image update 2026-01-30 20:18:24 +00:00
3c63722a4b comms: add atlas-genius bot 2026-01-30 17:17:59 -03:00
flux-bot 86c859bc14 chore(comms): automated image update 2026-01-30 20:07:20 +00:00
flux-bot 7d0537f2a2 chore(bstein-dev-home): automated image update 2026-01-30 20:05:30 +00:00
flux-bot a8424251aa chore(bstein-dev-home): automated image update 2026-01-30 20:02:30 +00:00
flux-bot 180f814520 chore(comms): automated image update 2026-01-30 19:53:19 +00:00
flux-bot 0c3d2f918d chore(comms): automated image update 2026-01-30 19:42:22 +00:00
flux-bot 20b3193cb8 chore(maintenance): automated image update 2026-01-30 13:21:48 +00:00
a851e184ca atlasbot: support quick/smart Matrix accounts 2026-01-30 10:21:07 -03:00
flux-bot a5d4c63cd3 chore(maintenance): automated image update 2026-01-30 05:19:07 +00:00
flux-bot 27a8a701ef chore(maintenance): automated image update 2026-01-30 03:15:56 +00:00
flux-bot 0ee2b8b059 chore(comms): automated image update 2026-01-29 23:54:42 +00:00
flux-bot beefc5ab31 chore(maintenance): automated image update 2026-01-29 19:56:19 +00:00
92d557ed98 comms: bump atlasbot to 0.1.0-32 2026-01-29 16:51:43 -03:00
flux-bot ed9a84aaa6 chore(comms): automated image update 2026-01-29 19:50:22 +00:00
f5b6c7ed97 comms: bump atlasbot to 0.1.0-31 2026-01-29 16:09:15 -03:00
flux-bot 22c90e301e chore(comms): automated image update 2026-01-29 19:08:18 +00:00
fadf071b31 comms: bump atlasbot to 0.1.0-30 2026-01-29 14:56:59 -03:00
flux-bot 5c69ea62a2 chore(comms): automated image update 2026-01-29 17:55:12 +00:00
04550a116d comms: bump atlasbot 0.1.0-29 2026-01-29 14:18:51 -03:00
flux-bot 05005ac676 chore(maintenance): automated image update 2026-01-29 16:43:04 +00:00
215edef09d sso: suspend execute-actions email test job 2026-01-29 13:41:41 -03:00
f2b0f76d15 sso: send execute-actions email to robotuser 2026-01-29 13:40:45 -03:00
flux-bot f3f80ee114 chore(maintenance): automated image update 2026-01-29 16:35:03 +00:00
9affb59632 comms: bump atlasbot to 0.1.0-28 2026-01-29 13:33:39 -03:00
25c0202743 sso: suspend keycloak oneoff jobs 2026-01-29 13:30:10 -03:00
bc9ee3138a sso: rerun execute-actions email test 2026-01-29 13:28:32 -03:00
8ee96e02c4 sso: set keycloak smtp to postmark 2026-01-29 13:27:28 -03:00
b476f757f4 sso: rerun execute-actions email test 2026-01-29 13:23:59 -03:00
d5cbc823c6 sso: run keycloak execute-actions email test 2026-01-29 13:21:40 -03:00
70048e1081 sso: suspend realm settings job 2026-01-29 13:20:11 -03:00
7b5ac0fbb5 sso: rerun keycloak realm settings 2026-01-29 13:10:31 -03:00
96b1ceeb97 monitoring: stabilize alert queries 2026-01-29 13:07:55 -03:00
flux-bot 64782af56c chore(maintenance): automated image update 2026-01-29 16:07:01 +00:00
ce3332503b atlasbot: bump to 0.1.0-27 2026-01-29 13:06:37 -03:00
8351945510 atlasbot: align to installed qwen model 2026-01-29 10:25:57 -03:00
7152829271 atlasbot: align models and bump image 2026-01-29 10:17:38 -03:00
flux-bot b68b778532 chore(comms): automated image update 2026-01-29 13:16:50 +00:00
flux-bot 61354ef81b chore(maintenance): automated image update 2026-01-29 13:16:46 +00:00
flux-bot 939e5cff89 chore(comms): automated image update 2026-01-29 12:23:45 +00:00
flux-bot c630a399ac chore(comms): automated image update 2026-01-29 11:47:42 +00:00
flux-bot 79999c959b chore(maintenance): automated image update 2026-01-29 11:43:38 +00:00
flux-bot 7f7a0e1a52 chore(comms): automated image update 2026-01-29 11:42:41 +00:00
flux-bot 8e572bf910 chore(maintenance): automated image update 2026-01-29 10:45:37 +00:00
97779b655d atlasbot: bump image and allow longhorn read 2026-01-29 07:45:24 -03:00
flux-bot 046cc509d9 chore(comms): automated image update 2026-01-29 10:44:37 +00:00
flux-bot 254cc446af chore(comms): automated image update 2026-01-29 09:21:30 +00:00
flux-bot a8636be9a1 chore(comms): automated image update 2026-01-29 09:16:59 +00:00
ad4cf69498 atlasbot: bump image tag 2026-01-29 06:16:43 -03:00
flux-bot c7fc5fd411 chore(maintenance): automated image update 2026-01-29 09:12:26 +00:00
2828b19cf7 maintenance(ariadne): allow apps/events, bump image tag 2026-01-29 06:09:36 -03:00
flux-bot fbb1df323b chore(maintenance): automated image update 2026-01-29 09:01:41 +00:00
flux-bot 150036626b chore(comms): automated image update 2026-01-29 08:01:25 +00:00
91e6d5740d vault: allow kubernetes auth login 2026-01-29 02:22:51 -03:00
flux-bot a108590d7a chore(maintenance): automated image update 2026-01-29 04:58:20 +00:00
flux-bot 22c53aebdd chore(comms): automated image update 2026-01-29 04:58:10 +00:00
flux-bot 8325797b3c chore(comms): automated image update 2026-01-29 04:56:21 +00:00
flux-bot 5498e7d91b chore(maintenance): automated image update 2026-01-29 04:56:05 +00:00
bc817a3c6b images: bump ariadne and atlasbot 2026-01-29 01:55:07 -03:00
flux-bot a39bc77344 chore(comms): automated image update 2026-01-29 01:35:52 +00:00
flux-bot 8ff45092cb chore(maintenance): automated image update 2026-01-29 01:35:03 +00:00
flux-bot a8677f36c6 chore(maintenance): automated image update 2026-01-28 23:47:54 +00:00
flux-bot 6391633484 chore(comms): automated image update 2026-01-28 23:46:43 +00:00
flux-bot cacf4427e9 chore(maintenance): automated image update 2026-01-28 23:43:54 +00:00
flux-bot 9a5e20bd5c chore(maintenance): automated image update 2026-01-28 23:36:54 +00:00
flux-bot aec7fa43d6 chore(maintenance): automated image update 2026-01-28 23:31:53 +00:00
flux-bot 74cd39d183 chore(comms): automated image update 2026-01-28 22:59:41 +00:00
d286d169a0 monitoring: fix GPU share attribution 2026-01-28 19:08:53 -03:00
454c7fbd20 monitoring: de-dupe ariadne schedule alert 2026-01-28 18:45:15 -03:00
2eca794ccc comms: suspend synapse admin ensure job 2026-01-28 18:39:28 -03:00
9787c19fac maintenance: restart ariadne after synapse token update 2026-01-28 18:37:49 -03:00
1aceaed741 comms: force admin token to use othrys-seeder 2026-01-28 18:35:28 -03:00
9662d36ad3 comms: fix vault_put indentation 2026-01-28 18:31:48 -03:00
caa6b73336 comms: source admin token from seeder access tokens 2026-01-28 18:29:49 -03:00
b2c7ca8cf1 comms: mint synapse admin token with syt_ prefix 2026-01-28 18:20:37 -03:00
22670e4730 comms: rerun synapse admin ensure with device 2026-01-28 18:17:24 -03:00
87c6e085a4 comms: ensure synapse device for admin token 2026-01-28 18:10:55 -03:00
bb0acd4f60 maintenance: restart ariadne after synapse token 2026-01-28 17:59:25 -03:00
171356a351 comms: rotate invalid synapse admin token 2026-01-28 17:57:39 -03:00
250fe22288 comms: rerun synapse admin ensure job 2026-01-28 17:54:53 -03:00
433f3ef3d6 comms: run synapse admin ensure job 2026-01-28 17:50:01 -03:00
1d248bf91a comms: use bundled synapse admin ensure image 2026-01-28 17:47:58 -03:00
06cda3f540 maintenance: restart ariadne to reload secrets 2026-01-28 17:31:25 -03:00
55aa8eb0bb comms: suspend synapse admin ensure job 2026-01-28 17:29:07 -03:00
5e24ec17c9 comms: fix synapse admin ensure vault login 2026-01-28 17:27:39 -03:00
5cf843cb6a comms: rebuild synapse admin ensure job 2026-01-28 17:25:34 -03:00
flux-bot ee6a6fae8d chore(maintenance): automated image update 2026-01-28 20:21:37 +00:00
baaa5dc79f comms: run synapse admin ensure job 2026-01-28 17:19:55 -03:00
ebf0e4faaa maintenance: track ariadne latest image 2026-01-28 14:04:58 -03:00
a20f3cc0ce monitoring: avoid ariadne alert title conflict 2026-01-28 14:02:12 -03:00
78c72a71a2 monitoring: disable legacy cron alert 2026-01-28 13:58:28 -03:00
26e35ffbaf monitoring: restart grafana to reload alerts 2026-01-28 13:53:33 -03:00
fb2feec1b5 monitoring: reuse maint-cron uid for ariadne alert 2026-01-28 13:52:12 -03:00
f94926a387 monitoring: restart grafana for ariadne alerts 2026-01-28 13:49:41 -03:00
flux-bot ded56ccf89 chore(comms): automated image update 2026-01-28 16:49:09 +00:00
44788b3132 monitoring: alert on ariadne schedules 2026-01-28 13:47:54 -03:00
flux-bot f913956d08 chore(maintenance): automated image update 2026-01-28 16:47:19 +00:00
cead8d3561 comms: fix MAS job indentation 2026-01-28 13:25:51 -03:00
88a4e93194 comms: tolerate MAS login rate limits 2026-01-28 13:23:25 -03:00
976b5994dd comms: stop seeding atlas bots in synapse job 2026-01-28 13:18:44 -03:00
1186722bb5 comms: inject quick/smart bot creds for MAS job 2026-01-28 13:12:02 -03:00
flux-bot 1f660f9dd5 chore(bstein-dev-home): automated image update 2026-01-28 16:07:02 +00:00
flux-bot 1c480edb47 chore(bstein-dev-home): automated image update 2026-01-28 16:06:02 +00:00
55d01ee539 comms: add atlas quick/smart bots 2026-01-28 13:01:09 -03:00
flux-bot d56438fe06 chore(comms): automated image update 2026-01-28 15:59:05 +00:00
cfa2b4c08b monitoring: fix grafana pod annotation indent 2026-01-28 12:37:42 -03:00
flux-bot 47bce5483c chore(comms): automated image update 2026-01-28 15:35:02 +00:00
ee3543e70d monitoring: restart grafana after alert fix 2026-01-28 12:32:56 -03:00
flux-bot a97b58bf2d chore(comms): automated image update 2026-01-28 15:32:23 +00:00
780cdd6c1b monitoring: fix postmark alert metrics 2026-01-28 12:31:33 -03:00
flux-bot 5cec8cbe32 chore(comms): automated image update 2026-01-28 15:22:49 +00:00
d9e34582ec feat: add nats platform kustomization 2026-01-28 12:15:39 -03:00
2d0e1fab34 chore: move flux sync to feature/atlasbot 2026-01-28 12:12:23 -03:00
6350a07cc5 comms: allow atlasbot to pull harbor images 2026-01-28 11:54:11 -03:00
c4ecc07e58 atlasbot: move to service image and add nats queue infra 2026-01-28 11:52:37 -03:00
76 changed files with 6991 additions and 200 deletions

.gitignore
@@ -2,6 +2,7 @@
 !README.md
 !knowledge/**/*.md
 !services/comms/knowledge/**/*.md
+!services/atlasbot/knowledge/**/*.md
 __pycache__/
 *.py[cod]
 .pytest_cache

clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
@@ -0,0 +1,26 @@
+# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageUpdateAutomation
+metadata:
+  name: atlasbot
+  namespace: ai
+spec:
+  interval: 1m0s
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  git:
+    checkout:
+      ref:
+        branch: feature/atlasbot
+    commit:
+      author:
+        email: ops@bstein.dev
+        name: flux-bot
+      messageTemplate: "chore(atlasbot): automated image update"
+    push:
+      branch: feature/atlasbot
+  update:
+    strategy: Setters
+    path: services/atlasbot

clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
@@ -0,0 +1,17 @@
+# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: atlasbot
+  namespace: flux-system
+spec:
+  interval: 10m
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  path: ./services/atlasbot
+  targetNamespace: ai
+  timeout: 2m
+  dependsOn:
+    - name: ai-llm

@@ -13,14 +13,14 @@ spec:
   git:
     checkout:
       ref:
-        branch: feature/ariadne
+        branch: feature/atlasbot
     commit:
       author:
         email: ops@bstein.dev
         name: flux-bot
       messageTemplate: "chore(bstein-dev-home): automated image update"
     push:
-      branch: feature/ariadne
+      branch: feature/atlasbot
   update:
     strategy: Setters
    path: services/bstein-dev-home

clusters/atlas/flux-system/applications/comms/image-automation.yaml
@@ -0,0 +1,26 @@
+# clusters/atlas/flux-system/applications/comms/image-automation.yaml
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageUpdateAutomation
+metadata:
+  name: comms
+  namespace: comms
+spec:
+  interval: 1m0s
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  git:
+    checkout:
+      ref:
+        branch: feature/atlasbot
+    commit:
+      author:
+        email: ops@bstein.dev
+        name: flux-bot
+      messageTemplate: "chore(comms): automated image update"
+    push:
+      branch: feature/atlasbot
+  update:
+    strategy: Setters
+    path: services/comms

@@ -6,6 +6,9 @@ resources:
   - vault/kustomization.yaml
   - vaultwarden/kustomization.yaml
   - comms/kustomization.yaml
+  - comms/image-automation.yaml
+  - atlasbot/kustomization.yaml
+  - atlasbot/image-automation.yaml
   - crypto/kustomization.yaml
   - monerod/kustomization.yaml
   - pegasus/kustomization.yaml

@@ -9,7 +9,7 @@ metadata:
 spec:
   interval: 1m0s
   ref:
-    branch: feature/ariadne
+    branch: feature/atlasbot
   secretRef:
     name: flux-system-gitea
   url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

@@ -16,5 +16,6 @@ resources:
   - longhorn/kustomization.yaml
   - longhorn-ui/kustomization.yaml
   - postgres/kustomization.yaml
+  - nats/kustomization.yaml
   - ../platform/vault-csi/kustomization.yaml
   - ../platform/vault-injector/kustomization.yaml

@@ -13,14 +13,14 @@ spec:
   git:
     checkout:
       ref:
-        branch: feature/ariadne
+        branch: feature/atlasbot
     commit:
       author:
         email: ops@bstein.dev
         name: flux-bot
       messageTemplate: "chore(maintenance): automated image update"
     push:
-      branch: feature/ariadne
+      branch: feature/atlasbot
   update:
     strategy: Setters
     path: services/maintenance

clusters/atlas/flux-system/platform/nats/kustomization.yaml
@@ -0,0 +1,21 @@
+# clusters/atlas/flux-system/platform/nats/kustomization.yaml
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: nats
+  namespace: flux-system
+spec:
+  interval: 10m
+  path: ./infrastructure/nats
+  prune: true
+  force: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  targetNamespace: nats
+  healthChecks:
+    - apiVersion: apps/v1
+      kind: StatefulSet
+      name: nats
+      namespace: nats
+  wait: true

@@ -0,0 +1,3 @@
+FROM python:3.11-slim
+RUN pip install --no-cache-dir psycopg2-binary bcrypt
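
The two baked-in packages line up with the mas-local-users-ensure jobs in the commit list above: bcrypt to hash bot passwords, psycopg2 to write them into the matrix-authentication-service Postgres database. A minimal sketch of that pattern follows; the DSN, table, and column names here are assumptions for illustration, not the job's actual schema:

import bcrypt
import psycopg2

# Hypothetical DSN; the real job gets its credentials from Vault-injected env vars.
conn = psycopg2.connect("postgresql://mas:CHANGE_ME@postgres:5432/mas")
hashed = bcrypt.hashpw(b"example-password", bcrypt.gensalt()).decode()
with conn, conn.cursor() as cur:
    # Hypothetical table/columns chosen for illustration only.
    cur.execute(
        "UPDATE user_passwords SET hashed_password = %s WHERE username = %s",
        (hashed, "atlas-quick"),
    )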

@@ -6,6 +6,7 @@ resources:
   - ../modules/profiles/atlas-ha
   - coredns-custom.yaml
   - coredns-deployment.yaml
+  - longhorn-node-taints.yaml
   - ntp-sync-daemonset.yaml
   - ../sources/cert-manager/letsencrypt.yaml
   - ../sources/cert-manager/letsencrypt-prod.yaml

infrastructure/core/longhorn-node-taints.yaml
@@ -0,0 +1,40 @@
+# infrastructure/core/longhorn-node-taints.yaml
+apiVersion: v1
+kind: Node
+metadata:
+  name: titan-13
+spec:
+  taints:
+    - key: longhorn
+      value: "true"
+      effect: PreferNoSchedule
+---
+apiVersion: v1
+kind: Node
+metadata:
+  name: titan-15
+spec:
+  taints:
+    - key: longhorn
+      value: "true"
+      effect: PreferNoSchedule
+---
+apiVersion: v1
+kind: Node
+metadata:
+  name: titan-17
+spec:
+  taints:
+    - key: longhorn
+      value: "true"
+      effect: PreferNoSchedule
+---
+apiVersion: v1
+kind: Node
+metadata:
+  name: titan-19
+spec:
+  taints:
+    - key: longhorn
+      value: "true"
+      effect: PreferNoSchedule

infrastructure/longhorn/core/backup-target.yaml
@@ -0,0 +1,10 @@
+# infrastructure/longhorn/core/backup-target.yaml
+apiVersion: longhorn.io/v1beta2
+kind: BackupTarget
+metadata:
+  name: default
+  namespace: longhorn-system
+spec:
+  backupTargetURL: "s3://atlas-soteria@us-west-004/"
+  credentialSecret: longhorn-backup-b2
+  pollInterval: 5m0s
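
The s3://atlas-soteria@us-west-004/ URL follows Longhorn's bucket@region convention for S3-compatible targets (here a Backblaze B2 region, per the "add b2 backup target" commit); the endpoint and keys are not inlined but come from the longhorn-backup-b2 secret (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ENDPOINTS), which the SecretProviderClass change below materializes from Vault.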

@@ -6,6 +6,7 @@ resources:
   - vault-serviceaccount.yaml
   - secretproviderclass.yaml
   - vault-sync-deployment.yaml
+  - backup-target.yaml
   - helmrelease.yaml
   - longhorn-settings-ensure-job.yaml

@@ -13,9 +13,27 @@ spec:
       - objectName: "harbor-pull__dockerconfigjson"
         secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
+      - objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_ACCESS_KEY_ID"
+      - objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_SECRET_ACCESS_KEY"
+      - objectName: "longhorn_backup__AWS_ENDPOINTS"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_ENDPOINTS"
   secretObjects:
     - secretName: longhorn-registry
       type: kubernetes.io/dockerconfigjson
       data:
         - objectName: harbor-pull__dockerconfigjson
           key: .dockerconfigjson
+    - secretName: longhorn-backup-b2
+      type: Opaque
+      data:
+        - objectName: longhorn_backup__AWS_ACCESS_KEY_ID
+          key: AWS_ACCESS_KEY_ID
+        - objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
+          key: AWS_SECRET_ACCESS_KEY
+        - objectName: longhorn_backup__AWS_ENDPOINTS
+          key: AWS_ENDPOINTS

@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nats-config
+  namespace: nats
+  labels:
+    app: nats
+    component: config
+  annotations:
+    description: "NATS JetStream configuration"
+data:
+  nats.conf: |
+    jetstream {
+      store_dir: /data
+      max_mem_store: 128MB
+      max_file_store: 1GB
+    }

@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - namespace.yaml
+  - configmap.yaml
+  - service.yaml
+  - statefulset.yaml

@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: nats

@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: nats
+  namespace: nats
+  labels:
+    app: nats
+spec:
+  selector:
+    app: nats
+  ports:
+    - name: client
+      port: 4222
+      targetPort: 4222
+    - name: monitoring
+      port: 8222
+      targetPort: 8222

@@ -0,0 +1,54 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: nats
+  namespace: nats
+  labels:
+    app: nats
+spec:
+  serviceName: nats
+  replicas: 1
+  selector:
+    matchLabels:
+      app: nats
+  template:
+    metadata:
+      labels:
+        app: nats
+    spec:
+      containers:
+        - name: nats
+          image: nats:2.10.18
+          args:
+            - "-c"
+            - "/etc/nats/nats.conf"
+          ports:
+            - name: client
+              containerPort: 4222
+            - name: monitoring
+              containerPort: 8222
+          volumeMounts:
+            - name: config
+              mountPath: /etc/nats
+            - name: data
+              mountPath: /data
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+      volumes:
+        - name: config
+          configMap:
+            name: nats-config
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 2Gi
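
With the Flux Kustomization's health check gating on this StatefulSet, the monitoring port it exposes gives a quick smoke test that JetStream came up with the ConfigMap's limits. A small in-cluster sketch using only the stdlib (the /healthz and /jsz routes are standard NATS monitoring endpoints):

import json
from urllib.request import urlopen

BASE = "http://nats.nats.svc.cluster.local:8222"  # the 'monitoring' port above

print(urlopen(f"{BASE}/healthz").read().decode())  # {"status":"ok"} when healthy

# /jsz reports JetStream state, including the configured limits
# (128MB memory / 1GB file store from nats.conf).
jsz = json.load(urlopen(f"{BASE}/jsz"))
print(jsz["config"]["max_memory"], jsz["config"]["max_storage"])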

@@ -47,6 +47,7 @@ PERCENT_THRESHOLDS = {
 }
 NAMESPACE_CPU_WINDOW = "1m"
+GPU_RESOURCE_REGEX = r"nvidia[.]com/gpu.*|nvidia_com_gpu.*"

 # ---------------------------------------------------------------------------
 # Cluster metadata
@@ -235,13 +236,16 @@ def gpu_util_by_hostname():

 def gpu_node_labels():
-    return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
+    return (
+        f'(max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0))'
+        ' or kube_node_labels{label_jetson="true"}'
+    )

 def gpu_requests_by_namespace_node(scope_var):
     return (
         "sum by (namespace,node) ("
-        f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
+        f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
         "* on(namespace,pod) group_left(node) kube_pod_info "
         f"* on(node) group_left() ({gpu_node_labels()})"
         ")"
@@ -253,7 +257,7 @@ def gpu_usage_by_namespace(scope_var):
     total_by_node = f"sum by (node) ({requests_by_ns})"
     return (
         "sum by (namespace) ("
-        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
+        f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
         f"* on(node) group_left() ({gpu_util_by_node()})"
         ")"
     )
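
The net effect of this query change (the "fix GPU share attribution" commit in the list above): each namespace's GPU requests on a node are divided by that node's total requests, then scaled by that node's utilization, with on(node) group_left() making both the division and the multiplication join on the node label alone. The same arithmetic in plain Python, with made-up numbers:

# Illustrative inputs: GPU requests per (namespace, node) and per-node utilization %.
requests = {("ai", "titan-24"): 1.0, ("jellyfin", "titan-24"): 1.0, ("ai", "titan-26"): 2.0}
node_util = {"titan-24": 80.0, "titan-26": 50.0}

totals: dict[str, float] = {}
for (_, node), req in requests.items():
    totals[node] = totals.get(node, 0.0) + req

usage: dict[str, float] = {}
for (ns, node), req in requests.items():
    # share of the node's requested GPUs, scaled by that node's utilization
    usage[ns] = usage.get(ns, 0.0) + req / max(totals[node], 1.0) * node_util[node]

print(usage)  # {'ai': 90.0, 'jellyfin': 40.0}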

@@ -539,9 +539,9 @@ def main() -> int:
         help="Write generated files (otherwise just print a summary).",
     )
     ap.add_argument(
-        "--sync-comms",
+        "--sync-atlasbot",
         action="store_true",
-        help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
+        help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
     )
     args = ap.parse_args()
@@ -632,10 +632,10 @@ def main() -> int:
     print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
-    if args.sync_comms:
-        comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
-        _sync_tree(out_dir, comms_dir)
-        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
+    if args.sync_atlasbot:
+        atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
+        _sync_tree(out_dir, atlasbot_dir)
+        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
     return 0

@@ -3,7 +3,7 @@ apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: atlasbot
-  namespace: comms
+  namespace: ai
   labels:
     app: atlasbot
 spec:
@@ -18,7 +18,7 @@ spec:
       annotations:
         checksum/atlasbot-configmap: manual-atlasbot-101
         vault.hashicorp.com/agent-inject: "true"
-        vault.hashicorp.com/role: "comms"
+        vault.hashicorp.com/role: "ai"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
         vault.hashicorp.com/agent-inject-template-turn-secret: |
           {{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
@@ -28,6 +28,15 @@ spec:
         vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
         vault.hashicorp.com/agent-inject-template-bot-pass: |
           {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
         vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
         vault.hashicorp.com/agent-inject-template-seeder-pass: |
           {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@@ -58,17 +67,17 @@ spec:
         hardware: rpi5
       containers:
         - name: atlasbot
-          image: python:3.11-slim
+          image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
          command: ["/bin/sh","-c"]
          args:
            - |
-              . /vault/scripts/comms_vault_env.sh
-              exec python /app/bot.py
+              . /vault/scripts/atlasbot_vault_env.sh
+              exec python -m atlasbot.main
          env:
            - name: MATRIX_BASE
-              value: http://othrys-synapse-matrix-synapse:8008
+              value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
            - name: AUTH_BASE
-              value: http://matrix-authentication-service:8080
+              value: http://matrix-authentication-service.comms.svc.cluster.local:8080
            - name: KB_DIR
              value: /kb
            - name: VM_URL
@@ -76,27 +85,61 @@ spec:
            - name: ARIADNE_STATE_URL
              value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
            - name: BOT_USER
-              value: atlasbot
+              value: atlas-smart
+            - name: BOT_USER_QUICK
+              value: atlas-quick
+            - name: BOT_USER_SMART
+              value: atlas-smart
+            - name: BOT_USER_GENIUS
+              value: atlas-genius
            - name: BOT_MENTIONS
-              value: atlasbot,aatlasbot,atlas_quick,atlas_smart
+              value: atlas-quick,atlas-smart,atlas-genius
            - name: OLLAMA_URL
              value: http://ollama.ai.svc.cluster.local:11434
            - name: OLLAMA_MODEL
-              value: qwen2.5:14b-instruct
+              value: qwen2.5:14b-instruct-q4_0
            - name: ATLASBOT_MODEL_FAST
              value: qwen2.5:14b-instruct-q4_0
-            - name: ATLASBOT_MODEL_DEEP
-              value: qwen2.5:14b-instruct
+            - name: ATLASBOT_MODEL_SMART
+              value: qwen2.5:14b-instruct-q4_0
+            - name: ATLASBOT_MODEL_GENIUS
+              value: qwen2.5:14b-instruct-q4_0
            - name: OLLAMA_FALLBACK_MODEL
              value: qwen2.5:14b-instruct-q4_0
            - name: OLLAMA_TIMEOUT_SEC
              value: "600"
            - name: ATLASBOT_THINKING_INTERVAL_SEC
-              value: "120"
+              value: "30"
            - name: ATLASBOT_SNAPSHOT_TTL_SEC
              value: "30"
            - name: ATLASBOT_HTTP_PORT
              value: "8090"
+            - name: ATLASBOT_STATE_DB
+              value: /data/atlasbot_state.db
+            - name: ATLASBOT_QUEUE_ENABLED
+              value: "false"
+            - name: ATLASBOT_DEBUG_PIPELINE
+              value: "true"
+            - name: ATLASBOT_NATS_URL
+              value: nats://nats.nats.svc.cluster.local:4222
+            - name: ATLASBOT_NATS_STREAM
+              value: atlasbot
+            - name: ATLASBOT_NATS_SUBJECT
+              value: atlasbot.requests
+            - name: ATLASBOT_FAST_MAX_ANGLES
+              value: "2"
+            - name: ATLASBOT_SMART_MAX_ANGLES
+              value: "5"
+            - name: ATLASBOT_FAST_MAX_CANDIDATES
+              value: "2"
+            - name: ATLASBOT_SMART_MAX_CANDIDATES
+              value: "6"
+            - name: ATLASBOT_FAST_LLM_CALLS_MAX
+              value: "24"
+            - name: ATLASBOT_SMART_LLM_CALLS_MAX
+              value: "48"
+            - name: ATLASBOT_GENIUS_LLM_CALLS_MAX
+              value: "96"
          ports:
            - name: http
              containerPort: 8090
@@ -108,19 +151,15 @@ spec:
              cpu: 500m
              memory: 512Mi
          volumeMounts:
-            - name: code
-              mountPath: /app/bot.py
-              subPath: bot.py
            - name: kb
              mountPath: /kb
              readOnly: true
            - name: vault-scripts
              mountPath: /vault/scripts
              readOnly: true
+            - name: atlasbot-state
+              mountPath: /data
      volumes:
-        - name: code
-          configMap:
-            name: atlasbot
        - name: kb
          configMap:
            name: atlas-kb
@@ -139,5 +178,7 @@ spec:
              path: diagrams/atlas-http.mmd
        - name: vault-scripts
          configMap:
-            name: comms-vault-env
+            name: atlasbot-vault-env
          defaultMode: 0555
+        - name: atlasbot-state
+          emptyDir: {}
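
The new ATLASBOT_NATS_* variables point the bot at the JetStream StatefulSet added earlier, though ATLASBOT_QUEUE_ENABLED ships as "false" here (matching the "disable queue for testing" commits). When the queue is on, the enqueue side plausibly looks like this sketch with the nats-py client; both the library choice and the payload shape are assumptions:

import asyncio
import json
import os

import nats  # nats-py; assumed to be the client library in use

async def enqueue(question: str) -> None:
    nc = await nats.connect(os.environ["ATLASBOT_NATS_URL"])
    js = nc.jetstream()
    subject = os.environ.get("ATLASBOT_NATS_SUBJECT", "atlasbot.requests")
    # Creating the stream succeeds as a no-op if it already exists with the same config.
    await js.add_stream(
        name=os.environ.get("ATLASBOT_NATS_STREAM", "atlasbot"),
        subjects=[subject],
    )
    await js.publish(subject, json.dumps({"question": question}).encode())
    await nc.close()

asyncio.run(enqueue("which nodes run xmrig?"))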

@@ -3,7 +3,9 @@ apiVersion: v1
 kind: ServiceAccount
 metadata:
   name: atlasbot
-  namespace: comms
+  namespace: ai
+imagePullSecrets:
+  - name: harbor-regcred
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
@@ -43,5 +45,4 @@ roleRef:
 subjects:
   - kind: ServiceAccount
     name: atlasbot
-    namespace: comms
+    namespace: ai

@@ -2,7 +2,7 @@ apiVersion: v1
 kind: Service
 metadata:
   name: atlasbot
-  namespace: comms
+  namespace: ai
   labels:
     app: atlasbot
 spec:

services/atlasbot/image-automation.yaml
@@ -0,0 +1,26 @@
+# services/atlasbot/image-automation.yaml
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageUpdateAutomation
+metadata:
+  name: atlasbot
+  namespace: ai
+spec:
+  interval: 1m0s
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  git:
+    checkout:
+      ref:
+        branch: feature/atlasbot
+    commit:
+      author:
+        name: flux-bot
+        email: ops@bstein.dev
+      messageTemplate: "chore(atlasbot): automated image update"
+    push:
+      branch: feature/atlasbot
+  update:
+    path: services/atlasbot
+    strategy: Setters

services/comms/image.yaml
@@ -0,0 +1,23 @@
+# services/comms/image.yaml
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImageRepository
+metadata:
+  name: atlasbot
+  namespace: ai
+spec:
+  image: registry.bstein.dev/bstein/atlasbot
+  interval: 1m0s
+  secretRef:
+    name: harbor-regcred
+---
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImagePolicy
+metadata:
+  name: atlasbot
+  namespace: ai
+spec:
+  imageRepositoryRef:
+    name: atlasbot
+  policy:
+    semver:
+      range: ">=0.1.0-0"
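
This ImageRepository/ImagePolicy pair is what the Setters strategy in the ImageUpdateAutomation objects resolves against: Flux rewrites any manifest line under services/atlasbot carrying a setter marker comment, conventionally of the form # {"$imagepolicy": "ai:atlasbot"} next to the image tag, then commits the bump back to feature/atlasbot as one of the chore(atlasbot) messages filling the list above. The semver range >=0.1.0-0 has a prerelease floor, which is why every 0.1.0-NN build tag pushed from Jenkins gets promoted.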

@@ -0,0 +1,22 @@
+Atlas Knowledge Base (KB)
+
+This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
+- Accurate (grounded in GitOps + read-only cluster tools)
+- Maintainable (small docs + deterministic generators)
+- Safe (no secrets; refer to Secret/Vault paths by name only)
+
+Layout
+- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
+- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
+- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
+
+Regeneration
+- Update manifests/docs, then regenerate generated artifacts:
+  - `python scripts/knowledge_render_atlas.py --write`
+
+Authoring rules
+- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
+- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
+- Keep each runbook small; one topic per file; use headings.
+- When in doubt, link to the exact file path in this repo that configures the behavior.
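
Given the --sync-atlasbot flag added to the generator in the scripts diff above, the full refresh for the bot's bundled knowledge copy is presumably the one-liner:
- `python scripts/knowledge_render_atlas.py --write --sync-atlasbot`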

@@ -0,0 +1,8 @@
+{
+  "counts": {
+    "helmrelease_host_hints": 19,
+    "http_endpoints": 45,
+    "services": 47,
+    "workloads": 74
+  }
+}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

@@ -0,0 +1,234 @@
flowchart LR
host_auth_bstein_dev["auth.bstein.dev"]
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
host_auth_bstein_dev --> svc_sso_oauth2_proxy
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
host_bstein_dev["bstein.dev"]
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
host_bstein_dev --> svc_comms_matrix_wellknown
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
wl_comms_element_call["comms/element-call (Deployment)"]
svc_comms_element_call --> wl_comms_element_call
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
host_ci_bstein_dev["ci.bstein.dev"]
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
host_ci_bstein_dev --> svc_jenkins_jenkins
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
svc_jenkins_jenkins --> wl_jenkins_jenkins
host_cloud_bstein_dev["cloud.bstein.dev"]
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
svc_comms_livekit["comms/livekit (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
host_mail_bstein_dev["mail.bstein.dev"]
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
svc_nextcloud_collabora --> wl_nextcloud_collabora
host_pegasus_bstein_dev["pegasus.bstein.dev"]
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
host_scm_bstein_dev["scm.bstein.dev"]
svc_gitea_gitea["gitea/gitea (Service)"]
host_scm_bstein_dev --> svc_gitea_gitea
wl_gitea_gitea["gitea/gitea (Deployment)"]
svc_gitea_gitea --> wl_gitea_gitea
host_secret_bstein_dev["secret.bstein.dev"]
svc_vault_vault["vault/vault (Service)"]
host_secret_bstein_dev --> svc_vault_vault
wl_vault_vault["vault/vault (StatefulSet)"]
svc_vault_vault --> wl_vault_vault
host_sso_bstein_dev["sso.bstein.dev"]
svc_sso_keycloak["sso/keycloak (Service)"]
host_sso_bstein_dev --> svc_sso_keycloak
wl_sso_keycloak["sso/keycloak (Deployment)"]
svc_sso_keycloak --> wl_sso_keycloak
host_stream_bstein_dev["stream.bstein.dev"]
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
subgraph bstein_dev_home[bstein-dev-home]
svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend
svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend
svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway
end
subgraph comms[comms]
svc_comms_matrix_wellknown
wl_comms_matrix_wellknown
svc_comms_element_call
wl_comms_element_call
svc_comms_livekit_token_service
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
svc_jellyfin_jellyfin
wl_jellyfin_jellyfin
end
subgraph jenkins[jenkins]
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
end
subgraph mailu_mailserver[mailu-mailserver]
svc_mailu_mailserver_mailu_front
end
subgraph nextcloud[nextcloud]
svc_nextcloud_nextcloud
wl_nextcloud_nextcloud
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy
svc_sso_keycloak
wl_sso_keycloak
end
subgraph vault[vault]
svc_vault_vault
wl_vault_vault
end
subgraph vaultwarden[vaultwarden]
svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden
end

View File

@@ -0,0 +1,29 @@
# services/atlasbot/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
  - atlasbot-deployment.yaml
  - atlasbot-service.yaml
  - atlasbot-rbac.yaml
  - secretproviderclass.yaml
  - vault-sync-deployment.yaml
  - image.yaml
  - image-automation.yaml
images:
  - name: registry.bstein.dev/bstein/atlasbot
    newTag: 0.1.2-97 # {"$imagepolicy": "ai:atlasbot:tag"}
configMapGenerator:
  - name: atlasbot-vault-env
    files:
      - atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
    options:
      disableNameSuffixHash: true
  - name: atlas-kb
    files:
      - INDEX.md=knowledge/INDEX.md
      - atlas.json=knowledge/catalog/atlas.json
      - atlas-summary.json=knowledge/catalog/atlas-summary.json
      - metrics.json=knowledge/catalog/metrics.json
      - runbooks.json=knowledge/catalog/runbooks.json
      - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd
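Before Flux picks this overlay up, it can be rendered locally to confirm the generated ConfigMaps and the pinned tag (a minimal sketch; assumes a checkout containing services/atlasbot and kustomize on PATH):

# render the overlay; the images: transform should stamp 0.1.2-97 onto the atlasbot Deployment
kustomize build services/atlasbot | grep -E 'registry.bstein.dev/bstein/atlasbot|name: atlas'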

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env sh
set -eu

vault_dir="/vault/secrets"

read_secret() {
  tr -d '\r\n' < "${vault_dir}/$1"
}

read_optional() {
  if [ -f "${vault_dir}/$1" ]; then
    tr -d '\r\n' < "${vault_dir}/$1"
  else
    printf ''
  fi
}

export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
  export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
  export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)"
export CHAT_API_KEY="$(read_secret chat-matrix)"
export CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"
export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"
export PGPASSWORD="$(read_secret synapse-db-pass)"
export MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
export MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
export KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"
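The script is written to be sourced rather than executed, so the exports land in the caller's shell before the bot starts (sketch; the mount path for the ConfigMap is illustrative, not taken from the manifests):

. /config/atlasbot_vault_env.sh   # reads the /vault/secrets/* files injected alongside it
exec python bot.py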

View File

@@ -0,0 +1,21 @@
# services/atlasbot/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
  name: atlasbot-vault
  namespace: ai
spec:
  provider: vault
  parameters:
    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
    roleName: "ai"
    objects: |
      - objectName: "harbor-pull__dockerconfigjson"
        secretPath: "kv/data/atlas/shared/harbor-pull"
        secretKey: "dockerconfigjson"
  secretObjects:
    - secretName: harbor-regcred
      type: kubernetes.io/dockerconfigjson
      data:
        - objectName: harbor-pull__dockerconfigjson
          key: .dockerconfigjson

View File

@@ -0,0 +1,34 @@
# services/atlasbot/vault-sync-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: atlasbot-vault-sync
  namespace: ai
spec:
  replicas: 1
  selector:
    matchLabels:
      app: atlasbot-vault-sync
  template:
    metadata:
      labels:
        app: atlasbot-vault-sync
    spec:
      serviceAccountName: atlasbot
      containers:
        - name: sync
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - "sleep infinity"
          volumeMounts:
            - name: vault-secrets
              mountPath: /vault/secrets
              readOnly: true
      volumes:
        - name: vault-secrets
          csi:
            driver: secrets-store.csi.k8s.io
            readOnly: true
            volumeAttributes:
              secretProviderClass: atlasbot-vault
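The secrets-store CSI driver only materializes secretObjects while at least one pod mounts the SecretProviderClass volume, which is the whole purpose of this sleep-infinity Deployment: it keeps harbor-regcred present in the ai namespace. A quick check (sketch):

kubectl -n ai get secret harbor-regcred -o jsonpath='{.type}'
# expected output: kubernetes.io/dockerconfigjson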

View File

@@ -68,7 +68,11 @@ spec:
         - name: AI_CHAT_TIMEOUT_SEC
           value: "480"
         - name: AI_ATLASBOT_ENDPOINT
-          value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
+          value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer
+        - name: AI_ATLASBOT_MODEL_FAST
+          value: qwen2.5:14b-instruct-q4_0
+        - name: AI_ATLASBOT_MODEL_SMART
+          value: qwen2.5:14b-instruct
         - name: AI_ATLASBOT_TIMEOUT_SEC
           value: "30"
         - name: AI_NODE_NAME

View File

@@ -20,9 +20,9 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

View File

@@ -13,10 +13,7 @@ resources:
   - element-call-deployment.yaml
   - guest-register-deployment.yaml
   - guest-register-service.yaml
-  - atlasbot-deployment.yaml
-  - atlasbot-service.yaml
   - wellknown.yaml
-  - atlasbot-rbac.yaml
   - mas-secrets-ensure-rbac.yaml
   - comms-secrets-ensure-rbac.yaml
   - mas-db-ensure-rbac.yaml
@@ -43,7 +40,6 @@ resources:
   - livekit-ingress.yaml
   - livekit-middlewares.yaml
   - matrix-ingress.yaml
 configMapGenerator:
   - name: comms-vault-env
     files:
@@ -60,21 +56,8 @@ configMapGenerator:
       - server.py=scripts/guest-register/server.py
     options:
       disableNameSuffixHash: true
-  - name: atlasbot
-    files:
-      - bot.py=scripts/atlasbot/bot.py
-    options:
-      disableNameSuffixHash: true
   - name: othrys-element-host-config
     files:
       - 20-host-config.sh=scripts/element-host-config.sh
     options:
       disableNameSuffixHash: true
-  - name: atlas-kb
-    files:
-      - INDEX.md=knowledge/INDEX.md
-      - atlas.json=knowledge/catalog/atlas.json
-      - atlas-summary.json=knowledge/catalog/atlas-summary.json
-      - metrics.json=knowledge/catalog/metrics.json
-      - runbooks.json=knowledge/catalog/runbooks.json
-      - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@@ -1,12 +1,12 @@
 # services/comms/oneoffs/comms-secrets-ensure-job.yaml
-# One-off job for comms/comms-secrets-ensure-7.
-# Purpose: comms secrets ensure 7 (see container args/env in this file).
+# One-off job for comms/comms-secrets-ensure-8.
+# Purpose: comms secrets ensure 8 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: comms-secrets-ensure-7
+  name: comms-secrets-ensure-8
   namespace: comms
 spec:
   suspend: true
@@ -87,6 +87,9 @@ spec:
           ensure_key "comms/synapse-redis" "redis-password" >/dev/null
           ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null
           ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null
+          ensure_key "comms/atlasbot-credentials-runtime" "bot-quick-password" >/dev/null
+          ensure_key "comms/atlasbot-credentials-runtime" "bot-smart-password" >/dev/null
+          ensure_key "comms/atlasbot-credentials-runtime" "bot-genius-password" >/dev/null
           ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null
           SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")"
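The run-then-resuspend flow described in the job header is Git-driven; a typical pass looks like this (sketch; the Flux Kustomization covering this tree is assumed to be named comms):

# 1. set spec.suspend: false in this file, commit, push
flux reconcile kustomization comms --with-source
kubectl -n comms wait --for=condition=complete job/comms-secrets-ensure-8 --timeout=10m
# 2. flip spec.suspend back to true and push again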

View File

@@ -1,12 +1,12 @@
 # services/comms/oneoffs/mas-local-users-ensure-job.yaml
-# One-off job for comms/mas-local-users-ensure-18.
+# One-off job for comms/mas-local-users-ensure-19.
 # Purpose: mas local users ensure 18 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: mas-local-users-ensure-18
+  name: mas-local-users-ensure-19
   namespace: comms
 spec:
   suspend: true
@@ -27,6 +27,12 @@ spec:
         vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
         vault.hashicorp.com/agent-inject-template-bot-pass: |
           {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
         vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
         vault.hashicorp.com/agent-inject-template-seeder-pass: |
           {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@@ -92,7 +98,13 @@ spec:
         - name: SEEDER_USER
           value: othrys-seeder
         - name: BOT_USER
-          value: atlasbot
+          value: atlas-smart
+        - name: BOT_USER_QUICK
+          value: atlas-quick
+        - name: BOT_USER_SMART
+          value: atlas-smart
+        - name: BOT_USER_GENIUS
+          value: atlas-genius
         command:
         - /bin/sh
         - -c
@@ -225,11 +237,27 @@ spec:
                   },
                   timeout=30,
               )
+              if r.status_code == 429:
+                  return False
               if r.status_code != 200:
                   raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
+              return True
           wait_for_service(MAS_ADMIN_API_BASE)
           token = admin_token()
+          bot_quick = os.environ.get("BOT_USER_QUICK", "")
+          bot_smart = os.environ.get("BOT_USER_SMART", "")
+          bot_genius = os.environ.get("BOT_USER_GENIUS", "")
+          bot_quick_pass = os.environ.get("BOT_PASS_QUICK", "")
+          bot_smart_pass = os.environ.get("BOT_PASS_SMART", "")
+          bot_genius_pass = os.environ.get("BOT_PASS_GENIUS", "") or bot_smart_pass
           ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
           ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
+          if bot_quick and bot_quick_pass:
+              ensure_user(token, bot_quick, bot_quick_pass)
+          if bot_smart and bot_smart_pass:
+              ensure_user(token, bot_smart, bot_smart_pass)
+          if bot_genius and bot_genius_pass:
+              ensure_user(token, bot_genius, bot_genius_pass)
           PY

View File

@@ -1,15 +1,15 @@
 # services/comms/oneoffs/synapse-admin-ensure-job.yaml
-# One-off job for comms/synapse-admin-ensure-3.
-# Purpose: synapse admin ensure 3 (see container args/env in this file).
+# One-off job for comms/synapse-admin-ensure-15.
+# Purpose: synapse admin ensure 15 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: synapse-admin-ensure-3
+  name: synapse-admin-ensure-15
   namespace: comms
 spec:
-  suspend: true
+  suspend: false
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
@@ -32,7 +32,8 @@ spec:
                 values: ["arm64"]
       containers:
       - name: ensure
-        image: python:3.11-slim
+        image: python:3.12-slim
+        imagePullPolicy: Always
         env:
         - name: VAULT_ADDR
           value: http://vault.vault.svc.cluster.local:8200
@@ -45,22 +46,20 @@ spec:
         - -c
         - |
           set -euo pipefail
-          pip install --no-cache-dir psycopg2-binary bcrypt
+          python -m pip install --no-cache-dir psycopg2-binary
           python - <<'PY'
          import json
          import os
-         import secrets
-         import string
-         import time
          import urllib.error
+         import urllib.parse
          import urllib.request
-         import bcrypt
          import psycopg2
          VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
          VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
          SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+         SYNAPSE_ADMIN_URL = os.environ.get("SYNAPSE_ADMIN_URL", "").rstrip("/")
          PGHOST = "postgres-service.postgres.svc.cluster.local"
          PGPORT = 5432
          PGDATABASE = "synapse"
@@ -113,48 +112,15 @@ spec:
              with urllib.request.urlopen(req, timeout=30) as resp:
                  resp.read()
-         def random_password(length: int = 32) -> str:
-             alphabet = string.ascii_letters + string.digits
-             return "".join(secrets.choice(alphabet) for _ in range(length))
          def ensure_admin_creds(token: str) -> dict:
              data = vault_get(token, "comms/synapse-admin")
-             username = (data.get("username") or "").strip() or "synapse-admin"
-             password = (data.get("password") or "").strip()
-             if not password:
-                 password = random_password()
-             data["username"] = username
-             data["password"] = password
-             vault_put(token, "comms/synapse-admin", data)
+             username = "othrys-seeder"
+             if data.get("username") != username:
+                 data["username"] = username
+                 data.pop("access_token", None)
+                 vault_put(token, "comms/synapse-admin", data)
              return data
-         def ensure_user(cur, cols, user_id, password, admin):
-             now_ms = int(time.time() * 1000)
-             values = {
-                 "name": user_id,
-                 "password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
-                 "creation_ts": now_ms,
-             }
-             def add_flag(name, flag):
-                 if name not in cols:
-                     return
-                 if cols[name]["type"] in ("smallint", "integer"):
-                     values[name] = int(flag)
-                 else:
-                     values[name] = bool(flag)
-             add_flag("admin", admin)
-             add_flag("deactivated", False)
-             add_flag("shadow_banned", False)
-             add_flag("is_guest", False)
-             columns = list(values.keys())
-             placeholders = ", ".join(["%s"] * len(columns))
-             updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
-             query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
-             cur.execute(query, [values[c] for c in columns])
          def get_cols(cur):
              cur.execute(
                  """
@@ -172,30 +138,40 @@ spec:
              }
              return cols
-         def ensure_access_token(cur, user_id, token_value):
-             cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens")
-             token_id = cur.fetchone()[0]
-             cur.execute(
-                 """
-                 INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms)
-                 VALUES (%s, %s, %s, %s, NULL)
-                 ON CONFLICT (token) DO NOTHING
-                 """,
-                 (token_id, user_id, token_value, "ariadne-admin"),
-             )
+         def admin_token_valid(token: str, user_id: str) -> bool:
+             if not token or not SYNAPSE_ADMIN_URL:
+                 return False
+             encoded = urllib.parse.quote(user_id, safe="")
+             url = f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v2/users/{encoded}"
+             req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
+             try:
+                 with urllib.request.urlopen(req, timeout=30) as resp:
+                     resp.read()
+                 return True
+             except urllib.error.HTTPError as exc:
+                 if exc.code == 404:
+                     return True
+                 if exc.code in (401, 403):
+                     return False
+                 raise
          vault_token = vault_login()
          admin_data = ensure_admin_creds(vault_token)
-         if admin_data.get("access_token"):
-             log("synapse admin token already present")
+         user_id = f"@{admin_data['username']}:live.bstein.dev"
+         existing_token = admin_data.get("access_token")
+         if existing_token and admin_token_valid(existing_token, user_id):
+             log("synapse admin token already present and valid")
              raise SystemExit(0)
+         if existing_token:
+             log("synapse admin token invalid; rotating")
+             admin_data.pop("access_token", None)
+             vault_put(vault_token, "comms/synapse-admin", admin_data)
          synapse_db = vault_get(vault_token, "comms/synapse-db")
          pg_password = synapse_db.get("POSTGRES_PASSWORD")
          if not pg_password:
              raise RuntimeError("synapse db password missing")
-         user_id = f"@{admin_data['username']}:live.bstein.dev"
          conn = psycopg2.connect(
              host=PGHOST,
              port=PGPORT,
@@ -203,17 +179,34 @@ spec:
              user=PGUSER,
              password=pg_password,
          )
-         token_value = secrets.token_urlsafe(32)
          try:
              with conn:
                  with conn.cursor() as cur:
                      cols = get_cols(cur)
-                     ensure_user(cur, cols, user_id, admin_data["password"], True)
-                     ensure_access_token(cur, user_id, token_value)
+                     if "admin" not in cols:
+                         raise RuntimeError("users.admin column missing")
+                     cur.execute(
+                         "UPDATE users SET admin = TRUE WHERE name = %s",
+                         (user_id,),
+                     )
+                     cur.execute(
+                         """
+                         SELECT token FROM access_tokens
+                         WHERE user_id = %s AND valid_until_ms IS NULL
+                         ORDER BY id DESC LIMIT 1
+                         """,
+                         (user_id,),
+                     )
+                     row = cur.fetchone()
+                     if not row:
+                         raise RuntimeError(f"no access token found for {user_id}")
+                     token_value = row[0]
          finally:
              conn.close()
          admin_data["access_token"] = token_value
          vault_put(vault_token, "comms/synapse-admin", admin_data)
+         if not admin_token_valid(token_value, user_id):
+             raise RuntimeError("synapse admin token validation failed")
          log("synapse admin token stored")
          PY
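The new admin_token_valid() check boils down to one Synapse admin API call, so a stored token can be probed the same way from a debug shell (sketch; the in-cluster URL mirrors the MATRIX_BASE default used elsewhere in this changeset):

curl -s -o /dev/null -w '%{http_code}\n' \
  -H "Authorization: Bearer ${TOKEN}" \
  'http://othrys-synapse-matrix-synapse:8008/_synapse/admin/v2/users/%40othrys-seeder%3Alive.bstein.dev'
# 200 (or 404) means the token authenticates; 401/403 means it needs rotating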

View File

@@ -82,8 +82,6 @@ spec:
           value: synapse
         - name: SEEDER_USER
           value: othrys-seeder
-        - name: BOT_USER
-          value: atlasbot
         command:
         - /bin/sh
         - -c
@@ -141,10 +139,8 @@ spec:
              cur.execute(query, [values[c] for c in columns])
          seeder_user = os.environ["SEEDER_USER"]
-         bot_user = os.environ["BOT_USER"]
          server = "live.bstein.dev"
          seeder_id = f"@{seeder_user}:{server}"
-         bot_id = f"@{bot_user}:{server}"
          conn = psycopg2.connect(
              host=os.environ["PGHOST"],
@@ -158,7 +154,6 @@ spec:
              with conn.cursor() as cur:
                  cols = get_cols(cur)
                  upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True)
-                 upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False)
          finally:
              conn.close()
          PY

View File

@@ -76,7 +76,7 @@ spec:
         - name: SEEDER_USER
           value: othrys-seeder
         - name: BOT_USER
-          value: atlasbot
+          value: atlas-smart
         command:
         - /bin/sh
         - -c

View File

@@ -11,8 +11,12 @@ from urllib import error, parse, request
 BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
 AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
-USER = os.environ["BOT_USER"]
-PASSWORD = os.environ["BOT_PASS"]
+BOT_USER = os.environ["BOT_USER"]
+BOT_PASS = os.environ["BOT_PASS"]
+BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
+BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
+BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
+BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
 ROOM_ALIAS = "#othrys:live.bstein.dev"
 OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
@@ -31,7 +35,7 @@ VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitor
 ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
 ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")
-BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
+BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
 SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
 MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
@@ -393,6 +397,31 @@ def _detect_mode_from_body(body: str, *, default: str = "deep") -> str:
     return default
+def _detect_mode(
+    content: dict[str, Any],
+    body: str,
+    *,
+    default: str = "deep",
+    account_user: str = "",
+) -> str:
+    mode = _detect_mode_from_body(body, default=default)
+    mentions = content.get("m.mentions", {})
+    user_ids = mentions.get("user_ids", [])
+    if isinstance(user_ids, list):
+        normalized = {normalize_user_id(uid).lower() for uid in user_ids if isinstance(uid, str)}
+        if BOT_USER_QUICK and normalize_user_id(BOT_USER_QUICK).lower() in normalized:
+            return "fast"
+        if BOT_USER_SMART and normalize_user_id(BOT_USER_SMART).lower() in normalized:
+            return "deep"
+        if BOT_USER and normalize_user_id(BOT_USER).lower() in normalized:
+            return "deep"
+    if account_user and BOT_USER_QUICK and normalize_user_id(account_user) == normalize_user_id(BOT_USER_QUICK):
+        return "fast"
+    if account_user and BOT_USER_SMART and normalize_user_id(account_user) == normalize_user_id(BOT_USER_SMART):
+        return "deep"
+    return mode
 def _model_for_mode(mode: str) -> str:
     if mode == "fast" and MODEL_FAST:
         return MODEL_FAST
@@ -416,12 +445,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
         raw = resp.read()
     return json.loads(raw.decode()) if raw else {}
-def login() -> str:
-    login_user = normalize_user_id(USER)
+def login(user: str, password: str) -> str:
+    login_user = normalize_user_id(user)
     payload = {
         "type": "m.login.password",
         "identifier": {"type": "m.id.user", "user": login_user},
-        "password": PASSWORD,
+        "password": password,
     }
     res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
     return res["access_token"]
@@ -4820,7 +4849,7 @@ def open_ended_with_thinking(
         thread.join(timeout=1)
     return result["reply"] or "Model backend is busy. Try again in a moment."
-def sync_loop(token: str, room_id: str):
+def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
     since = None
     try:
         res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
@@ -4861,7 +4890,7 @@ def sync_loop(token: str, room_id: str):
                 if not body:
                     continue
                 sender = ev.get("sender", "")
-                if sender == f"@{USER}:live.bstein.dev":
+                if account_user and sender == normalize_user_id(account_user):
                     continue
                 mentioned = is_mentioned(content, body)
@@ -4874,7 +4903,12 @@ def sync_loop(token: str, room_id: str):
                 cleaned_body = _strip_bot_mention(body)
                 lower_body = cleaned_body.lower()
-                mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep")
+                mode = _detect_mode(
+                    content,
+                    body,
+                    default=default_mode if default_mode in ("fast", "deep") else "deep",
+                    account_user=account_user,
+                )
                 # Only do live cluster introspection in DMs.
                 allow_tools = is_dm
@@ -4951,26 +4985,65 @@ def sync_loop(token: str, room_id: str):
                 history[hist_key].append(f"Atlas: {reply}")
                 history[hist_key] = history[hist_key][-80:]
-def login_with_retry():
+def login_with_retry(user: str, password: str):
     last_err = None
     for attempt in range(10):
         try:
-            return login()
+            return login(user, password)
         except Exception as exc:  # noqa: BLE001
             last_err = exc
             time.sleep(min(30, 2 ** attempt))
     raise last_err
+def _bot_accounts() -> list[dict[str, str]]:
+    accounts: list[dict[str, str]] = []
+    def add(user: str, password: str, mode: str):
+        if not user or not password:
+            return
+        accounts.append({"user": user, "password": password, "mode": mode})
+    add(BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "deep")
+    if BOT_USER_QUICK and BOT_PASS_QUICK:
+        add(BOT_USER_QUICK, BOT_PASS_QUICK, "fast")
+    if BOT_USER and BOT_PASS and all(acc["user"] != BOT_USER for acc in accounts):
+        add(BOT_USER, BOT_PASS, "deep")
+    seen: set[str] = set()
+    unique: list[dict[str, str]] = []
+    for acc in accounts:
+        uid = normalize_user_id(acc["user"]).lower()
+        if uid in seen:
+            continue
+        seen.add(uid)
+        unique.append(acc)
+    return unique
 def main():
     load_kb()
     _start_http_server()
-    token = login_with_retry()
-    try:
-        room_id = resolve_alias(token, ROOM_ALIAS)
-        join_room(token, room_id)
-    except Exception:
-        room_id = None
-    sync_loop(token, room_id)
+    accounts = _bot_accounts()
+    threads: list[threading.Thread] = []
+    for acc in accounts:
+        token = login_with_retry(acc["user"], acc["password"])
+        try:
+            room_id = resolve_alias(token, ROOM_ALIAS)
+            join_room(token, room_id)
+        except Exception:
+            room_id = None
+        thread = threading.Thread(
+            target=sync_loop,
+            args=(token, room_id),
+            kwargs={
+                "account_user": acc["user"],
+                "default_mode": acc["mode"],
+            },
+            daemon=True,
+        )
+        thread.start()
+        threads.append(thread)
+    for thread in threads:
+        thread.join()
 if __name__ == "__main__":
     main()

View File

@@ -7,6 +7,14 @@ read_secret() {
   tr -d '\r\n' < "${vault_dir}/$1"
 }
+read_optional() {
+  if [ -f "${vault_dir}/$1" ]; then
+    tr -d '\r\n' < "${vault_dir}/$1"
+  else
+    printf ''
+  fi
+}
 export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
 export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
@@ -14,6 +22,15 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
 export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
 export BOT_PASS="$(read_secret bot-pass)"
+export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
+export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
+export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
+if [ -z "${BOT_PASS_SMART}" ]; then
+  export BOT_PASS_SMART="${BOT_PASS}"
+fi
+if [ -z "${BOT_PASS_GENIUS}" ]; then
+  export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
+fi
 export SEEDER_PASS="$(read_secret seeder-pass)"
 export CHAT_API_KEY="$(read_secret chat-matrix)"

View File

@@ -66,7 +66,7 @@ spec:
         - name: SEEDER_USER
           value: othrys-seeder
         - name: BOT_USER
-          value: atlasbot
+          value: atlas-smart
         command:
         - /bin/sh
         - -c

View File

@@ -29,12 +29,18 @@ spec:
                 operator: In
                 values: ["rpi4","rpi5"]
           preferredDuringSchedulingIgnoredDuringExecution:
-          - weight: 50
+          - weight: 80
             preference:
               matchExpressions:
               - key: hardware
                 operator: In
-                values: ["rpi4"]
+                values: ["rpi5"]
+          - weight: 60
+            preference:
+              matchExpressions:
+              - key: kubernetes.io/hostname
+                operator: NotIn
+                values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
       containers:
       - name: monerod
         image: registry.bstein.dev/crypto/monerod:0.18.4.1
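Both the required rpi4/rpi5 term and the reweighted preferences key off the custom hardware node label, so placement only behaves as intended when workers carry it (sketch; the node name is illustrative):

kubectl label node titan-21 hardware=rpi5 --overwrite
kubectl get nodes -L hardware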

View File

@@ -23,7 +23,7 @@ spec:
             - matchExpressions:
               - key: hardware
                 operator: In
-                values: ["rpi4","rpi5"]
+                values: ["rpi5"]
       containers:
       - name: xmrig
         image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9

View File

@@ -123,13 +123,22 @@ spec:
               - key: hardware
                 operator: In
                 values: ["rpi4","rpi5"]
+              - key: longhorn
+                operator: NotIn
+                values: ["true"]
           preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            preference:
+              matchExpressions:
+              - key: kubernetes.io/hostname
+                operator: NotIn
+                values: ["titan-13","titan-15","titan-17","titan-19"]
           - weight: 50
             preference:
               matchExpressions:
               - key: hardware
                 operator: In
-                values: ["rpi4"]
+                values: ["rpi5"]
       containers:
       - name: gitea
         image: gitea/gitea:1.23

View File

@@ -245,6 +245,17 @@ spec:
       image:
         repository: registry.bstein.dev/infra/harbor-registry
         tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
+      extraEnvVars:
+      - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
+        value: harbor-core
+      - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
+        value: http://harbor-registry:8080/service/notifications
+      - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
+        value: 5s
+      - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
+        value: "5"
+      - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
+        value: 1s
     controller:
       image:
         repository: registry.bstein.dev/infra/harbor-registryctl
@@ -263,6 +274,10 @@ spec:
           export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
           export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
           {{ end }}
+          {{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
+          export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
+          export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
+          {{ end }}
         vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
         vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
           {{ with secret "kv/data/atlas/harbor/harbor-core" }}
@@ -397,10 +412,10 @@ spec:
       patch: |-
         - op: replace
           path: /spec/rules/0/http/paths/2/backend/service/name
-          value: harbor-registry
+          value: harbor-core
         - op: replace
           path: /spec/rules/0/http/paths/2/backend/service/port/number
-          value: 5000
+          value: 80
     - target:
         kind: Deployment
         name: harbor-jobservice
@@ -464,6 +479,16 @@ spec:
           value: /vault/secrets/harbor-registry-env.sh
         - name: VAULT_COPY_FILES
           value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
+        - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
+          value: harbor-core
+        - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
+          value: http://harbor-registry:8080/service/notifications
+        - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
+          value: 5s
+        - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
+          value: "5"
+        - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
+          value: 1s
         envFrom:
         - $patch: replace
         volumeMounts:

View File

@@ -67,7 +67,7 @@ data:
                 url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
                 credentials('gitea-pat')
               }
-              branches('*/master')
+              branches('*/main')
             }
           }
         }
@@ -108,7 +108,7 @@ data:
                 url('https://scm.bstein.dev/bstein/ci-demo.git')
                 credentials('gitea-pat')
               }
-              branches('*/master')
+              branches('*/main')
             }
           }
           scriptPath('Jenkinsfile')
@@ -167,6 +167,58 @@ data:
           }
         }
       }
+    pipelineJob('atlasbot') {
+      properties {
+        pipelineTriggers {
+          triggers {
+            scmTrigger {
+              scmpoll_spec('H/2 * * * *')
+              ignorePostCommitHooks(false)
+            }
+          }
+        }
+      }
+      definition {
+        cpsScm {
+          scm {
+            git {
+              remote {
+                url('https://scm.bstein.dev/bstein/atlasbot.git')
+                credentials('gitea-pat')
+              }
+              branches('*/main')
+            }
+          }
+          scriptPath('Jenkinsfile')
+        }
+      }
+    }
+    pipelineJob('Soteria') {
+      properties {
+        pipelineTriggers {
+          triggers {
+            scmTrigger {
+              scmpoll_spec('H/5 * * * *')
+              ignorePostCommitHooks(false)
+            }
+          }
+        }
+      }
+      definition {
+        cpsScm {
+          scm {
+            git {
+              remote {
+                url('https://scm.bstein.dev/bstein/soteria.git')
+                credentials('gitea-pat')
+              }
+              branches('*/main')
+            }
+          }
+          scriptPath('Jenkinsfile')
+        }
+      }
+    }
     pipelineJob('data-prepper') {
       properties {
         pipelineTriggers {

View File

@@ -48,7 +48,7 @@ spec:
             TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
             GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
             {{ end }}
-        bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
+        bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:

View File

@@ -0,0 +1,13 @@
# services/jenkins/dind-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: jenkins-dind-cache
  namespace: jenkins
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 30Gi
  storageClassName: astreae

View File

@@ -8,6 +8,7 @@ resources:
   - vault-serviceaccount.yaml
   - pvc.yaml
   - cache-pvc.yaml
+  - dind-pvc.yaml
   - plugins-pvc.yaml
   - configmap-jcasc.yaml
   - configmap-plugins.yaml

View File

@@ -1,12 +1,12 @@
 # services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
-# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14.
-# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file).
+# One-off job for sso/keycloak-portal-e2e-execute-actions-email-18.
+# Purpose: keycloak portal e2e execute actions email 18 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-portal-e2e-execute-actions-email-14
+  name: keycloak-portal-e2e-execute-actions-email-18
   namespace: sso
 spec:
   suspend: true
@@ -70,7 +70,7 @@ spec:
         - name: E2E_PROBE_USERNAME
           value: robotuser
         - name: E2E_PROBE_EMAIL
-          value: robotuser@bstein.dev
+          value: brad.stein+robot@gmail.com
         - name: EXECUTE_ACTIONS_CLIENT_ID
           value: bstein-dev-home
         - name: EXECUTE_ACTIONS_REDIRECT_URI

View File

@@ -1,12 +1,12 @@
 # services/keycloak/oneoffs/realm-settings-job.yaml
-# One-off job for sso/keycloak-realm-settings-36.
-# Purpose: keycloak realm settings 36 (see container args/env in this file).
+# One-off job for sso/keycloak-realm-settings-38.
+# Purpose: keycloak realm settings 38 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-realm-settings-36
+  name: keycloak-realm-settings-38
   namespace: sso
 spec:
   suspend: true
@@ -64,7 +64,7 @@ spec:
         - name: KEYCLOAK_REALM
           value: atlas
         - name: KEYCLOAK_SMTP_HOST
-          value: mail.bstein.dev
+          value: smtp.postmarkapp.com
         - name: KEYCLOAK_SMTP_PORT
           value: "587"
         - name: KEYCLOAK_SMTP_FROM

View File

@@ -18,6 +18,7 @@ spec:
         prometheus.io/scrape: "true"
         prometheus.io/port: "8080"
         prometheus.io/path: "/metrics"
+        maintenance.bstein.dev/restart-rev: "20260207-2"
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "maintenance"
         vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
@@ -105,7 +106,7 @@ spec:
         node-role.kubernetes.io/worker: "true"
       containers:
       - name: ariadne
-        image: registry.bstein.dev/bstein/ariadne:0.1.0-0
+        image: registry.bstein.dev/bstein/ariadne:latest
         imagePullPolicy: Always
         command: ["/bin/sh", "-c"]
         args:
@@ -285,7 +286,7 @@ spec:
         - name: ARIADNE_SCHEDULE_MAILU_SYNC
          value: "30 4 * * *"
        - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
-         value: "0 5 * * *"
+         value: "*/15 * * * *"
        - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
          value: "*/5 * * * *"
        - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@@ -293,11 +294,11 @@ spec:
        - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
          value: "0 * * * *"
        - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
-         value: "0 5 * * *"
+         value: "*/15 * * * *"
        - name: ARIADNE_SCHEDULE_WGER_ADMIN
          value: "15 3 * * *"
        - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
-         value: "0 6 * * *"
+         value: "*/15 * * * *"
        - name: ARIADNE_SCHEDULE_FIREFLY_CRON
          value: "0 3 * * *"
        - name: ARIADNE_SCHEDULE_POD_CLEANER
@@ -305,11 +306,11 @@ spec:
        - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
          value: "23 3 * * *"
        - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
-         value: "30 4 * * 0"
+         value: "30 4 * * *"
        - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
-         value: "0 * * * *"
+         value: "*/15 * * * *"
        - name: ARIADNE_SCHEDULE_VAULT_OIDC
-         value: "0 * * * *"
+         value: "*/15 * * * *"
        - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
          value: "*/5 * * * *"
        - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@@ -330,6 +331,8 @@ spec:
          value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
        - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
          value: "5"
+       - name: ARIADNE_ALERTMANAGER_URL
+         value: http://alertmanager.monitoring.svc.cluster.local
        - name: OPENSEARCH_URL
          value: http://opensearch-master.logging.svc.cluster.local:9200
        - name: OPENSEARCH_LIMIT_BYTES

View File

@@ -29,6 +29,29 @@ rules:
       - get
       - list
       - watch
+  - apiGroups: ["apps"]
+    resources:
+      - deployments
+      - statefulsets
+      - daemonsets
+    verbs:
+      - get
+      - list
+      - watch
+  - apiGroups: ["longhorn.io"]
+    resources:
+      - volumes
+    verbs:
+      - get
+      - list
+      - watch
+  - apiGroups: [""]
+    resources:
+      - events
+    verbs:
+      - get
+      - list
+      - watch
   - apiGroups: [""]
     resources:
       - pods/exec
@@ -56,3 +79,17 @@ roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: ariadne-job-spawner
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: ariadne-auth-delegator
+subjects:
+  - kind: ServiceAccount
+    name: ariadne
+    namespace: maintenance
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: system:auth-delegator
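The system:auth-delegator binding is what lets ariadne validate callers' bearer tokens through the TokenReview API; the grant can be verified without running the service (sketch):

kubectl auth can-i create tokenreviews.authentication.k8s.io \
  --as=system:serviceaccount:maintenance:ariadne
# expected output: yes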

View File

@@ -21,3 +21,26 @@ spec:
   policy:
     semver:
       range: ">=0.1.0-0"
+---
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImageRepository
+metadata:
+  name: soteria
+  namespace: maintenance
+spec:
+  image: registry.bstein.dev/bstein/soteria
+  interval: 1m0s
+  secretRef:
+    name: harbor-regcred
+---
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImagePolicy
+metadata:
+  name: soteria
+  namespace: maintenance
+spec:
+  imageRepositoryRef:
+    name: soteria
+  policy:
+    semver:
+      range: ">=0.1.0-0"
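After Flux reconciles these objects, the registry scan and the tag selected by the semver range can be inspected directly (sketch):

flux get image repository soteria -n maintenance
flux get image policy soteria -n maintenance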

View File

@@ -5,6 +5,7 @@ resources:
   - namespace.yaml
   - image.yaml
   - secretproviderclass.yaml
+  - soteria-configmap.yaml
   - vault-serviceaccount.yaml
   - vault-sync-deployment.yaml
   - ariadne-serviceaccount.yaml
@@ -13,9 +14,12 @@ resources:
   - k3s-traefik-cleanup-rbac.yaml
   - node-nofile-serviceaccount.yaml
   - pod-cleaner-rbac.yaml
+  - soteria-serviceaccount.yaml
+  - soteria-rbac.yaml
   - ariadne-deployment.yaml
   - oneoffs/ariadne-migrate-job.yaml
   - ariadne-service.yaml
+  - soteria-deployment.yaml
   - disable-k3s-traefik-daemonset.yaml
   - oneoffs/k3s-traefik-cleanup-job.yaml
   - node-nofile-daemonset.yaml
@@ -24,9 +28,12 @@ resources:
   - node-image-sweeper-serviceaccount.yaml
   - node-image-sweeper-daemonset.yaml
   - image-sweeper-cronjob.yaml
+  - soteria-service.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
+  - name: registry.bstein.dev/bstein/soteria
+    newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

View File

@@ -0,0 +1,10 @@
# services/maintenance/soteria-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: soteria
  namespace: maintenance
data:
  SOTERIA_BACKUP_DRIVER: "longhorn"
  SOTERIA_LONGHORN_URL: "http://longhorn-backend.longhorn-system.svc:9500"
  SOTERIA_LONGHORN_BACKUP_MODE: "incremental"

View File

@@ -0,0 +1,73 @@
# services/maintenance/soteria-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: soteria
  namespace: maintenance
spec:
  replicas: 1
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      app: soteria
  template:
    metadata:
      labels:
        app: soteria
    spec:
      serviceAccountName: soteria
      nodeSelector:
        kubernetes.io/arch: arm64
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 90
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi5"]
            - weight: 50
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi4"]
      containers:
        - name: soteria
          image: registry.bstein.dev/bstein/soteria:latest
          imagePullPolicy: Always
          ports:
            - name: http
              containerPort: 8080
          envFrom:
            - configMapRef:
                name: soteria
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 2
          readinessProbe:
            httpGet:
              path: /readyz
              port: http
            initialDelaySeconds: 2
            periodSeconds: 5
            timeoutSeconds: 2
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 65532
            capabilities:
              drop: ["ALL"]

View File

@@ -0,0 +1,22 @@
# services/maintenance/soteria-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: soteria
rules:
  - apiGroups: [""]
    resources: ["persistentvolumeclaims", "persistentvolumes"]
    verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: soteria
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: soteria
subjects:
  - kind: ServiceAccount
    name: soteria
    namespace: maintenance

View File

@@ -0,0 +1,14 @@
# services/maintenance/soteria-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: soteria
  namespace: maintenance
spec:
  type: ClusterIP
  selector:
    app: soteria
  ports:
    - name: http
      port: 80
      targetPort: http
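Since the Service maps port 80 onto the container's named http port (8080), the probe endpoints are reachable in-cluster by name (sketch; the curl image is illustrative):

kubectl -n maintenance run soteria-check --rm -it --restart=Never \
  --image=curlimages/curl -- -s http://soteria.maintenance.svc/healthz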

View File

@@ -0,0 +1,8 @@
# services/maintenance/soteria-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: soteria
  namespace: maintenance
imagePullSecrets:
  - name: harbor-regcred

View File

@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -89,7 +89,7 @@
       },
       "targets": [
         {
-          "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
+          "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
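Both GPU panels apply the same two-part change: the resource regex `nvidia[.]com/gpu.*|nvidia_com_gpu.*` now also matches the underscore-sanitized form of the resource name, and GPU nodes are detected from `kube_node_status_allocatable` instead of the `accelerator` node label. A minimal sketch to probe the new node gate in isolation, assuming `promtool` is installed and `$PROM` points at a reachable Prometheus (placeholder, not from the repo):

    PROM="${PROM:-http://localhost:9090}"
    # Nodes with a non-zero allocatable NVIDIA GPU resource, or Jetson-labelled nodes.
    promtool query instant "$PROM" \
      '(max by (node) (kube_node_status_allocatable{resource=~"nvidia[.]com/gpu.*|nvidia_com_gpu.*"} > bool 0)) or kube_node_labels{label_jetson="true"}'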

View File

@ -1901,7 +1901,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) 
group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -145,7 +145,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\") expr: avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m]) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{instance}}' legendFormat: '{{instance}}'
datasource: datasource:
type: prometheus type: prometheus
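Besides dropping the stray backslash escaping, the subquery form matters here: `avg_over_time` takes a range, so the `* 100` instant expression must be wrapped as `(...)[10m:1m]` inside the call, as in the corrected expr above. A standalone sketch, assuming `promtool` and the `$PROM` placeholder from earlier:

    # Smoothed per-instance CPU busy-percent over 10m at 1m resolution.
    promtool query instant "$PROM" \
      'avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m])'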
@ -286,8 +286,8 @@ data:
summary: "node-image-sweeper not fully ready" summary: "node-image-sweeper not fully ready"
labels: labels:
severity: warning severity: warning
- uid: maint-cron-stale - uid: maint-ariadne-image-sweeper-stale
title: "Maintenance CronJobs stale (>3h since success)" title: "Ariadne image sweeper stale (schedule >8d)"
condition: C condition: C
for: "5m" for: "5m"
data: data:
@ -297,10 +297,10 @@ data:
to: 0 to: 0
datasourceUid: atlas-vm datasourceUid: atlas-vm
model: model:
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0) expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
legendFormat: '{{cronjob}}' legendFormat: '{{task}}'
datasource: datasource:
type: prometheus type: prometheus
uid: atlas-vm uid: atlas-vm
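The staleness check now reads the sweeper's own success metric instead of kube-state-metrics CronJob status, so it keeps working after the job moved into the ariadne scheduler. A quick interactive look at what the rule evaluates (sketch; assumes `$PROM` and that ariadne exports this metric):

    # Seconds since the image sweeper last succeeded; the rule fires above 691200 (8 days).
    promtool query instant "$PROM" \
      'time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}'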
@ -321,17 +321,166 @@ data:
type: threshold type: threshold
conditions: conditions:
- evaluator: - evaluator:
params: [10800] params: [691200]
type: gt type: gt
operator: operator:
type: and type: and
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Maintenance cronjob stale >3h since last success" summary: "Ariadne image sweeper stale >8d since last success"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (legacy disabled)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: legacy
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: OK
annotations:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne schedule failed ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne scheduler behind for {{ $labels.task }}"
labels: labels:
severity: warning severity: warning
- orgId: 1 - orgId: 1
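The two new ariadne rules watch last-run status and scheduler lag, and the 691200 s threshold in the sweeper rule is 8 × 86400, matching the ">8d" title. Sketches of the firing conditions, assuming `$PROM`:

    echo $((8 * 86400))   # 691200, the stale threshold used above
    # Tasks whose last run failed (status < 1 trips ariadne-schedule-error).
    promtool query instant "$PROM" 'max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"}) < 1'
    # Tasks whose next run is more than 15m overdue (trips ariadne-scheduler-stalled).
    promtool query instant "$PROM" 'time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"} > 900'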
@ -352,7 +501,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"} expr: postmark_outbound_bounce_rate{window="1d"}
legendFormat: bounce 1d legendFormat: bounce 1d
datasource: datasource:
type: prometheus type: prometheus
@ -400,7 +549,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: POSTMARK_API_UP expr: min_over_time(max by (instance) (postmark_api_up)[5m:])
legendFormat: api up legendFormat: api up
datasource: datasource:
type: prometheus type: prometheus
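Beyond the lowercase metric renames (the Prometheus naming convention), the api-up expr is smoothed: `max by (instance)` collapses duplicate series, and `min_over_time` over the `[5m:]` subquery only reports 1 if every sample in the window was up, so a single missed scrape no longer flaps the alert. Sketch, assuming `$PROM`:

    promtool query instant "$PROM" 'min_over_time(max by (instance) (postmark_api_up)[5m:])'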

View File

@ -29,7 +29,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) 
group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -98,7 +98,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -1910,7 +1910,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) 
group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -286,7 +286,7 @@ spec:
podAnnotations: podAnnotations:
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring" vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "1" monitoring.bstein.dev/restart-rev: "4"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: | vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }} {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
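Bumping `restart-rev` changes the pod template, which forces a rolling restart so the vault-agent templates re-render against the current role. A hypothetical imperative equivalent (sketch only; the deployment name `grafana` in namespace `monitoring` is an assumption, and under GitOps you would only bump the annotation as above):

    kubectl -n monitoring patch deployment grafana --type merge \
      -p '{"spec":{"template":{"metadata":{"annotations":{"monitoring.bstein.dev/restart-rev":"4"}}}}}'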

View File

@ -43,6 +43,12 @@ spec:
value: /var/run/secrets/vault-token-reviewer/token value: /var/run/secrets/vault-token-reviewer/token
- name: VAULT_K8S_ROLE_TTL - name: VAULT_K8S_ROLE_TTL
value: 1h value: 1h
- name: VAULT_K8S_BOUND_AUDIENCES
value: "https://kubernetes.default.svc,https://kubernetes.default.svc.cluster.local,k3s"
- name: VAULT_K8S_ISSUER
value: https://kubernetes.default.svc.cluster.local
- name: VAULT_K8S_DISABLE_ISS_VALIDATION
value: "false"
volumeMounts: volumeMounts:
- name: k8s-auth-config-script - name: k8s-auth-config-script
mountPath: /scripts mountPath: /scripts
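Pinning the issuer and bound audiences makes Vault validate the claims k3s actually embeds in service-account JWTs. Before hard-coding `VAULT_K8S_ISSUER`, it is worth confirming what the API server reports via its OIDC discovery document (sketch; needs RBAC for the discovery endpoint, `jq` assumed available):

    kubectl get --raw /.well-known/openid-configuration | jq -r .issuer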

View File

@ -53,6 +53,8 @@ ensure_token
k8s_host="https://${KUBERNETES_SERVICE_HOST}:443" k8s_host="https://${KUBERNETES_SERVICE_HOST}:443"
k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)" k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)"
k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
k8s_issuer="${VAULT_K8S_ISSUER:-}"
disable_iss_validation="${VAULT_K8S_DISABLE_ISS_VALIDATION:-true}"
role_ttl="${VAULT_K8S_ROLE_TTL:-1h}" role_ttl="${VAULT_K8S_ROLE_TTL:-1h}"
token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}" token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}"
@ -68,11 +70,36 @@ if ! vault_cmd auth list -format=json | grep -q '"kubernetes/"'; then
vault_cmd auth enable kubernetes vault_cmd auth enable kubernetes
fi fi
ensure_default_policy_login() {
default_policy="$(vault_cmd policy read default)"
if printf '%s' "${default_policy}" | grep -q 'auth/kubernetes/login'; then
return
fi
log "updating default policy to allow kubernetes login"
default_policy="${default_policy}
path \"auth/kubernetes/login\" {
capabilities = [\"create\", \"update\"]
}
"
printf '%s\n' "${default_policy}" | vault_cmd policy write default -
}
log "configuring kubernetes auth" log "configuring kubernetes auth"
vault_cmd write auth/kubernetes/config \ if [ -n "${k8s_issuer}" ]; then
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}" \
issuer="${k8s_issuer}" \
disable_iss_validation="${disable_iss_validation}"
else
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \ token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \ kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}" kubernetes_ca_cert="${k8s_ca}"
fi
ensure_default_policy_login
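The script now writes the issuer-aware config only when `VAULT_K8S_ISSUER` is set, and extends the `default` policy so any authenticated token can re-login via `auth/kubernetes/login`. Read-back sketches, assuming a logged-in vault CLI:

    vault read auth/kubernetes/config                             # confirm issuer / disable_iss_validation landed
    vault policy read default | grep -n 'auth/kubernetes/login'   # confirm the login grant was appended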
write_raw_policy() { write_raw_policy() {
name="$1" name="$1"
@ -87,6 +114,7 @@ write_policy_and_role() {
service_accounts="$3" service_accounts="$3"
read_paths="$4" read_paths="$4"
write_paths="$5" write_paths="$5"
audiences="${VAULT_K8S_BOUND_AUDIENCES:-}"
policy_body="" policy_body=""
for path in ${read_paths}; do for path in ${read_paths}; do
@ -109,11 +137,42 @@ path \"kv/metadata/atlas/${path}\" {
} }
" "
done done
if [ "${role}" = "maintenance" ]; then
policy_body="${policy_body}
path \"sys/auth\" {
capabilities = [\"read\"]
}
path \"sys/auth/*\" {
capabilities = [\"create\", \"update\", \"read\", \"sudo\"]
}
path \"auth/kubernetes/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"auth/oidc/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"sys/policies/acl\" {
capabilities = [\"list\"]
}
path \"sys/policies/acl/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
"
fi
log "writing policy ${role}" log "writing policy ${role}"
printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" - printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" -
log "writing role ${role}" log "writing role ${role}"
if [ -n "${audiences}" ]; then
vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_audiences="${audiences}" \
bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \
policies="${role}" \
ttl="${role_ttl}"
return
fi
vault_cmd write "auth/kubernetes/role/${role}" \ vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_names="${service_accounts}" \ bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \ bound_service_account_namespaces="${namespace}" \
@ -218,6 +277,8 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
"nextcloud/* shared/keycloak-admin shared/postmark-relay" "" "nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "ai" "ai" "atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \ write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
"jenkins/* shared/harbor-pull" "" "jenkins/* shared/harbor-pull" ""
write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
@ -231,7 +292,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
write_policy_and_role "health" "health" "health-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \
"health/*" "" "health/*" ""
write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
"maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" "" "maintenance/ariadne-db maintenance/soteria-restic portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
write_policy_and_role "finance" "finance" "finance-vault" \ write_policy_and_role "finance" "finance" "finance-vault" \
"finance/* shared/postmark-relay" "" "finance/* shared/postmark-relay" ""
write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \