Compare commits

..

414 Commits

Author SHA1 Message Date
6e4cafa3df maintenance: harden metis recovery and fix harbor rollout 2026-03-31 14:51:49 -03:00
41021c472b maintenance/jenkins: align Metis ingress, sentinel push, and CI job 2026-03-31 14:21:53 -03:00
17afb0bb55 maintenance: add Metis service and sentinel manifests 2026-03-31 14:07:17 -03:00
1e0e73a28f monitoring: combine Ariadne and Metis tests 2026-03-31 13:54:04 -03:00
af01a620c3 monitoring: roll grafana to apply latest alert rules 2026-03-30 18:41:21 -03:00
0edc513e2e monitoring: raise rootfs warning threshold to 85 percent 2026-03-30 18:40:59 -03:00
3659c9c07b maintenance: unblock sweeper rollouts on degraded nodes 2026-03-30 18:39:05 -03:00
11d58dccb7 maintenance: run image sweeper periodically for sd safety 2026-03-30 18:36:25 -03:00
5bcff5f405 monitoring: tame email noise and harden postmark alerts 2026-03-30 18:32:22 -03:00
f5dcea860e atlasbot: wire context and timeout fallbacks 2026-03-30 16:55:19 -03:00
a1e90f4600 atlasbot: wire quick smart genius modes 2026-03-30 16:51:23 -03:00
f04f032721 longhorn: avoid webhook deadlock and forced image pulls 2026-03-30 10:16:42 -03:00
083999c84c comms: harden matrix auth ingress routes for MAS 2026-03-30 08:21:19 -03:00
dc62a84e2e flux: keep feature branch tracking until main push is available 2026-03-30 07:57:13 -03:00
31ffaedf2a flux: target main branch for sync and image automation 2026-03-30 07:48:47 -03:00
b2d1dc4e3f flux: return sync and image automation branches to master 2026-03-30 07:48:09 -03:00
flux-bot
271a941d89 chore(atlasbot): automated image update 2026-03-30 10:47:00 +00:00
flux-bot
fa30a2cade chore(atlasbot): automated image update 2026-03-30 07:10:35 +00:00
f71d0bc3f3 atlasbot: switch quick mode to 7b fast model 2026-03-30 04:07:08 -03:00
flux-bot
19a3207eac chore(atlasbot): automated image update 2026-03-30 07:04:35 +00:00
2d5107f7e2 bstein-dev-home: deploy backend image 0.1.1-123 2026-03-30 03:54:39 -03:00
a091ea75a3 atlasbot: deploy matrix timeout fix image 0.1.2-103 2026-03-30 03:51:30 -03:00
95dabf5df8 atlasbot: disable ollama retries for strict quick budgets 2026-03-30 03:50:59 -03:00
flux-bot
311cec8adf chore(bstein-dev-home): automated image update 2026-03-30 06:46:11 +00:00
flux-bot
b18e355412 chore(atlasbot): automated image update 2026-03-30 06:45:32 +00:00
flux-bot
80057210fc chore(bstein-dev-home): automated image update 2026-03-30 06:38:10 +00:00
flux-bot
7a1e99a95e chore(bstein-dev-home): automated image update 2026-03-30 06:34:10 +00:00
flux-bot
ace86ad736 chore(bstein-dev-home): automated image update 2026-03-30 06:29:09 +00:00
flux-bot
2a4deb6dd1 chore(atlasbot): automated image update 2026-03-30 06:25:30 +00:00
flux-bot
eee5456921 chore(atlasbot): automated image update 2026-03-30 05:55:27 +00:00
f86d3a4c00 atlasbot: cap quick runtime and expose genius model to portal 2026-03-30 02:53:06 -03:00
a6b77c68f0 maintenance: grant ariadne auth-delegator 2026-02-08 09:55:20 -03:00
9599b4c975 ariadne: use vault-admin role for vault config 2026-02-07 22:34:10 -03:00
df96c06fa2 ariadne: run image sweeper daily 2026-02-07 11:11:41 -03:00
e575e6cb1e gitea: prefer rpi5 nodes 2026-02-07 11:07:02 -03:00
flux-bot
bca66c5d71 chore(maintenance): automated image update 2026-02-07 13:56:49 +00:00
b2affe091d maintenance: align vault role env 2026-02-07 10:51:20 -03:00
flux-bot
6c7f2112c2 chore(atlasbot): automated image update 2026-02-07 13:50:38 +00:00
a4874163ec infra: bias gitea/monerod placement, bump synapse ensure job 2026-02-07 10:48:48 -03:00
079f8efbb9 comms: run synapse admin ensure (admin flag) 2026-02-07 10:30:34 -03:00
95228b75ab comms: ensure synapse admin flag; ariadne vault role 2026-02-07 10:28:55 -03:00
9e75bf0b42 ariadne: accelerate schedules for alert clearing 2026-02-07 03:23:42 -03:00
b2841985ef comms: re-suspend synapse admin job 2026-02-07 03:19:42 -03:00
9553995ba5 comms: run synapse admin ensure 2026-02-07 03:16:44 -03:00
e840777668 vault: allow maintenance auth sync 2026-02-07 03:13:53 -03:00
718a1ca312 crypto: run xmrig only on rpi5 2026-02-06 23:34:31 -03:00
55f0347b70 comms: suspend synapse admin ensure 2026-02-06 20:21:01 -03:00
f77e13b2cb comms: run synapse admin ensure with python image 2026-02-06 20:13:02 -03:00
fd2b10d00d comms: run synapse admin ensure 2026-02-06 20:01:38 -03:00
4209299a40 jenkins: add dind cache pvc 2026-02-06 20:00:01 -03:00
1804ff06c6 gitea: avoid longhorn nodes 2026-02-06 19:33:55 -03:00
4b5913827d maintenance: pivot soteria to longhorn 2026-02-06 18:38:29 -03:00
80548a2e82 longhorn: add b2 backup target 2026-02-06 18:28:37 -03:00
flux-bot
29756b1e62 chore(maintenance): automated image update 2026-02-06 21:27:42 +00:00
4bc91c40f6 maintenance: restore soteria job node selector 2026-02-06 04:19:36 -03:00
1260d18cdf maintenance: pin soteria jobs to titan-24 for backup 2026-02-06 04:15:58 -03:00
47efd0be06 maintenance: pin soteria jobs to arm64 workers 2026-02-06 04:10:55 -03:00
flux-bot
fa410c8f1e chore(maintenance): automated image update 2026-02-06 07:10:04 +00:00
0ed75718c2 maintenance: remove restic init job 2026-02-06 03:50:30 -03:00
50ff59a33b maintenance: add restic init job 2026-02-06 03:48:45 -03:00
flux-bot
9d9bcd1988 chore(maintenance): automated image update 2026-02-05 18:56:27 +00:00
flux-bot
c96749bab6 chore(maintenance): automated image update 2026-02-05 18:45:20 +00:00
5e239accbd maintenance: schedule soteria on rpi workers 2026-02-05 15:30:09 -03:00
flux-bot
c50298c8fe chore(bstein-dev-home): automated image update 2026-02-05 18:24:54 +00:00
flux-bot
3fcab34b7d chore(maintenance): automated image update 2026-02-05 18:24:44 +00:00
e223ef8e76 harbor: route registry traffic via core 2026-02-05 15:23:42 -03:00
7f72683242 harbor: wire registryctl notification auth 2026-02-05 15:17:54 -03:00
eeb8475848 harbor: fix registry notification URL 2026-02-05 15:00:43 -03:00
839b79696c harbor: restore registry notifications env 2026-02-05 14:50:53 -03:00
920f146efb harbor: enable registry notifications 2026-02-05 14:44:09 -03:00
flux-bot
c2c5474bc8 chore(atlasbot): automated image update 2026-02-05 17:38:26 +00:00
flux-bot
eab7ed5cff chore(maintenance): automated image update 2026-02-05 17:04:24 +00:00
flux-bot
22eb1a1159 chore(maintenance): automated image update 2026-02-05 16:32:49 +00:00
d7c1ecd098 maintenance: move soteria image to bstein 2026-02-05 13:12:03 -03:00
flux-bot
96288c9fdd chore(atlasbot): automated image update 2026-02-05 15:58:19 +00:00
flux-bot
a71bf7d9d5 chore(atlasbot): automated image update 2026-02-05 01:26:05 +00:00
533baa6d0c atlasbot: set genius model env 2026-02-04 19:39:43 -03:00
flux-bot
cee353e305 chore(atlasbot): automated image update 2026-02-04 22:15:47 +00:00
flux-bot
436d24ea70 chore(atlasbot): automated image update 2026-02-04 21:45:45 +00:00
flux-bot
6fb80e37e8 chore(atlasbot): automated image update 2026-02-04 21:39:45 +00:00
flux-bot
132e73100f chore(atlasbot): automated image update 2026-02-04 19:08:32 +00:00
flux-bot
fe8cc40903 chore(atlasbot): automated image update 2026-02-04 18:09:26 +00:00
flux-bot
947a43e630 chore(atlasbot): automated image update 2026-02-04 18:03:26 +00:00
flux-bot
31679b59f5 chore(atlasbot): automated image update 2026-02-04 17:56:26 +00:00
flux-bot
77b81e1e9a chore(atlasbot): automated image update 2026-02-04 17:49:23 +00:00
flux-bot
6523e45b3f chore(atlasbot): automated image update 2026-02-04 17:30:22 +00:00
flux-bot
49414c6cca chore(atlasbot): automated image update 2026-02-04 17:23:23 +00:00
flux-bot
6efa280e9d chore(atlasbot): automated image update 2026-02-04 17:20:23 +00:00
flux-bot
ff81cfdb82 chore(atlasbot): automated image update 2026-02-04 17:14:21 +00:00
flux-bot
c4b0250321 chore(atlasbot): automated image update 2026-02-04 17:07:21 +00:00
flux-bot
c1a8aa43d6 chore(atlasbot): automated image update 2026-02-04 17:00:21 +00:00
flux-bot
0275adb5b7 chore(atlasbot): automated image update 2026-02-04 16:53:20 +00:00
flux-bot
663143660b chore(atlasbot): automated image update 2026-02-04 16:45:19 +00:00
flux-bot
cb25cf7571 chore(atlasbot): automated image update 2026-02-04 16:39:18 +00:00
flux-bot
33127dde26 chore(atlasbot): automated image update 2026-02-04 14:03:05 +00:00
flux-bot
dc214cee79 chore(atlasbot): automated image update 2026-02-04 03:27:09 +00:00
flux-bot
4395986b0c chore(atlasbot): automated image update 2026-02-04 03:01:07 +00:00
flux-bot
fba7fe9029 chore(atlasbot): automated image update 2026-02-04 02:54:06 +00:00
flux-bot
8ecc8dd548 chore(atlasbot): automated image update 2026-02-04 02:42:05 +00:00
flux-bot
672a559e52 chore(atlasbot): automated image update 2026-02-04 02:30:04 +00:00
flux-bot
0dedf4083e chore(atlasbot): automated image update 2026-02-04 01:54:01 +00:00
flux-bot
bf8b99e365 chore(maintenance): automated image update 2026-02-04 01:51:59 +00:00
flux-bot
a33ad1c073 chore(atlasbot): automated image update 2026-02-04 01:27:59 +00:00
flux-bot
be90638fac chore(atlasbot): automated image update 2026-02-04 01:09:57 +00:00
flux-bot
3bc6d29f54 chore(atlasbot): automated image update 2026-02-04 00:55:56 +00:00
flux-bot
4e88c55e57 chore(atlasbot): automated image update 2026-02-04 00:42:56 +00:00
flux-bot
b8c94d5870 chore(atlasbot): automated image update 2026-02-04 00:37:55 +00:00
flux-bot
7f83d2f936 chore(atlasbot): automated image update 2026-02-04 00:34:55 +00:00
flux-bot
d42aa42d8a chore(atlasbot): automated image update 2026-02-04 00:19:53 +00:00
flux-bot
86f512fa1a chore(atlasbot): automated image update 2026-02-03 22:41:45 +00:00
flux-bot
16e2b19ea9 chore(atlasbot): automated image update 2026-02-03 22:06:41 +00:00
flux-bot
a1cb07c6d6 chore(atlasbot): automated image update 2026-02-03 20:18:32 +00:00
flux-bot
558d24ad6b chore(atlasbot): automated image update 2026-02-03 19:56:31 +00:00
flux-bot
160218a4ae chore(atlasbot): automated image update 2026-02-03 19:29:28 +00:00
flux-bot
2e361e620e chore(atlasbot): automated image update 2026-02-03 18:04:21 +00:00
flux-bot
fcd0ea9872 chore(atlasbot): automated image update 2026-02-03 17:53:20 +00:00
flux-bot
75826b0e5e chore(atlasbot): automated image update 2026-02-03 17:42:19 +00:00
flux-bot
71ddd03899 chore(atlasbot): automated image update 2026-02-03 17:34:18 +00:00
flux-bot
2d3a0b0184 chore(atlasbot): automated image update 2026-02-03 17:16:17 +00:00
flux-bot
c7fb848a62 chore(atlasbot): automated image update 2026-02-03 15:15:07 +00:00
flux-bot
c643c965b8 chore(atlasbot): automated image update 2026-02-03 15:05:06 +00:00
flux-bot
618be5ce01 chore(atlasbot): automated image update 2026-02-03 14:57:06 +00:00
flux-bot
ac049e6bb9 chore(atlasbot): automated image update 2026-02-03 14:51:05 +00:00
flux-bot
50108afc57 chore(atlasbot): automated image update 2026-02-03 14:40:04 +00:00
flux-bot
1f74a29445 chore(atlasbot): automated image update 2026-02-03 14:15:01 +00:00
flux-bot
08bc5f4b82 chore(atlasbot): automated image update 2026-02-03 14:07:01 +00:00
flux-bot
c208314506 chore(atlasbot): automated image update 2026-02-03 13:43:59 +00:00
flux-bot
763e5ff9e9 chore(atlasbot): automated image update 2026-02-03 13:22:57 +00:00
flux-bot
5ecb42cfef chore(atlasbot): automated image update 2026-02-03 13:08:56 +00:00
flux-bot
102d8e56ff chore(atlasbot): automated image update 2026-02-03 13:04:56 +00:00
flux-bot
ac96c5482f chore(atlasbot): automated image update 2026-02-03 12:56:55 +00:00
flux-bot
71aa60c696 chore(atlasbot): automated image update 2026-02-03 12:32:53 +00:00
flux-bot
d7582da21b chore(atlasbot): automated image update 2026-02-03 07:33:28 +00:00
flux-bot
4bf3773eb3 chore(atlasbot): automated image update 2026-02-03 06:31:22 +00:00
flux-bot
895ea49dc5 chore(atlasbot): automated image update 2026-02-03 06:07:21 +00:00
flux-bot
f355f6dd6a chore(atlasbot): automated image update 2026-02-03 04:57:14 +00:00
9f87e61f4a atlasbot: raise llm call caps 2026-02-03 01:55:21 -03:00
flux-bot
9a2890c45c chore(atlasbot): automated image update 2026-02-03 03:29:07 +00:00
flux-bot
ad74a45e76 chore(atlasbot): automated image update 2026-02-03 03:26:07 +00:00
fda4860d67 jenkins(atlasbot): set main branch 2026-02-02 23:12:13 -03:00
9f8a0f94d2 jenkins(atlasbot): use main branch 2026-02-02 23:10:42 -03:00
51d12791ca jenkins(atlasbot): track main branch 2026-02-02 22:25:56 -03:00
9fb36f23cd ci(atlasbot): add Jenkins job and image automation 2026-02-02 20:25:47 -03:00
flux-bot
1a2fe05808 chore(atlasbot): automated image update 2026-02-02 21:04:06 +00:00
flux-bot
0c5ec895ee chore(atlasbot): automated image update 2026-02-02 20:22:02 +00:00
7c87e177e9 vault: add default k8s audience 2026-02-02 17:15:35 -03:00
flux-bot
5e6d2a938f chore(atlasbot): automated image update 2026-02-02 20:08:02 +00:00
flux-bot
09070c2cc6 chore(atlasbot): automated image update 2026-02-02 19:53:00 +00:00
flux-bot
5dd30d8802 chore(atlasbot): automated image update 2026-02-02 18:13:52 +00:00
flux-bot
f302cb2448 chore(atlasbot): automated image update 2026-02-02 18:04:51 +00:00
c0a231fd91 atlasbot: bump image to 0.1.0-133 2026-02-02 14:58:38 -03:00
flux-bot
87f8a6d2c0 chore(atlasbot): automated image update 2026-02-02 17:56:53 +00:00
flux-bot
78a0867215 chore(atlasbot): automated image update 2026-02-02 17:56:48 +00:00
b0da9080c7 atlasbot: bump image to 0.1.0-132 2026-02-02 14:56:24 -03:00
8e3feeeaac atlasbot: bump image to 0.1.0-131 2026-02-02 14:54:36 -03:00
6f2ecdb364 atlasbot: bump image to 0.1.0-130 2026-02-02 14:48:34 -03:00
a5e168e55f atlasbot: bump image to 0.1.0-129 2026-02-02 14:41:22 -03:00
flux-bot
87dc1209b1 chore(atlasbot): automated image update 2026-02-02 17:32:49 +00:00
f86845053e atlasbot: disable queue for testing 2026-02-02 14:24:09 -03:00
flux-bot
c04c5ab048 chore(atlasbot): automated image update 2026-02-02 17:13:47 +00:00
flux-bot
ec3bdb7225 chore(atlasbot): automated image update 2026-02-02 16:55:46 +00:00
flux-bot
4b68809bb9 chore(atlasbot): automated image update 2026-02-02 16:45:45 +00:00
flux-bot
661bc6ac7d chore(atlasbot): automated image update 2026-02-02 16:38:44 +00:00
a9ee943344 atlasbot: bump image to 0.1.0-123 2026-02-02 13:30:34 -03:00
826df7d960 atlasbot: bump image to 0.1.0-122 2026-02-02 13:21:28 -03:00
flux-bot
8dfe124212 chore(atlasbot): automated image update 2026-02-02 16:10:42 +00:00
flux-bot
a3bef857f9 chore(atlasbot): automated image update 2026-02-02 15:57:41 +00:00
flux-bot
ed766d7a02 chore(atlasbot): automated image update 2026-02-02 15:47:40 +00:00
4295913056 atlasbot: bump image to 0.1.0-118 2026-02-02 12:39:24 -03:00
flux-bot
e3dfa2c0ea chore(atlasbot): automated image update 2026-02-02 15:20:38 +00:00
flux-bot
6bf8181677 chore(atlasbot): automated image update 2026-02-02 15:17:37 +00:00
d67f3d6fca jenkins: reload jcasc for soteria 2026-02-02 12:11:07 -03:00
flux-bot
41a0363fbc chore(atlasbot): automated image update 2026-02-02 15:09:37 +00:00
a609e230f2 atlasbot: bump image to 0.1.0-114 2026-02-02 12:05:58 -03:00
flux-bot
37342bfe4a chore(atlasbot): automated image update 2026-02-02 15:01:36 +00:00
a509354067 atlasbot: bump image to 0.1.0-112 2026-02-02 11:52:59 -03:00
flux-bot
fb14516674 chore(atlasbot): automated image update 2026-02-02 14:49:35 +00:00
60c80cc86f atlasbot: bump image to 0.1.0-110 2026-02-02 11:42:03 -03:00
flux-bot
7b8ea36554 chore(atlasbot): automated image update 2026-02-02 14:36:35 +00:00
49224375a0 atlasbot: bump image to 0.1.0-108 2026-02-02 11:23:53 -03:00
7d7ddd52dc atlasbot: bump image to 0.1.0-107 2026-02-02 11:14:54 -03:00
cd7043c7f1 jenkins: add soteria pipeline job 2026-02-02 11:01:22 -03:00
fb82a038e9 atlasbot: bump image to 0.1.0-106 2026-02-02 11:00:18 -03:00
93bcea5893 add ai harbor regcred sync 2026-02-02 10:08:46 -03:00
0ba8578416 bump atlasbot image 2026-02-02 10:05:06 -03:00
86475b8bdf track atlasbot knowledge index 2026-02-02 09:48:40 -03:00
f19eaf3b6b move atlasbot to ai namespace 2026-02-02 09:46:50 -03:00
flux-bot
e537180f1f chore(comms): automated image update 2026-02-02 06:03:16 +00:00
flux-bot
8298ed5c16 chore(comms): automated image update 2026-02-02 05:59:16 +00:00
flux-bot
152a28bd09 chore(comms): automated image update 2026-02-02 05:59:04 +00:00
7e02cccbe8 comms: bump atlasbot to 0.1.0-103 2026-02-02 02:58:44 -03:00
flux-bot
e60b1594c0 chore(comms): automated image update 2026-02-02 05:49:15 +00:00
flux-bot
87b2b37918 chore(comms): automated image update 2026-02-02 05:46:15 +00:00
flux-bot
a1249b3e00 chore(comms): automated image update 2026-02-02 05:45:54 +00:00
5000d1f76b comms: bump atlasbot to 0.1.0-101 2026-02-02 02:45:33 -03:00
flux-bot
584625b893 chore(comms): automated image update 2026-02-02 05:39:14 +00:00
95f4ecc4e0 comms: bump atlasbot to 0.1.0-99 2026-02-02 02:16:31 -03:00
240e04f9a2 comms: bump atlasbot to 0.1.0-98 2026-02-02 02:09:57 -03:00
449b8fed64 comms: bump atlasbot to 0.1.0-97 2026-02-02 02:03:50 -03:00
flux-bot
f6d655bb0c chore(comms): automated image update 2026-02-02 05:02:11 +00:00
4fa1b6e84c comms: bump atlasbot to 0.1.0-96 2026-02-02 01:57:58 -03:00
168efd78f7 comms: bump atlasbot to 0.1.0-95 2026-02-02 01:54:41 -03:00
e0bd11fa57 comms: bump atlasbot to 0.1.0-94 2026-02-02 01:45:52 -03:00
3f43299c92 comms: bump atlasbot to 0.1.0-93 2026-02-02 01:38:59 -03:00
645790f404 comms: bump atlasbot to 0.1.0-92 2026-02-01 18:46:01 -03:00
f11f6a4e62 comms: bump atlasbot to 0.1.0-91 2026-02-01 18:42:00 -03:00
flux-bot
c559253a31 chore(comms): automated image update 2026-02-01 21:37:32 +00:00
flux-bot
a3619ce215 chore(comms): automated image update 2026-02-01 21:33:32 +00:00
flux-bot
398fb7b797 chore(comms): automated image update 2026-02-01 21:25:31 +00:00
b30e6af95d comms: bump atlasbot to 0.1.0-87 2026-02-01 18:05:00 -03:00
flux-bot
4fd79b4708 chore(comms): automated image update 2026-02-01 20:55:29 +00:00
f23da3aea5 comms: bump atlasbot to 0.1.0-85 2026-02-01 17:48:24 -03:00
flux-bot
d951ae5061 chore(comms): automated image update 2026-02-01 20:43:28 +00:00
dfe9916e91 comms: bump atlasbot to 0.1.0-83 2026-02-01 14:45:58 -03:00
flux-bot
036c758547 chore(comms): automated image update 2026-02-01 17:39:12 +00:00
382a6e49ee comms: bump atlasbot to 0.1.0-81 2026-02-01 14:34:43 -03:00
93e7449509 comms: bump atlasbot to 0.1.0-80 2026-02-01 14:28:34 -03:00
58d1c168ff comms: bump atlasbot to 0.1.0-79 2026-02-01 14:07:57 -03:00
flux-bot
889400cdbf chore(comms): automated image update 2026-02-01 15:41:02 +00:00
flux-bot
e06066a327 chore(comms): automated image update 2026-02-01 15:36:01 +00:00
138f8c4407 comms: bump atlasbot image 2026-02-01 12:25:31 -03:00
33569aff99 vault: fix k8s auth env indent 2026-02-01 12:20:04 -03:00
3e2f56da7d vault: set kubernetes issuer 2026-02-01 12:18:57 -03:00
flux-bot
0914ba3509 chore(comms): automated image update 2026-02-01 15:01:58 +00:00
flux-bot
865a979424 chore(comms): automated image update 2026-02-01 14:55:58 +00:00
flux-bot
5dfc3ed259 chore(comms): automated image update 2026-02-01 14:55:52 +00:00
b479364017 comms: bump atlasbot image 2026-02-01 11:55:26 -03:00
flux-bot
00d8f852a3 chore(comms): automated image update 2026-02-01 14:47:57 +00:00
flux-bot
2d7f744284 chore(comms): automated image update 2026-02-01 14:18:55 +00:00
5f1b1a6cd0 vault: set k8s auth audiences 2026-02-01 11:17:02 -03:00
flux-bot
e966961dbe chore(comms): automated image update 2026-02-01 13:58:53 +00:00
7ffb0aba5d atlasbot: bump to 0.1.0-70 2026-02-01 10:37:29 -03:00
flux-bot
e80a439725 chore(comms): automated image update 2026-02-01 08:40:26 +00:00
flux-bot
8a22825796 chore(comms): automated image update 2026-02-01 08:40:09 +00:00
1fabd4ce2f atlasbot: bump to 0.1.0-69 2026-02-01 05:39:54 -03:00
759ac5ef90 comms: bump atlasbot image 2026-02-01 05:31:07 -03:00
flux-bot
bc971cce92 chore(comms): automated image update 2026-02-01 08:23:24 +00:00
flux-bot
069f6b4983 chore(comms): automated image update 2026-02-01 08:18:24 +00:00
64cfd5180d comms: bump atlasbot image 2026-02-01 05:12:59 -03:00
flux-bot
8a087fb16d chore(comms): automated image update 2026-02-01 08:10:23 +00:00
flux-bot
652c3a28a3 chore(comms): automated image update 2026-02-01 07:55:22 +00:00
flux-bot
141c54ccf3 chore(comms): automated image update 2026-02-01 07:49:21 +00:00
flux-bot
0f8529c7c5 chore(comms): automated image update 2026-02-01 07:46:21 +00:00
flux-bot
dafba36768 chore(comms): automated image update 2026-02-01 07:38:20 +00:00
4d5e9552e3 comms: bump atlasbot to 0.1.0-59 2026-02-01 04:32:01 -03:00
ddf1d41fd3 comms: bump atlasbot to 0.1.0-58 2026-02-01 04:25:12 -03:00
flux-bot
49e630f7fd chore(comms): automated image update 2026-02-01 07:17:18 +00:00
flux-bot
b7a81d28d1 chore(comms): automated image update 2026-02-01 06:39:16 +00:00
109c00bc3c comms: bump atlasbot to 0.1.0-55 2026-02-01 02:08:54 -03:00
flux-bot
c9ad055b4c chore(comms): automated image update 2026-02-01 05:07:08 +00:00
10498c659b comms: bump atlasbot to 0.1.0-54 2026-02-01 01:51:26 -03:00
flux-bot
978bd8e595 chore(comms): automated image update 2026-02-01 04:51:06 +00:00
259552ac28 comms: bump atlasbot to 0.1.0-53 2026-02-01 01:39:09 -03:00
flux-bot
7f2ded5244 chore(comms): automated image update 2026-02-01 04:39:05 +00:00
e4c370b983 comms: bump atlasbot to 0.1.0-52 2026-02-01 01:29:30 -03:00
flux-bot
7dfc98b6d6 chore(comms): automated image update 2026-02-01 04:29:04 +00:00
cb60c64bce comms: bump atlasbot to 0.1.0-51 2026-02-01 01:15:18 -03:00
flux-bot
091f095893 chore(comms): automated image update 2026-02-01 04:15:03 +00:00
5b389d12df comms(atlasbot): bump image to 0.1.0-50 2026-01-31 22:30:04 -03:00
flux-bot
ae88bc8484 chore(comms): automated image update 2026-02-01 01:28:49 +00:00
529576e082 comms: bump atlasbot image 2026-01-31 21:40:11 -03:00
flux-bot
a7ffaa3213 chore(maintenance): automated image update 2026-02-01 00:39:49 +00:00
flux-bot
e478f1c74d chore(comms): automated image update 2026-02-01 00:39:45 +00:00
2480b6cecc comms: disable atlasbot queue for tests 2026-01-31 18:21:39 -03:00
bbe27f963d comms: bump atlasbot to 0.1.0-48 2026-01-31 18:14:55 -03:00
flux-bot
c5da854cef chore(comms): automated image update 2026-01-31 21:14:27 +00:00
0319707fff atlasbot: make node counts explicit 2026-01-31 16:44:50 -03:00
4f8d8f1f25 atlasbot: prioritize high-priority subquestions 2026-01-31 16:38:54 -03:00
5448ff3f55 atlasbot: expand chunk summaries 2026-01-31 16:35:02 -03:00
b6c2d1416e atlasbot: enable debug pipeline logging 2026-01-31 16:30:05 -03:00
flux-bot
152e1d88f4 chore(comms): automated image update 2026-01-31 19:29:18 +00:00
86e9dc289f atlasbot: bump to 0.1.0-43 2026-01-31 14:24:13 -03:00
flux-bot
c4b7198c46 chore(comms): automated image update 2026-01-31 17:21:08 +00:00
f8a12be2ec atlasbot: bump image to 0.1.0-42 2026-01-31 14:15:41 -03:00
flux-bot
c9ec5126cd chore(comms): automated image update 2026-01-31 17:15:07 +00:00
flux-bot
c66db7c18f chore(maintenance): automated image update 2026-01-31 16:42:06 +00:00
flux-bot
de47ab76a5 chore(maintenance): automated image update 2026-01-31 16:39:06 +00:00
c788512d59 atlasbot: bump image to 0.1.0-41 2026-01-31 13:26:44 -03:00
flux-bot
ae25ccb6f2 chore(comms): automated image update 2026-01-31 16:25:03 +00:00
flux-bot
e27f4cfc68 chore(comms): automated image update 2026-01-31 11:08:36 +00:00
50e06b4a13 atlasbot: bump image to 0.1.0-40 2026-01-31 08:08:21 -03:00
934d6e7a3b comms: fix atlasbot image indentation 2026-01-31 07:17:58 -03:00
flux-bot
25654a731e chore(comms): automated image update 2026-01-31 10:12:32 +00:00
4aecadb3de atlasbot: bump image to 0.1.0-39 2026-01-31 07:11:56 -03:00
3b79a82c71 atlasbot: bump image to 0.1.0-38 2026-01-31 06:18:58 -03:00
flux-bot
04b263dc2d chore(comms): automated image update 2026-01-31 09:18:28 +00:00
93841d9de7 maintenance: add soteria service 2026-01-31 03:35:39 -03:00
bb294c6d21 atlasbot: bump image to 0.1.0-37 2026-01-31 03:20:44 -03:00
flux-bot
64962f8863 chore(comms): automated image update 2026-01-31 06:20:12 +00:00
bcb4c05b14 ariadne: add alertmanager url 2026-01-30 21:57:05 -03:00
flux-bot
d00a09fb58 chore(maintenance): automated image update 2026-01-31 00:54:47 +00:00
flux-bot
a22ff047f7 chore(maintenance): automated image update 2026-01-31 00:40:46 +00:00
flux-bot
fef5d7d26a chore(maintenance): automated image update 2026-01-30 23:54:41 +00:00
fa60fa124c comms: suspend mas-local-users-ensure 2026-01-30 17:46:46 -03:00
30c1192978 comms: bump mas-local-users-ensure job 2026-01-30 17:44:42 -03:00
644be2c575 comms: bump comms-secrets-ensure job 2026-01-30 17:42:28 -03:00
29d1bf9f4e comms: run mas-local-users-ensure job (retry) 2026-01-30 17:37:42 -03:00
9bdab331b6 comms: suspend mas-local-users-ensure job 2026-01-30 17:33:55 -03:00
8f49ac2d63 comms: run mas-local-users-ensure job 2026-01-30 17:29:29 -03:00
flux-bot
43b9cd27ed chore(maintenance): automated image update 2026-01-30 20:18:24 +00:00
580ac4950b comms: add atlas-genius bot 2026-01-30 17:17:59 -03:00
flux-bot
d677e83423 chore(comms): automated image update 2026-01-30 20:07:20 +00:00
flux-bot
bff55a6dc7 chore(bstein-dev-home): automated image update 2026-01-30 20:05:30 +00:00
flux-bot
0465658ba7 chore(bstein-dev-home): automated image update 2026-01-30 20:02:30 +00:00
flux-bot
3e484ba726 chore(comms): automated image update 2026-01-30 19:53:19 +00:00
flux-bot
088bb3b435 chore(comms): automated image update 2026-01-30 19:42:22 +00:00
flux-bot
e81bad9d47 chore(maintenance): automated image update 2026-01-30 13:21:48 +00:00
3f11a065a3 atlasbot: support quick/smart Matrix accounts 2026-01-30 10:21:07 -03:00
flux-bot
ec6375f31d chore(maintenance): automated image update 2026-01-30 05:19:07 +00:00
flux-bot
5a8360ed97 chore(maintenance): automated image update 2026-01-30 03:15:56 +00:00
flux-bot
9e75f82d43 chore(comms): automated image update 2026-01-29 23:54:42 +00:00
flux-bot
7ac26eb0dd chore(maintenance): automated image update 2026-01-29 19:56:19 +00:00
00d2f6a61f comms: bump atlasbot to 0.1.0-32 2026-01-29 16:51:43 -03:00
flux-bot
687ca2c22d chore(comms): automated image update 2026-01-29 19:50:22 +00:00
52281ca2ec comms: bump atlasbot to 0.1.0-31 2026-01-29 16:09:15 -03:00
flux-bot
8850e9fdf1 chore(comms): automated image update 2026-01-29 19:08:18 +00:00
a253993451 comms: bump atlasbot to 0.1.0-30 2026-01-29 14:56:59 -03:00
flux-bot
aeff2bbe73 chore(comms): automated image update 2026-01-29 17:55:12 +00:00
39616b2435 comms: bump atlasbot 0.1.0-29 2026-01-29 14:18:51 -03:00
flux-bot
b3d8674499 chore(maintenance): automated image update 2026-01-29 16:43:04 +00:00
3ca0fb352d sso: suspend execute-actions email test job 2026-01-29 13:41:41 -03:00
f7ea7d57e9 sso: send execute-actions email to robotuser 2026-01-29 13:40:45 -03:00
flux-bot
a418844f61 chore(maintenance): automated image update 2026-01-29 16:35:03 +00:00
96d914d02c comms: bump atlasbot to 0.1.0-28 2026-01-29 13:33:39 -03:00
e6c031829a sso: suspend keycloak oneoff jobs 2026-01-29 13:30:10 -03:00
ebfb19c34e sso: rerun execute-actions email test 2026-01-29 13:28:32 -03:00
4fedec3999 sso: set keycloak smtp to postmark 2026-01-29 13:27:28 -03:00
55f78f2eb7 sso: rerun execute-actions email test 2026-01-29 13:23:59 -03:00
ab5ef933d8 sso: run keycloak execute-actions email test 2026-01-29 13:21:40 -03:00
3e23109229 sso: suspend realm settings job 2026-01-29 13:20:11 -03:00
d18c06ad31 sso: rerun keycloak realm settings 2026-01-29 13:10:31 -03:00
292a6b7e04 monitoring: stabilize alert queries 2026-01-29 13:07:55 -03:00
flux-bot
d7fd5682f3 chore(maintenance): automated image update 2026-01-29 16:07:01 +00:00
bedab04b22 atlasbot: bump to 0.1.0-27 2026-01-29 13:06:37 -03:00
6d7a32ce11 atlasbot: align to installed qwen model 2026-01-29 10:25:57 -03:00
87ded58aca atlasbot: align models and bump image 2026-01-29 10:17:38 -03:00
flux-bot
5f30ab73bf chore(comms): automated image update 2026-01-29 13:16:50 +00:00
flux-bot
3f2d2e5fdb chore(maintenance): automated image update 2026-01-29 13:16:46 +00:00
flux-bot
f55e9a6043 chore(comms): automated image update 2026-01-29 12:23:45 +00:00
flux-bot
7de15db57a chore(comms): automated image update 2026-01-29 11:47:42 +00:00
flux-bot
265f809f8f chore(maintenance): automated image update 2026-01-29 11:43:38 +00:00
flux-bot
e4d19fc5b4 chore(comms): automated image update 2026-01-29 11:42:41 +00:00
flux-bot
d10eace338 chore(maintenance): automated image update 2026-01-29 10:45:37 +00:00
78afc97db2 atlasbot: bump image and allow longhorn read 2026-01-29 07:45:24 -03:00
flux-bot
3c0d4d0f4f chore(comms): automated image update 2026-01-29 10:44:37 +00:00
flux-bot
d73d6d7c01 chore(comms): automated image update 2026-01-29 09:21:30 +00:00
flux-bot
af02ee7abf chore(comms): automated image update 2026-01-29 09:16:59 +00:00
630a596cb6 atlasbot: bump image tag 2026-01-29 06:16:43 -03:00
flux-bot
d2729138b6 chore(maintenance): automated image update 2026-01-29 09:12:26 +00:00
a6fbcc8669 maintenance(ariadne): allow apps/events, bump image tag 2026-01-29 06:09:36 -03:00
flux-bot
d91d632496 chore(maintenance): automated image update 2026-01-29 09:01:41 +00:00
flux-bot
3a9949a24d chore(comms): automated image update 2026-01-29 08:01:25 +00:00
b045506516 vault: allow kubernetes auth login 2026-01-29 02:22:51 -03:00
flux-bot
3f24de03d1 chore(maintenance): automated image update 2026-01-29 04:58:20 +00:00
flux-bot
a3ffcb2ea1 chore(comms): automated image update 2026-01-29 04:58:10 +00:00
flux-bot
314a922109 chore(comms): automated image update 2026-01-29 04:56:21 +00:00
flux-bot
2ed4762fab chore(maintenance): automated image update 2026-01-29 04:56:05 +00:00
1c6d572559 images: bump ariadne and atlasbot 2026-01-29 01:55:07 -03:00
flux-bot
58cc15a7e0 chore(comms): automated image update 2026-01-29 01:35:52 +00:00
flux-bot
3da28531fd chore(maintenance): automated image update 2026-01-29 01:35:03 +00:00
flux-bot
58f818cebc chore(maintenance): automated image update 2026-01-28 23:47:54 +00:00
flux-bot
cff7ec922e chore(comms): automated image update 2026-01-28 23:46:43 +00:00
flux-bot
a49f0580da chore(maintenance): automated image update 2026-01-28 23:43:54 +00:00
flux-bot
10d4f015b2 chore(maintenance): automated image update 2026-01-28 23:36:54 +00:00
flux-bot
669849b883 chore(maintenance): automated image update 2026-01-28 23:31:53 +00:00
flux-bot
9ce9470677 chore(comms): automated image update 2026-01-28 22:59:41 +00:00
c3555d59f7 monitoring: fix GPU share attribution 2026-01-28 19:08:53 -03:00
28af553498 monitoring: de-dupe ariadne schedule alert 2026-01-28 18:45:15 -03:00
d42385de3e comms: suspend synapse admin ensure job 2026-01-28 18:39:28 -03:00
6104035474 maintenance: restart ariadne after synapse token update 2026-01-28 18:37:49 -03:00
dabf043ce6 comms: force admin token to use othrys-seeder 2026-01-28 18:35:28 -03:00
9b8ef436c8 comms: fix vault_put indentation 2026-01-28 18:31:48 -03:00
8cf24a6c96 comms: source admin token from seeder access tokens 2026-01-28 18:29:49 -03:00
2797464b45 comms: mint synapse admin token with syt_ prefix 2026-01-28 18:20:37 -03:00
320cf901ba comms: rerun synapse admin ensure with device 2026-01-28 18:17:24 -03:00
5bb0fc126e comms: ensure synapse device for admin token 2026-01-28 18:10:55 -03:00
1b8271ed61 maintenance: restart ariadne after synapse token 2026-01-28 17:59:25 -03:00
fab030e9c0 comms: rotate invalid synapse admin token 2026-01-28 17:57:39 -03:00
be6b65cedb comms: rerun synapse admin ensure job 2026-01-28 17:54:53 -03:00
cbed39bd64 comms: run synapse admin ensure job 2026-01-28 17:50:01 -03:00
445622e936 comms: use bundled synapse admin ensure image 2026-01-28 17:47:58 -03:00
17e28d2891 maintenance: restart ariadne to reload secrets 2026-01-28 17:31:25 -03:00
8325827c41 comms: suspend synapse admin ensure job 2026-01-28 17:29:07 -03:00
7c7ed38ead comms: fix synapse admin ensure vault login 2026-01-28 17:27:39 -03:00
5d2fb32ff8 comms: rebuild synapse admin ensure job 2026-01-28 17:25:34 -03:00
flux-bot
b62a5ba3fb chore(maintenance): automated image update 2026-01-28 20:21:37 +00:00
359445ab43 comms: run synapse admin ensure job 2026-01-28 17:19:55 -03:00
4d1382cfc9 maintenance: track ariadne latest image 2026-01-28 14:04:58 -03:00
b66c7de5fd monitoring: avoid ariadne alert title conflict 2026-01-28 14:02:12 -03:00
3d4e5bdde1 monitoring: disable legacy cron alert 2026-01-28 13:58:28 -03:00
f37baf2447 monitoring: restart grafana to reload alerts 2026-01-28 13:53:33 -03:00
ad3d8d75c9 monitoring: reuse maint-cron uid for ariadne alert 2026-01-28 13:52:12 -03:00
4ecfdcef7c monitoring: restart grafana for ariadne alerts 2026-01-28 13:49:41 -03:00
flux-bot
63ae3e3f6f chore(comms): automated image update 2026-01-28 16:49:09 +00:00
eab2ce50b1 monitoring: alert on ariadne schedules 2026-01-28 13:47:54 -03:00
flux-bot
523db13be0 chore(maintenance): automated image update 2026-01-28 16:47:19 +00:00
6a3f8cffe1 comms: fix MAS job indentation 2026-01-28 13:25:51 -03:00
80a0f424cd comms: tolerate MAS login rate limits 2026-01-28 13:23:25 -03:00
8e9d85ccd7 comms: stop seeding atlas bots in synapse job 2026-01-28 13:18:44 -03:00
85abd589d4 comms: inject quick/smart bot creds for MAS job 2026-01-28 13:12:02 -03:00
flux-bot
bfbd707293 chore(bstein-dev-home): automated image update 2026-01-28 16:07:02 +00:00
flux-bot
526a895775 chore(bstein-dev-home): automated image update 2026-01-28 16:06:02 +00:00
38e1eba112 comms: add atlas quick/smart bots 2026-01-28 13:01:09 -03:00
flux-bot
f9e6cabe6d chore(comms): automated image update 2026-01-28 15:59:05 +00:00
36bb695c15 monitoring: fix grafana pod annotation indent 2026-01-28 12:37:42 -03:00
flux-bot
b449b65244 chore(comms): automated image update 2026-01-28 15:35:02 +00:00
1a9651914e monitoring: restart grafana after alert fix 2026-01-28 12:32:56 -03:00
flux-bot
9e5be20983 chore(comms): automated image update 2026-01-28 15:32:23 +00:00
d55bc98bbe monitoring: fix postmark alert metrics 2026-01-28 12:31:33 -03:00
flux-bot
46d677f5e7 chore(comms): automated image update 2026-01-28 15:22:49 +00:00
ef63b0f9f3 feat: add nats platform kustomization 2026-01-28 12:15:39 -03:00
111ae84255 chore: move flux sync to feature/atlasbot 2026-01-28 12:12:23 -03:00
d78a3c2550 comms: allow atlasbot to pull harbor images 2026-01-28 11:54:11 -03:00
fb89158622 atlasbot: move to service image and add nats queue infra 2026-01-28 11:52:37 -03:00
263 changed files with 10812 additions and 33771 deletions

1
.gitignore vendored
View File

@ -2,6 +2,7 @@
!README.md
!knowledge/**/*.md
!services/comms/knowledge/**/*.md
!services/atlasbot/knowledge/**/*.md
__pycache__/
*.py[cod]
.pytest_cache

374
Jenkinsfile vendored
View File

@ -11,47 +11,9 @@ spec:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-06
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
jenkins/jenkins-jenkins-agent: "true"
containers:
- name: jnlp
image: jenkins/inbound-agent:3355.v388858a_47b_33-2-jdk21
resources:
requests:
cpu: "25m"
memory: "256Mi"
- name: python
image: registry.bstein.dev/bstein/python:3.12-slim
command:
- cat
tty: true
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
image: python:3.12-slim
command:
- cat
tty: true
@ -61,21 +23,6 @@ spec:
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
SUITE_NAME = 'titan_iac'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'titan_iac'
SONARQUBE_TOKEN = credentials('sonarqube-token')
VM_URL = 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428'
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '0'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
stages {
stage('Checkout') {
@ -85,295 +32,12 @@ spec:
}
stage('Install deps') {
steps {
sh '''
set -eu
if ! command -v git >/dev/null 2>&1; then
apt-get update
apt-get install -y --no-install-recommends git ca-certificates
rm -rf /var/lib/apt/lists/*
fi
pip install --no-cache-dir -r ci/requirements.txt
'''
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Prepare local quality evidence') {
stage('Glue tests') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile local --build-dir build
local_quality_rc=$?
set -e
printf '%s\n' "${local_quality_rc}" > build/local-quality-gate.rc
'''
}
}
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**,services/monitoring/dashboards/**,services/monitoring/grafana-dashboard-*.yaml"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage-unit.xml ] && args+=("-Dsonar.python.coverage.reportPaths=build/coverage-unit.xml")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
sh '''
set -eu
mkdir -p build
python3 - <<'PY'
import base64
import json
import os
import time
import urllib.parse
import urllib.request
from pathlib import Path
host = os.getenv('SONARQUBE_HOST_URL', '').strip().rstrip('/')
project_key = os.getenv('SONARQUBE_PROJECT_KEY', '').strip()
token = os.getenv('SONARQUBE_TOKEN', '').strip()
report_path = os.getenv('QUALITY_GATE_SONARQUBE_REPORT', 'build/sonarqube-quality-gate.json')
payload = {
"status": "ERROR",
"note": "missing SONARQUBE_HOST_URL and/or SONARQUBE_PROJECT_KEY",
}
if host and project_key:
task_file = Path('.scannerwork/report-task.txt')
task_id = ''
if task_file.exists():
for line in task_file.read_text(encoding='utf-8').splitlines():
key, _, value = line.partition('=')
if key == 'ceTaskId':
task_id = value.strip()
break
if task_id:
ce_query = urllib.parse.urlencode({"id": task_id})
deadline = time.monotonic() + 180
while time.monotonic() < deadline:
ce_request = urllib.request.Request(f"{host}/api/ce/task?{ce_query}", method="GET")
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
ce_request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(ce_request, timeout=12) as response:
ce_payload = json.loads(response.read().decode("utf-8"))
except Exception:
time.sleep(3)
continue
status = str(ce_payload.get("task", {}).get("status", "")).upper()
if status in {"SUCCESS", "FAILED", "CANCELED"}:
break
time.sleep(3)
query = urllib.parse.urlencode({"projectKey": project_key})
request = urllib.request.Request(
f"{host}/api/qualitygates/project_status?{query}",
method="GET",
)
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(request, timeout=12) as response:
payload = json.loads(response.read().decode("utf-8"))
except Exception as exc: # noqa: BLE001
payload = {"status": "ERROR", "error": str(exc)}
with open(report_path, "w", encoding="utf-8") as handle:
json.dump(payload, handle, indent=2, sort_keys=True)
handle.write("\\n")
PY
'''
}
}
stage('Collect IronBank evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --skip-files clusters/atlas/flux-system/gotk-components.yaml --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
'''
}
sh '''
set -eu
mkdir -p build
if [ -s build/trivy-fs.json ]; then
python3 ci/scripts/supply_chain_report.py --trivy-json build/trivy-fs.json --waivers ci/titan-iac-trivy-waivers.json --output build/ironbank-compliance.json
exit 0
fi
python3 - <<'PY'
import json
import os
from pathlib import Path
report_path = Path(os.getenv('QUALITY_GATE_IRONBANK_REPORT', 'build/ironbank-compliance.json'))
if report_path.exists():
raise SystemExit(0)
status = os.getenv('IRONBANK_COMPLIANCE_STATUS', '').strip()
compliant = os.getenv('IRONBANK_COMPLIANT', '').strip().lower()
payload = {
"status": status or "unknown",
"compliant": compliant in {"1", "true", "yes", "on"} if compliant else None,
}
payload = {k: v for k, v in payload.items() if v is not None}
if "status" not in payload:
payload["status"] = "unknown"
payload["note"] = (
"Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT "
"or write build/ironbank-compliance.json in image-building repos."
)
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\\n", encoding="utf-8")
PY
'''
}
}
stage('Run quality gate') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile jenkins --build-dir build
quality_gate_rc=$?
set -e
printf '%s\n' "${quality_gate_rc}" > build/quality-gate.rc
'''
}
}
stage('Publish test metrics') {
steps {
sh '''
set -eu
export JUNIT_GLOB='build/junit-*.xml'
export QUALITY_GATE_EXIT_CODE_PATH='build/quality-gate.rc'
export QUALITY_GATE_SUMMARY_PATH='build/quality-gate-summary.json'
python3 ci/scripts/publish_test_metrics.py
'''
}
}
stage('Enforce quality gate') {
steps {
sh '''
set -euo pipefail
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
echo "quality gate failed with rc=${gate_rc}" >&2
fail=1
fi
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
supply_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/ironbank-compliance.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
print("ok")
elif compliant is False:
print("failed")
else:
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
print(status or "missing")
PY
)"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
'''
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
@ -381,7 +45,7 @@ PY
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "grep -m1 '^\\s*branch:' clusters/atlas/flux-system/gotk-sync.yaml | sed 's/^\\s*branch:\\s*//'"
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
@ -400,20 +64,6 @@ PY
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set -euo pipefail
if ! command -v git >/dev/null 2>&1; then
if command -v apk >/dev/null 2>&1; then
apk add --no-cache git >/dev/null
elif command -v apt-get >/dev/null 2>&1; then
apt-get update >/dev/null
apt-get install -y git >/dev/null
fi
fi
cd "${WORKSPACE:-$PWD}"
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
echo "workspace is not a git checkout; skipping promote"
exit 0
fi
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
@ -424,18 +74,4 @@ PY
}
}
}
post {
always {
script {
if (fileExists('build/junit-unit.xml') || fileExists('build/junit-glue.xml')) {
try {
junit allowEmptyResults: true, testResults: 'build/junit-*.xml'
} catch (Throwable err) {
echo "junit step unavailable: ${err.class.simpleName}"
}
}
}
archiveArtifacts artifacts: 'build/**', allowEmptyArchive: true, fingerprint: true
}
}
}

View File

@ -1,29 +1,3 @@
# titan-iac
Flux-managed Kubernetes desired-state config for `bstein.dev`.
Canonical source URL:
- `ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git`
## Scope
This repo contains cluster configuration consumed by Flux:
- platform/infrastructure manifests
- service manifests and kustomizations
- operational scripts for render/reconcile workflows
This repo is **not** the Ananke application source repo.
Ananke lives in `bstein/ananke` and orchestrates host-side shutdown/startup behavior around this desired state.
## Validation workflow
```bash
kustomize build services/<app>
kubectl apply --server-side --dry-run=client -k services/<app>
flux reconcile kustomization <name> --namespace flux-system --with-source
```
## Apply model
Use Git + Flux as the source of truth.
Avoid manual in-cluster edits for durable changes.
Flux-managed Kubernetes cluster for bstein.dev services.

View File

@ -10,47 +10,9 @@ spec:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-06
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
jenkins/jenkins-jenkins-agent: "true"
containers:
- name: jnlp
image: jenkins/inbound-agent:3355.v388858a_47b_33-2-jdk21
resources:
requests:
cpu: "25m"
memory: "256Mi"
- name: python
image: registry.bstein.dev/bstein/python:3.12-slim
command:
- cat
tty: true
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
image: python:3.12-slim
command:
- cat
tty: true
@ -60,21 +22,6 @@ spec:
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
SUITE_NAME = 'titan_iac'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'titan_iac'
SONARQUBE_TOKEN = credentials('sonarqube-token')
VM_URL = 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428'
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '0'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
stages {
stage('Checkout') {
@ -84,295 +31,12 @@ spec:
}
stage('Install deps') {
steps {
sh '''
set -eu
if ! command -v git >/dev/null 2>&1; then
apt-get update
apt-get install -y --no-install-recommends git ca-certificates
rm -rf /var/lib/apt/lists/*
fi
pip install --no-cache-dir -r ci/requirements.txt
'''
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Prepare local quality evidence') {
stage('Glue tests') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile local --build-dir build
local_quality_rc=$?
set -e
printf '%s\n' "${local_quality_rc}" > build/local-quality-gate.rc
'''
}
}
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**,services/monitoring/dashboards/**,services/monitoring/grafana-dashboard-*.yaml"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage-unit.xml ] && args+=("-Dsonar.python.coverage.reportPaths=build/coverage-unit.xml")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
sh '''
set -eu
mkdir -p build
python3 - <<'PY'
import base64
import json
import os
import time
import urllib.parse
import urllib.request
from pathlib import Path
host = os.getenv('SONARQUBE_HOST_URL', '').strip().rstrip('/')
project_key = os.getenv('SONARQUBE_PROJECT_KEY', '').strip()
token = os.getenv('SONARQUBE_TOKEN', '').strip()
report_path = os.getenv('QUALITY_GATE_SONARQUBE_REPORT', 'build/sonarqube-quality-gate.json')
payload = {
"status": "ERROR",
"note": "missing SONARQUBE_HOST_URL and/or SONARQUBE_PROJECT_KEY",
}
if host and project_key:
task_file = Path('.scannerwork/report-task.txt')
task_id = ''
if task_file.exists():
for line in task_file.read_text(encoding='utf-8').splitlines():
key, _, value = line.partition('=')
if key == 'ceTaskId':
task_id = value.strip()
break
if task_id:
ce_query = urllib.parse.urlencode({"id": task_id})
deadline = time.monotonic() + 180
while time.monotonic() < deadline:
ce_request = urllib.request.Request(f"{host}/api/ce/task?{ce_query}", method="GET")
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
ce_request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(ce_request, timeout=12) as response:
ce_payload = json.loads(response.read().decode("utf-8"))
except Exception:
time.sleep(3)
continue
status = str(ce_payload.get("task", {}).get("status", "")).upper()
if status in {"SUCCESS", "FAILED", "CANCELED"}:
break
time.sleep(3)
query = urllib.parse.urlencode({"projectKey": project_key})
request = urllib.request.Request(
f"{host}/api/qualitygates/project_status?{query}",
method="GET",
)
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(request, timeout=12) as response:
payload = json.loads(response.read().decode("utf-8"))
except Exception as exc: # noqa: BLE001
payload = {"status": "ERROR", "error": str(exc)}
with open(report_path, "w", encoding="utf-8") as handle:
json.dump(payload, handle, indent=2, sort_keys=True)
handle.write("\\n")
PY
'''
}
}
stage('Collect IronBank evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --skip-files clusters/atlas/flux-system/gotk-components.yaml --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
'''
}
sh '''
set -eu
mkdir -p build
if [ -s build/trivy-fs.json ]; then
python3 ci/scripts/supply_chain_report.py --trivy-json build/trivy-fs.json --waivers ci/titan-iac-trivy-waivers.json --output build/ironbank-compliance.json
exit 0
fi
python3 - <<'PY'
import json
import os
from pathlib import Path
report_path = Path(os.getenv('QUALITY_GATE_IRONBANK_REPORT', 'build/ironbank-compliance.json'))
if report_path.exists():
raise SystemExit(0)
status = os.getenv('IRONBANK_COMPLIANCE_STATUS', '').strip()
compliant = os.getenv('IRONBANK_COMPLIANT', '').strip().lower()
payload = {
"status": status or "unknown",
"compliant": compliant in {"1", "true", "yes", "on"} if compliant else None,
}
payload = {k: v for k, v in payload.items() if v is not None}
if "status" not in payload:
payload["status"] = "unknown"
payload["note"] = (
"Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT "
"or write build/ironbank-compliance.json in image-building repos."
)
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\\n", encoding="utf-8")
PY
'''
}
}
stage('Run quality gate') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile jenkins --build-dir build
quality_gate_rc=$?
set -e
printf '%s\n' "${quality_gate_rc}" > build/quality-gate.rc
'''
}
}
stage('Publish test metrics') {
steps {
sh '''
set -eu
export JUNIT_GLOB='build/junit-*.xml'
export QUALITY_GATE_EXIT_CODE_PATH='build/quality-gate.rc'
export QUALITY_GATE_SUMMARY_PATH='build/quality-gate-summary.json'
python3 ci/scripts/publish_test_metrics.py
'''
}
}
stage('Enforce quality gate') {
steps {
sh '''
set -euo pipefail
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
echo "quality gate failed with rc=${gate_rc}" >&2
fail=1
fi
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
supply_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/ironbank-compliance.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
print("ok")
elif compliant is False:
print("failed")
else:
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
print(status or "missing")
PY
)"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
'''
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
@ -399,20 +63,6 @@ PY
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set -euo pipefail
if ! command -v git >/dev/null 2>&1; then
if command -v apk >/dev/null 2>&1; then
apk add --no-cache git >/dev/null
elif command -v apt-get >/dev/null 2>&1; then
apt-get update >/dev/null
apt-get install -y git >/dev/null
fi
fi
cd "${WORKSPACE:-$PWD}"
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
echo "workspace is not a git checkout; skipping promote"
exit 0
fi
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
@ -423,18 +73,4 @@ PY
}
}
}
post {
always {
script {
if (fileExists('build/junit-unit.xml') || fileExists('build/junit-glue.xml')) {
try {
junit allowEmptyResults: true, testResults: 'build/junit-*.xml'
} catch (Throwable err) {
echo "junit step unavailable: ${err.class.simpleName}"
}
}
}
archiveArtifacts artifacts: 'build/**', allowEmptyArchive: true, fingerprint: true
}
}
}

View File

@ -1,7 +1,4 @@
pytest==8.3.4
pytest-cov==6.0.0
coverage==7.6.10
kubernetes==30.1.0
PyYAML==6.0.2
requests==2.32.3
ruff==0.8.4

View File

@ -1,352 +0,0 @@
#!/usr/bin/env python3
"""Publish titan-iac quality-gate results to Pushgateway."""
from __future__ import annotations
import json
import os
from glob import glob
from pathlib import Path
import sys
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from ci.scripts import publish_test_metrics_quality as _quality_helpers
CANONICAL_CHECKS = _quality_helpers.CANONICAL_CHECKS
_build_check_statuses = _quality_helpers._build_check_statuses
_combine_statuses = _quality_helpers._combine_statuses
_infer_sonarqube_status = _quality_helpers._infer_sonarqube_status
_infer_source_lines_over_500 = _quality_helpers._infer_source_lines_over_500
_infer_supply_chain_status = _quality_helpers._infer_supply_chain_status
_infer_workspace_coverage_percent = _quality_helpers._infer_workspace_coverage_percent
_load_optional_json = _quality_helpers._load_optional_json
_normalize_result_status = _quality_helpers._normalize_result_status
def _escape_label(value: str) -> str:
"""Escape a Prometheus label value without changing its content."""
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
def _label_str(labels: dict[str, str]) -> str:
"""Render a stable Prometheus label set from a mapping."""
parts = [f'{key}="{_escape_label(val)}"' for key, val in labels.items() if val]
return "{" + ",".join(parts) + "}" if parts else ""
def _read_text(url: str) -> str:
"""Fetch a plain-text response body from the given URL."""
with urllib.request.urlopen(url, timeout=10) as response:
return response.read().decode("utf-8")
def _post_text(url: str, payload: str) -> None:
"""PUT a plain-text payload and fail on any 4xx/5xx response."""
request = urllib.request.Request(
url,
data=payload.encode("utf-8"),
method="PUT",
headers={"Content-Type": "text/plain"},
)
with urllib.request.urlopen(request, timeout=10) as response:
if response.status >= 400:
raise RuntimeError(f"push failed with status={response.status}")
def _parse_junit(path: str) -> dict[str, int]:
"""Parse a JUnit XML file into aggregate test counters."""
if not os.path.exists(path):
return {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
tree = ET.parse(path)
root = tree.getroot()
totals = {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
suites: list[ET.Element]
if root.tag == "testsuite":
suites = [root]
elif root.tag == "testsuites":
suites = [elem for elem in root if elem.tag == "testsuite"]
else:
suites = []
for suite in suites:
for key in totals:
raw_value = suite.attrib.get(key, "0")
try:
totals[key] += int(float(raw_value))
except ValueError:
totals[key] += 0
return totals
def _collect_junit_totals(pattern: str) -> dict[str, int]:
"""Sum JUnit counters across every XML file matching the pattern."""
totals = {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
for path in sorted(glob(pattern)):
parsed = _parse_junit(path)
for key in totals:
totals[key] += parsed[key]
return totals
def _collect_junit_cases(pattern: str) -> list[tuple[str, str]]:
"""Collect individual JUnit test-case statuses for flaky-test trend panels."""
cases: list[tuple[str, str]] = []
for path in sorted(glob(pattern)):
if not os.path.exists(path):
continue
root = ET.parse(path).getroot()
suites: list[ET.Element]
if root.tag == "testsuite":
suites = [root]
elif root.tag == "testsuites":
suites = [elem for elem in root if elem.tag == "testsuite"]
else:
suites = []
for suite in suites:
for test_case in suite.findall("testcase"):
case_name = test_case.attrib.get("name", "").strip()
class_name = test_case.attrib.get("classname", "").strip()
if not case_name:
continue
full_name = f"{class_name}.{case_name}" if class_name else case_name
status = "passed"
if test_case.find("failure") is not None or test_case.find("error") is not None:
status = "failed"
elif test_case.find("skipped") is not None:
status = "skipped"
cases.append((full_name, status))
return cases
def _read_exit_code(path: str) -> int:
"""Read the quality-gate exit code, defaulting to failure if missing."""
try:
with open(path, "r", encoding="utf-8") as handle:
return int(handle.read().strip())
except (FileNotFoundError, ValueError):
return 1
def _load_summary(path: str) -> dict:
"""Load the JSON quality-gate summary, returning an empty mapping on error."""
try:
with open(path, "r", encoding="utf-8") as handle:
return json.load(handle)
except (FileNotFoundError, json.JSONDecodeError):
return {}
def _summary_float(summary: dict, key: str) -> float:
"""Extract a float-like value from the summary, defaulting to 0.0."""
value = summary.get(key)
if isinstance(value, (int, float)):
return float(value)
return 0.0
def _summary_int(summary: dict, key: str) -> int:
"""Extract an int-like value from the summary, defaulting to 0."""
value = summary.get(key)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
return 0
def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str, str]) -> float:
"""Return the current counter value for a labeled metric if present."""
text = _read_text(f"{pushgateway_url.rstrip('/')}/metrics")
for line in text.splitlines():
if not line.startswith(metric + "{"):
continue
if any(f'{key}="{value}"' not in line for key, value in labels.items()):
continue
parts = line.split()
if len(parts) < 2:
continue
try:
return float(parts[1])
except ValueError:
return 0.0
return 0.0
def _build_payload(
suite: str,
status: str,
tests: dict[str, int],
test_cases: list[tuple[str, str]],
ok_count: int,
failed_count: int,
branch: str,
build_number: str,
jenkins_job: str,
summary: dict | None = None,
workspace_line_coverage_percent: float = 0.0,
source_lines_over_500: int = 0,
check_statuses: dict[str, str] | None = None,
) -> str:
"""Build the Pushgateway payload for the current suite run."""
passed = max(tests["tests"] - tests["failures"] - tests["errors"] - tests["skipped"], 0)
build_labels = _label_str(
{
"suite": suite,
"branch": branch or "unknown",
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job or suite,
}
)
test_case_base_labels = {
"suite": suite,
"branch": branch or "unknown",
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job or suite,
}
lines = [
"# TYPE platform_quality_gate_runs_total counter",
f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
f'platform_quality_gate_runs_total{{suite="{suite}",status="failed"}} {failed_count}',
"# TYPE titan_iac_quality_gate_tests_total gauge",
f'titan_iac_quality_gate_tests_total{{suite="{suite}",result="passed"}} {passed}',
f'titan_iac_quality_gate_tests_total{{suite="{suite}",result="failed"}} {tests["failures"]}',
f'titan_iac_quality_gate_tests_total{{suite="{suite}",result="error"}} {tests["errors"]}',
f'titan_iac_quality_gate_tests_total{{suite="{suite}",result="skipped"}} {tests["skipped"]}',
"# TYPE titan_iac_quality_gate_run_status gauge",
f'titan_iac_quality_gate_run_status{{suite="{suite}",status="ok"}} {1 if status == "ok" else 0}',
f'titan_iac_quality_gate_run_status{{suite="{suite}",status="failed"}} {1 if status == "failed" else 0}',
"# TYPE platform_quality_gate_build_info gauge",
f"platform_quality_gate_build_info{build_labels} 1",
"# TYPE titan_iac_quality_gate_build_info gauge",
f"titan_iac_quality_gate_build_info{build_labels} 1",
"# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {workspace_line_coverage_percent:.3f}',
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
]
if check_statuses:
lines.append("# TYPE titan_iac_quality_gate_checks_total gauge")
for check_name in CANONICAL_CHECKS:
check_status = check_statuses.get(check_name, "not_applicable")
lines.append(
f'titan_iac_quality_gate_checks_total{{suite="{suite}",check="{_escape_label(check_name)}",result="{_escape_label(check_status)}"}} 1'
)
lines.append("# TYPE platform_quality_gate_test_case_result gauge")
if test_cases:
for test_name, test_status in test_cases:
labels = {
**test_case_base_labels,
"test": test_name,
"status": test_status,
}
lines.append(
f"platform_quality_gate_test_case_result{_label_str(labels)} 1"
)
else:
labels = {**test_case_base_labels, "test": "__no_test_cases__", "status": "skipped"}
lines.append(
f"platform_quality_gate_test_case_result{_label_str(labels)} 1"
)
return "\n".join(lines) + "\n"
def main() -> int:
    """Publish the quality-gate metrics and print a compact run summary.

    Reads run artifacts (JUnit XML, gate exit code, quality summary JSON),
    derives per-check statuses, pushes a Prometheus exposition payload to the
    Pushgateway, and prints a one-line JSON digest to stdout.

    Returns:
        0 always; metric publication failures surface as exceptions.
    """
    # Suite/job identity and Pushgateway endpoint, all overridable via env.
    suite = os.getenv("SUITE_NAME", "titan_iac")
    pushgateway_url = os.getenv("PUSHGATEWAY_URL", "http://platform-quality-gateway.monitoring.svc.cluster.local:9091")
    job_name = os.getenv("QUALITY_GATE_JOB_NAME", "platform-quality-ci")
    # Input artifact locations; the secondary env fallbacks keep older
    # pipeline configurations working.
    junit_glob = os.getenv("JUNIT_GLOB", os.getenv("JUNIT_PATH", "build/junit-*.xml"))
    exit_code_path = os.getenv("QUALITY_GATE_EXIT_CODE_PATH", os.getenv("GLUE_EXIT_CODE_PATH", "build/quality-gate.rc"))
    summary_path = os.getenv("QUALITY_GATE_SUMMARY_PATH", "build/quality-gate-summary.json")
    branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
    # Jenkins may report remote-tracking names like "origin/main"; strip it.
    if branch.startswith("origin/"):
        branch = branch[len("origin/") :]
    build_number = os.getenv("BUILD_NUMBER", "")
    jenkins_job = os.getenv("JOB_NAME", "titan-iac")
    tests = _collect_junit_totals(junit_glob)
    test_cases = _collect_junit_cases(junit_glob)
    # The gate's recorded exit code decides the overall run status.
    exit_code = _read_exit_code(exit_code_path)
    status = "ok" if exit_code == 0 else "failed"
    summary = _load_summary(summary_path)
    workspace_line_coverage_percent = _summary_float(summary, "workspace_line_coverage_percent")
    if workspace_line_coverage_percent <= 0:
        # Summary lacked (or zeroed) the value: fall back to the coverage XML.
        workspace_line_coverage_percent = _infer_workspace_coverage_percent(summary, "build/coverage-unit.xml")
    source_lines_over_500 = _summary_int(summary, "source_lines_over_500")
    if source_lines_over_500 <= 0:
        source_lines_over_500 = _infer_source_lines_over_500(summary)
    sonarqube_report = _load_optional_json(os.getenv("QUALITY_GATE_SONARQUBE_REPORT", "build/sonarqube-quality-gate.json"))
    supply_chain_report = _load_optional_json(os.getenv("QUALITY_GATE_IRONBANK_REPORT", "build/ironbank-compliance.json"))
    supply_chain_required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
    check_statuses = _build_check_statuses(
        summary=summary,
        tests=tests,
        workspace_line_coverage_percent=workspace_line_coverage_percent,
        source_lines_over_500=source_lines_over_500,
        sonarqube_report=sonarqube_report,
        supply_chain_report=supply_chain_report,
        supply_chain_required=supply_chain_required,
    )
    # Emulate monotonically increasing run counters on the Pushgateway by
    # fetching the previously pushed totals and re-pushing them incremented.
    ok_count = int(
        _fetch_existing_counter(
            pushgateway_url,
            "platform_quality_gate_runs_total",
            {"job": job_name, "suite": suite, "status": "ok"},
        )
    )
    failed_count = int(
        _fetch_existing_counter(
            pushgateway_url,
            "platform_quality_gate_runs_total",
            {"job": job_name, "suite": suite, "status": "failed"},
        )
    )
    if status == "ok":
        ok_count += 1
    else:
        failed_count += 1
    payload = _build_payload(
        suite=suite,
        status=status,
        tests=tests,
        test_cases=test_cases,
        ok_count=ok_count,
        failed_count=failed_count,
        branch=branch,
        build_number=build_number,
        jenkins_job=jenkins_job,
        summary=summary,
        workspace_line_coverage_percent=workspace_line_coverage_percent,
        source_lines_over_500=source_lines_over_500,
        check_statuses=check_statuses,
    )
    # Group the push under job/suite so repeated runs replace prior series.
    push_url = f"{pushgateway_url.rstrip('/')}/metrics/job/{job_name}/suite/{suite}"
    _post_text(push_url, payload)
    # NOTE(review): `summary` is rebound here from the loaded quality summary
    # to the printed run digest; the original dict is no longer needed below.
    summary = {
        "suite": suite,
        "status": status,
        "tests_total": tests["tests"],
        "tests_failed": tests["failures"],
        "tests_error": tests["errors"],
        "tests_skipped": tests["skipped"],
        "ok_count": ok_count,
        "failed_count": failed_count,
        "checks_recorded": len(check_statuses),
        "workspace_line_coverage_percent": workspace_line_coverage_percent,
        "source_lines_over_500": source_lines_over_500,
    }
    print(json.dumps(summary, sort_keys=True))
    return 0
# Script entrypoint: propagate main()'s return code as the process exit code.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())

View File

@ -1,200 +0,0 @@
#!/usr/bin/env python3
"""Quality/status helpers for publish_test_metrics."""
from __future__ import annotations
import json
from pathlib import Path
import xml.etree.ElementTree as ET
# Status vocabularies: free-form check status text is bucketed into one of
# three canonical results ("ok", "not_applicable", "failed") by
# _normalize_result_status using these case-insensitive lookup sets.
SUCCESS_STATUSES = {"ok", "pass", "passed", "success", "compliant"}
NOT_APPLICABLE_STATUSES = {"not_applicable", "n/a", "na", "none", "skipped"}
FAILED_STATUSES = {"failed", "fail", "error", "errors", "warn", "warning", "red"}
# Fixed check names every quality-gate run reports on, in dashboard order.
CANONICAL_CHECKS = [
    "tests",
    "coverage",
    "loc",
    "docs_naming",
    "gate_glue",
    "sonarqube",
    "supply_chain",
]
def _infer_workspace_coverage_percent(summary: dict, default_xml: str) -> float:
"""Infer workspace line coverage from quality summary coverage XML metadata."""
results = summary.get("results", []) if isinstance(summary, dict) else []
coverage_xml = default_xml
for result in results:
if not isinstance(result, dict):
continue
if str(result.get("name") or "").strip().lower() != "coverage":
continue
candidate = str(result.get("coverage_xml") or "").strip()
if candidate:
coverage_xml = candidate
break
xml_path = Path(coverage_xml)
if not xml_path.exists():
return 0.0
try:
root = ET.parse(xml_path).getroot()
line_rate = root.attrib.get("line-rate")
if line_rate is None:
return 0.0
return float(line_rate) * 100.0
except (ET.ParseError, OSError, ValueError):
return 0.0
def _infer_source_lines_over_500(summary: dict) -> int:
"""Infer over-limit source file count from hygiene issue payloads."""
results = summary.get("results", []) if isinstance(summary, dict) else []
for result in results:
if not isinstance(result, dict):
continue
if str(result.get("name") or "").strip().lower() not in {"hygiene", "loc", "smell"}:
continue
issues = result.get("issues")
if not isinstance(issues, list):
continue
return sum(1 for item in issues if isinstance(item, str) and item.startswith("file exceeds"))
return 0
def _normalize_result_status(value: str | None, default: str = "failed") -> str:
    """Translate free-form status text into a canonical result bucket.

    Matching is case-insensitive and whitespace-tolerant; empty or
    unrecognized text falls back to ``default``.
    """
    if not value:
        return default
    token = value.strip().lower()
    for bucket, members in (
        ("ok", SUCCESS_STATUSES),
        ("not_applicable", NOT_APPLICABLE_STATUSES),
        ("failed", FAILED_STATUSES),
    ):
        if token in members:
            return bucket
    return default
def _load_optional_json(path: str | None) -> dict:
"""Load an optional JSON report file, returning an empty object when absent."""
if not path:
return {}
candidate = Path(path)
if not candidate.exists():
return {}
try:
return json.loads(candidate.read_text(encoding="utf-8"))
except json.JSONDecodeError:
return {}
def _combine_statuses(statuses: list[str]) -> str:
"""Roll up many check statuses into one canonical result."""
if not statuses:
return "not_applicable"
if any(status == "failed" for status in statuses):
return "failed"
if all(status == "not_applicable" for status in statuses):
return "not_applicable"
if all(status in {"ok", "not_applicable"} for status in statuses):
return "ok"
return "failed"
def _infer_sonarqube_status(report: dict) -> str:
    """Infer canonical SonarQube check status from its JSON report payload.

    Checks the common report shapes in order (projectStatus, qualityGate,
    top-level status); an empty report is not applicable.
    """
    if not report:
        return "not_applicable"
    # `or` short-circuits so later shapes are only probed when needed.
    raw = (
        report.get("projectStatus", {}).get("status")
        or report.get("qualityGate", {}).get("status")
        or report.get("status")
    )
    text = None if raw is None else str(raw)
    return _normalize_result_status(text, default="failed")
def _infer_supply_chain_status(report: dict, required: bool) -> str:
    """Infer canonical supply-chain status from IronBank/artifact report payload.

    A missing report or status fails only when the check is required; an
    explicit boolean ``compliant`` field wins over any textual status.
    """
    absent = "failed" if required else "not_applicable"
    if not report:
        return absent
    verdict = report.get("compliant")
    if verdict is True:
        return "ok"
    if verdict is False:
        return "failed"
    raw_status = report.get("status")
    if raw_status is None:
        return absent
    outcome = _normalize_result_status(str(raw_status), default="failed")
    # "Not applicable" is not an acceptable answer for a mandatory check.
    if required and outcome == "not_applicable":
        return "failed"
    return outcome
def _build_check_statuses(
    summary: dict | None,
    tests: dict[str, int],
    workspace_line_coverage_percent: float,
    source_lines_over_500: int,
    sonarqube_report: dict,
    supply_chain_report: dict,
    supply_chain_required: bool,
) -> dict[str, str]:
    """Generate the canonical quality-check status map for dashboarding.

    Each CANONICAL_CHECKS entry gets a status taken from the summary when a
    matching result is present, otherwise inferred from the other inputs
    (JUnit totals, coverage percent, over-length file count, SonarQube and
    supply-chain reports).
    """
    raw_results = summary.get("results", []) if isinstance(summary, dict) else []
    # Index the summary's named results by lowercase name for the lookups below.
    status_by_name: dict[str, str] = {}
    for result in raw_results:
        if not isinstance(result, dict):
            continue
        check_name = str(result.get("name") or "").strip().lower()
        if not check_name:
            continue
        status_by_name[check_name] = _normalize_result_status(result.get("status"), default="failed")
    # tests: explicit entry > roll-up of per-suite entries > JUnit totals.
    tests_status = status_by_name.get("tests")
    if not tests_status:
        candidate_keys = ["unit", "integration", "e2e", "pytest", "test", "tests"]
        candidates = [status_by_name[key] for key in candidate_keys if key in status_by_name]
        if candidates:
            tests_status = _combine_statuses(candidates)
        elif tests["tests"] > 0:
            tests_status = "ok" if (tests["failures"] + tests["errors"]) == 0 else "failed"
        else:
            tests_status = "not_applicable"
    # coverage: explicit entry > 95% line-coverage threshold on the measured value.
    coverage_status = status_by_name.get("coverage")
    if not coverage_status:
        if workspace_line_coverage_percent > 0:
            coverage_status = "ok" if workspace_line_coverage_percent >= 95.0 else "failed"
        else:
            coverage_status = "not_applicable"
    # loc: any file over the 500-line limit fails.
    loc_status = status_by_name.get("loc")
    if not loc_status:
        loc_status = "ok" if source_lines_over_500 == 0 else "failed"
    # docs_naming / gate_glue: roll up related summary entries when no
    # explicit entry exists.
    docs_naming_status = status_by_name.get("docs_naming")
    if not docs_naming_status:
        candidates = [status_by_name[key] for key in ["docs", "hygiene", "smell", "lint", "naming"] if key in status_by_name]
        docs_naming_status = _combine_statuses(candidates) if candidates else "not_applicable"
    gate_glue_status = status_by_name.get("gate_glue")
    if not gate_glue_status:
        candidates = [status_by_name[key] for key in ["gate_glue", "glue", "gate"] if key in status_by_name]
        gate_glue_status = _combine_statuses(candidates) if candidates else "not_applicable"
    # External reports are only consulted when the summary has no entry.
    sonarqube_status = status_by_name.get("sonarqube") or _infer_sonarqube_status(sonarqube_report)
    supply_chain_status = status_by_name.get("supply_chain") or _infer_supply_chain_status(
        supply_chain_report,
        required=supply_chain_required,
    )
    return {
        "tests": tests_status,
        "coverage": coverage_status,
        "loc": loc_status,
        "docs_naming": docs_naming_status,
        "gate_glue": gate_glue_status,
        "sonarqube": sonarqube_status,
        "supply_chain": supply_chain_status,
    }

View File

@ -1,173 +0,0 @@
"""Build a titan-iac supply-chain compliance report from Trivy evidence."""
from __future__ import annotations
import argparse
import datetime as dt
import json
from pathlib import Path
from typing import Any
# Misconfiguration severities that fail the gate unless explicitly waived.
FAIL_SEVERITIES = {"HIGH", "CRITICAL"}
def _read_json(path: Path) -> dict[str, Any]:
"""Read a JSON object from disk for use as pipeline evidence."""
payload = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(payload, dict):
raise ValueError(f"{path} must contain a JSON object")
return payload
def _parse_day(raw: str | None) -> dt.date | None:
"""Parse an ISO day while letting optional waiver dates stay optional."""
if not raw:
return None
return dt.date.fromisoformat(raw)
def _today(override: str | None = None) -> dt.date:
    """Return the policy day so tests can pin expiry behavior."""
    pinned = _parse_day(override)
    return pinned if pinned is not None else dt.date.today()
def _load_waiver_pairs(path: Path | None, policy_day: dt.date) -> tuple[set[tuple[str, str]], int]:
    """Return active ``(misconfiguration id, target)`` waivers and expired count.

    Args:
        path: Waiver JSON file; None or a missing file yields no waivers.
        policy_day: The day waiver expirations are evaluated against.
    """
    if path is None or not path.exists():
        return set(), 0
    payload = _read_json(path)
    # Entries without their own expires_at inherit the file-wide default.
    default_expires_at = payload.get("default_expires_at")
    active: set[tuple[str, str]] = set()
    expired = 0
    for entry in payload.get("misconfigurations", []):
        if not isinstance(entry, dict):
            continue
        misconfiguration_id = str(entry.get("id") or "").strip()
        if not misconfiguration_id:
            continue
        expires_at = _parse_day(str(entry.get("expires_at") or default_expires_at or ""))
        targets = entry.get("targets", [])
        if not isinstance(targets, list):
            continue
        # An expired entry contributes nothing active; each of its targets is
        # counted so the report can surface how many waivers lapsed.
        if expires_at and expires_at < policy_day:
            expired += len(targets)
            continue
        # Waivers are target-specific so a new unsafe manifest fails until it is
        # either fixed or deliberately accepted with a fresh expiration.
        for target in targets:
            if isinstance(target, str) and target:
                active.add((misconfiguration_id, target))
    return active, expired
def _iter_failed_misconfigurations(payload: dict[str, Any]):
    """Yield ``(target, record)`` for failed high/critical Trivy misconfigurations."""
    for result in payload.get("Results", []):
        if not isinstance(result, dict):
            continue
        target = str(result.get("Target") or "")
        for finding in result.get("Misconfigurations") or []:
            if not isinstance(finding, dict):
                continue
            severity = str(finding.get("Severity") or "").upper()
            if finding.get("Status") == "FAIL" and severity in FAIL_SEVERITIES:
                yield target, finding
def _count_vulnerabilities(payload: dict[str, Any], severity: str) -> int:
"""Count Trivy vulnerabilities at a specific severity."""
count = 0
for result in payload.get("Results", []):
if not isinstance(result, dict):
continue
for item in result.get("Vulnerabilities") or []:
if isinstance(item, dict) and str(item.get("Severity") or "").upper() == severity:
count += 1
return count
def _count_secrets(payload: dict[str, Any]) -> int:
"""Count detected secrets in the Trivy filesystem report."""
count = 0
for result in payload.get("Results", []):
if isinstance(result, dict):
count += len(result.get("Secrets") or [])
return count
def build_report(
    trivy_payload: dict[str, Any],
    waiver_path: Path | None = None,
    today_override: str | None = None,
) -> dict[str, Any]:
    """Build the compliance summary consumed by the quality gate.

    Args:
        trivy_payload: Parsed Trivy filesystem-scan JSON report.
        waiver_path: Optional JSON file of accepted misconfiguration waivers.
        today_override: Optional ISO day treated as "today" for waiver expiry
            (lets tests pin expiry behavior).

    Returns:
        A JSON-serializable dict; ``status`` is "ok" only when there are no
        critical vulnerabilities, no secrets, and no unwaived high/critical
        misconfigurations.
    """
    policy_day = _today(today_override)
    active_waivers, expired_waivers = _load_waiver_pairs(waiver_path, policy_day)
    open_misconfigs: list[dict[str, str]] = []
    waived_misconfigs = 0
    # Partition failed findings into waived vs. open, keyed by (id, target).
    for target, item in _iter_failed_misconfigurations(trivy_payload):
        misconfiguration_id = str(item.get("ID") or "")
        if (misconfiguration_id, target) in active_waivers:
            waived_misconfigs += 1
            continue
        open_misconfigs.append(
            {
                "id": misconfiguration_id,
                "target": target,
                "severity": str(item.get("Severity") or ""),
                "title": str(item.get("Title") or ""),
            }
        )
    critical = _count_vulnerabilities(trivy_payload, "CRITICAL")
    high = _count_vulnerabilities(trivy_payload, "HIGH")
    secrets = _count_secrets(trivy_payload)
    # Policy: criticals, secrets, and open misconfigurations block the gate;
    # high vulnerabilities are reported but observed only (see field below).
    status = "ok" if critical == 0 and secrets == 0 and not open_misconfigs else "failed"
    return {
        "status": status,
        "compliant": status == "ok",
        "category": "artifact_security",
        "scan_type": "filesystem",
        "scanner": "trivy",
        "critical_vulnerabilities": critical,
        "high_vulnerabilities": high,
        "high_vulnerability_policy": "observe",
        "secrets": secrets,
        "high_or_critical_misconfigurations": len(open_misconfigs),
        "waived_misconfigurations": waived_misconfigs,
        "expired_waivers": expired_waivers,
        "waiver_file": str(waiver_path) if waiver_path else "",
        # Cap examples so the report stays small even for noisy scans.
        "open_misconfiguration_examples": open_misconfigs[:20],
    }
def main(argv: list[str] | None = None) -> int:
    """CLI entrypoint used by Jenkins after the Trivy scan completes."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--trivy-json", required=True)
    parser.add_argument("--waivers")
    parser.add_argument("--output", required=True)
    parser.add_argument("--today")
    opts = parser.parse_args(argv)
    evidence = _read_json(Path(opts.trivy_json))
    waivers = Path(opts.waivers) if opts.waivers else None
    compliance = build_report(evidence, waiver_path=waivers, today_override=opts.today)
    destination = Path(opts.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(compliance, indent=2, sort_keys=True) + "\n"
    destination.write_text(serialized, encoding="utf-8")
    return 0
# Script entrypoint: propagate main()'s return code as the process exit code.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())

View File

@ -1,7 +1,6 @@
max_success_age_hours: 48
allow_suspended:
- bstein-dev-home/vaultwarden-cred-sync
- comms/guest-name-randomizer
- comms/othrys-room-reset
- comms/pin-othrys-invite
- comms/seed-othrys-room
@ -10,7 +9,6 @@ allow_suspended:
- health/wger-user-sync
- mailu-mailserver/mailu-sync-nightly
- nextcloud/nextcloud-mail-sync
- vault/vault-oidc-config
ariadne_schedule_tasks:
- schedule.mailu_sync
- schedule.nextcloud_sync

View File

@ -1,108 +0,0 @@
"""Glue checks for Ariadne schedules exported to VictoriaMetrics."""
from __future__ import annotations
import os
from datetime import datetime, timezone
from pathlib import Path
import requests
import yaml
# The glue-check config lives next to this module.
CONFIG_PATH = Path(__file__).with_name("config.yaml")
def _load_config() -> dict:
    """Parse the sibling config.yaml; an empty file yields an empty dict."""
    text = CONFIG_PATH.read_text(encoding="utf-8")
    return yaml.safe_load(text) or {}
def _query(promql: str) -> list[dict]:
    """Run an instant PromQL query against VictoriaMetrics and return the series."""
    base = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
    response = requests.get(
        f"{base}/api/v1/query", params={"query": promql}, timeout=10
    )
    response.raise_for_status()
    body = response.json()
    return body.get("data", {}).get("result", [])
def _expected_tasks() -> list[dict]:
    """Load and normalize every configured Ariadne schedule task."""
    cfg = _load_config()
    normalized = [
        _normalize_task(entry, cfg)
        for entry in cfg.get("ariadne_schedule_tasks", [])
    ]
    assert normalized, "No Ariadne schedule tasks configured"
    return normalized
def _normalize_task(item: object, cfg: dict) -> dict:
if isinstance(item, str):
return {
"task": item,
"check_last_success": True,
"max_success_age_hours": cfg.get("max_success_age_hours", 48),
}
if isinstance(item, dict):
normalized = dict(item)
normalized.setdefault("check_last_success", True)
normalized.setdefault("max_success_age_hours", cfg.get("max_success_age_hours", 48))
return normalized
raise TypeError(f"Unsupported Ariadne schedule task config entry: {item!r}")
def _tracked_tasks(tasks: list[dict]) -> list[dict]:
tracked = [item for item in tasks if item.get("check_last_success")]
assert tracked, "No Ariadne schedule tasks are marked for success tracking"
return tracked
def _task_regex(tasks: list[dict]) -> str:
return "|".join(item["task"] for item in tasks)
def test_ariadne_schedule_series_exist():
    """Every configured task must export a next-run timestamp metric."""
    expected = _expected_tasks()
    selector = _task_regex(expected)
    series = _query(f'ariadne_schedule_next_run_timestamp_seconds{{task=~"{selector}"}}')
    reported = {entry.get("metric", {}).get("task") for entry in series}
    absent = [entry["task"] for entry in expected if entry["task"] not in reported]
    assert not absent, f"Missing next-run metrics for: {', '.join(absent)}"
def test_ariadne_schedule_recent_success():
    """Tracked tasks must report a last-success metric that is not stale."""
    tasks = _tracked_tasks(_expected_tasks())
    selector = _task_regex(tasks)
    series = _query(f'ariadne_schedule_last_success_timestamp_seconds{{task=~"{selector}"}}')
    seen = {item.get("metric", {}).get("task") for item in series}
    missing = [item["task"] for item in tasks if item["task"] not in seen]
    assert not missing, f"Missing last-success metrics for: {', '.join(missing)}"
    now = datetime.now(timezone.utc)
    # Hours elapsed since each task's last recorded success, keyed by task name.
    age_by_task = {
        item.get("metric", {}).get("task"): (now - datetime.fromtimestamp(float(item["value"][1]), tz=timezone.utc)).total_seconds() / 3600
        for item in series
    }
    # Compare each task's age against its own configured freshness budget; the
    # walrus binds the task name once for both the membership test and the
    # threshold comparison.
    too_old = [
        f"{task} ({age_by_task[task]:.1f}h > {item['max_success_age_hours']}h)"
        for item in tasks
        if (task := item["task"]) in age_by_task and age_by_task[task] > float(item["max_success_age_hours"])
    ]
    assert not too_old, "Ariadne schedules are stale: " + ", ".join(too_old)
def test_ariadne_schedule_last_status_present_and_boolean():
    """Last-status metrics must exist for tracked tasks and be strictly 0/1."""
    tracked = _tracked_tasks(_expected_tasks())
    selector = _task_regex(tracked)
    series = _query(f'ariadne_schedule_last_status{{task=~"{selector}"}}')
    reported = {entry.get("metric", {}).get("task") for entry in series}
    missing = [entry["task"] for entry in tracked if entry["task"] not in reported]
    assert not missing, f"Missing last-status metrics for: {', '.join(missing)}"
    invalid = []
    for entry in series:
        sample = float(entry["value"][1])
        if sample not in (0.0, 1.0):
            invalid.append(f'{entry.get("metric", {}).get("task")}={sample}')
    assert not invalid, f"Unexpected Ariadne last-status values: {', '.join(invalid)}"

View File

@ -1,5 +1,3 @@
"""Glue checks for the metrics the quality-gate publishes."""
from __future__ import annotations
import os
@ -25,63 +23,26 @@ def _query(promql: str) -> list[dict]:
return payload.get("data", {}).get("result", [])
def _expected_tasks() -> list[dict]:
cfg = _load_config()
tasks = [
_normalize_task(item, cfg)
for item in cfg.get("ariadne_schedule_tasks", [])
]
assert tasks, "No Ariadne schedule tasks configured"
return tasks
def test_glue_metrics_present():
series = _query('kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}')
assert series, "No glue cronjob label series found"
def _normalize_task(item: object, cfg: dict) -> dict:
if isinstance(item, str):
return {
"task": item,
"check_last_success": True,
"max_success_age_hours": cfg.get("max_success_age_hours", 48),
}
if isinstance(item, dict):
normalized = dict(item)
normalized.setdefault("check_last_success", True)
normalized.setdefault("max_success_age_hours", cfg.get("max_success_age_hours", 48))
return normalized
raise TypeError(f"Unsupported Ariadne schedule task config entry: {item!r}")
def _tracked_tasks(tasks: list[dict]) -> list[dict]:
tracked = [item for item in tasks if item.get("check_last_success")]
assert tracked, "No Ariadne schedule tasks are marked for success tracking"
return tracked
def _task_regex(tasks: list[dict]) -> str:
return "|".join(item["task"] for item in tasks)
def test_glue_metrics_success_join():
query = (
"kube_cronjob_status_last_successful_time "
'and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
)
series = _query(query)
assert series, "No glue cronjob last success series found"
def test_ariadne_schedule_metrics_present():
tasks = _expected_tasks()
selector = _task_regex(tasks)
series = _query(f'ariadne_schedule_next_run_timestamp_seconds{{task=~"{selector}"}}')
seen = {item.get("metric", {}).get("task") for item in series}
missing = [item["task"] for item in tasks if item["task"] not in seen]
cfg = _load_config()
expected = cfg.get("ariadne_schedule_tasks", [])
if not expected:
return
series = _query("ariadne_schedule_next_run_timestamp_seconds")
tasks = {item.get("metric", {}).get("task") for item in series}
missing = [task for task in expected if task not in tasks]
assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"
def test_ariadne_schedule_success_and_status_metrics_present():
tasks = _tracked_tasks(_expected_tasks())
selector = _task_regex(tasks)
success = _query(f'ariadne_schedule_last_success_timestamp_seconds{{task=~"{selector}"}}')
status = _query(f'ariadne_schedule_last_status{{task=~"{selector}"}}')
success_tasks = {item.get("metric", {}).get("task") for item in success}
status_tasks = {item.get("metric", {}).get("task") for item in status}
expected = {item["task"] for item in tasks}
missing_success = sorted(expected - success_tasks)
missing_status = sorted(expected - status_tasks)
assert not missing_success, f"Missing Ariadne success metrics for: {', '.join(missing_success)}"
assert not missing_status, f"Missing Ariadne status metrics for: {', '.join(missing_status)}"

View File

@ -1,401 +0,0 @@
{
"version": 1,
"generated_from": "Jenkins titan-iac build 225 Trivy filesystem scan",
"default_expires_at": "2026-05-22",
"ticket": "atlas-quality-wave-k8s-hardening",
"default_reason": "Existing Kubernetes manifest hardening baseline accepted only for the first quality-gate rollout; fix or renew explicitly before expiry.",
"misconfigurations": [
{
"id": "DS-0002",
"targets": [
"dockerfiles/Dockerfile.ananke-node-helper"
]
},
{
"id": "KSV-0009",
"targets": [
"services/mailu/vip-controller.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml"
]
},
{
"id": "KSV-0010",
"targets": [
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml"
]
},
{
"id": "KSV-0014",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml",
"infrastructure/core/ntp-sync-daemonset.yaml",
"infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml",
"infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml",
"infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml",
"infrastructure/longhorn/core/vault-sync-deployment.yaml",
"infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml",
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"infrastructure/postgres/statefulset.yaml",
"infrastructure/vault-csi/vault-csi-provider.yaml",
"services/ai-llm/deployment.yaml",
"services/bstein-dev-home/backend-deployment.yaml",
"services/bstein-dev-home/chat-ai-gateway-deployment.yaml",
"services/bstein-dev-home/frontend-deployment.yaml",
"services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml",
"services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml",
"services/bstein-dev-home/vault-sync-deployment.yaml",
"services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml",
"services/comms/atlasbot-deployment.yaml",
"services/comms/coturn.yaml",
"services/comms/element-call-deployment.yaml",
"services/comms/guest-name-job.yaml",
"services/comms/guest-register-deployment.yaml",
"services/comms/livekit-token-deployment.yaml",
"services/comms/livekit.yaml",
"services/comms/mas-deployment.yaml",
"services/comms/oneoffs/bstein-force-leave-job.yaml",
"services/comms/oneoffs/comms-secrets-ensure-job.yaml",
"services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml",
"services/comms/oneoffs/mas-db-ensure-job.yaml",
"services/comms/oneoffs/mas-local-users-ensure-job.yaml",
"services/comms/oneoffs/othrys-kick-numeric-job.yaml",
"services/comms/oneoffs/synapse-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-signingkey-ensure-job.yaml",
"services/comms/oneoffs/synapse-user-seed-job.yaml",
"services/comms/pin-othrys-job.yaml",
"services/comms/reset-othrys-room-job.yaml",
"services/comms/seed-othrys-room.yaml",
"services/comms/vault-sync-deployment.yaml",
"services/comms/wellknown.yaml",
"services/crypto/monerod/deployment.yaml",
"services/crypto/wallet-monero-temp/deployment.yaml",
"services/crypto/xmr-miner/deployment.yaml",
"services/crypto/xmr-miner/vault-sync-deployment.yaml",
"services/crypto/xmr-miner/xmrig-daemonset.yaml",
"services/finance/actual-budget-deployment.yaml",
"services/finance/firefly-cronjob.yaml",
"services/finance/firefly-deployment.yaml",
"services/finance/firefly-user-sync-cronjob.yaml",
"services/finance/oneoffs/finance-secrets-ensure-job.yaml",
"services/gitea/deployment.yaml",
"services/harbor/vault-sync-deployment.yaml",
"services/health/wger-admin-ensure-cronjob.yaml",
"services/health/wger-deployment.yaml",
"services/health/wger-user-sync-cronjob.yaml",
"services/jellyfin/deployment.yaml",
"services/jellyfin/loader.yaml",
"services/jenkins/deployment.yaml",
"services/jenkins/vault-sync-deployment.yaml",
"services/keycloak/deployment.yaml",
"services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/ldap-federation-job.yaml",
"services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/mas-secrets-ensure-job.yaml",
"services/keycloak/oneoffs/metis-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/metis-ssh-keys-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-e2e-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml",
"services/keycloak/oneoffs/portal-e2e-target-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml",
"services/keycloak/oneoffs/quality-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/realm-settings-job.yaml",
"services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/user-overrides-job.yaml",
"services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml",
"services/keycloak/vault-sync-deployment.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/logging/oauth2-proxy.yaml",
"services/logging/oneoffs/opensearch-dashboards-setup-job.yaml",
"services/logging/oneoffs/opensearch-ism-job.yaml",
"services/logging/oneoffs/opensearch-observability-setup-job.yaml",
"services/logging/opensearch-prune-cronjob.yaml",
"services/logging/vault-sync-deployment.yaml",
"services/mailu/mailu-sync-cronjob.yaml",
"services/mailu/mailu-sync-listener.yaml",
"services/mailu/oneoffs/mailu-sync-job.yaml",
"services/mailu/vault-sync-deployment.yaml",
"services/mailu/vip-controller.yaml",
"services/maintenance/ariadne-deployment.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-k3s-token-sync-cronjob.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oauth2-proxy-metis.yaml",
"services/maintenance/oauth2-proxy-soteria.yaml",
"services/maintenance/oneoffs/ariadne-migrate-job.yaml",
"services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/maintenance/pod-cleaner-cronjob.yaml",
"services/maintenance/soteria-deployment.yaml",
"services/maintenance/vault-sync-deployment.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml",
"services/monitoring/oneoffs/grafana-org-bootstrap.yaml",
"services/monitoring/oneoffs/grafana-user-dedupe-job.yaml",
"services/monitoring/platform-quality-gateway-deployment.yaml",
"services/monitoring/platform-quality-suite-probe-cronjob.yaml",
"services/monitoring/postmark-exporter-deployment.yaml",
"services/monitoring/vault-sync-deployment.yaml",
"services/nextcloud-mail-sync/cronjob.yaml",
"services/nextcloud/collabora.yaml",
"services/nextcloud/cronjob.yaml",
"services/nextcloud/deployment.yaml",
"services/nextcloud/maintenance-cronjob.yaml",
"services/oauth2-proxy/deployment.yaml",
"services/openldap/statefulset.yaml",
"services/outline/deployment.yaml",
"services/outline/redis-deployment.yaml",
"services/pegasus/deployment.yaml",
"services/pegasus/vault-sync-deployment.yaml",
"services/planka/deployment.yaml",
"services/quality/oauth2-proxy-sonarqube.yaml",
"services/quality/sonarqube-deployment.yaml",
"services/quality/sonarqube-exporter-deployment.yaml",
"services/sui-metrics/base/deployment.yaml",
"services/typhon/vault-sync-deployment.yaml",
"services/vault/k8s-auth-config-cronjob.yaml",
"services/vault/oidc-config-cronjob.yaml",
"services/vault/statefulset.yaml",
"services/vaultwarden/deployment.yaml"
]
},
{
"id": "KSV-0017",
"targets": [
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml"
]
},
{
"id": "KSV-0041",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml",
"infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml",
"infrastructure/traefik/clusterrole.yaml",
"services/bstein-dev-home/rbac.yaml",
"services/comms/comms-secrets-ensure-rbac.yaml",
"services/comms/mas-db-ensure-rbac.yaml",
"services/comms/mas-secrets-ensure-rbac.yaml",
"services/maintenance/soteria-rbac.yaml"
]
},
{
"id": "KSV-0047",
"targets": [
"services/monitoring/rbac.yaml"
]
},
{
"id": "KSV-0053",
"targets": [
"services/comms/comms-secrets-ensure-rbac.yaml",
"services/comms/mas-db-ensure-rbac.yaml",
"services/jenkins/serviceaccount.yaml",
"services/maintenance/ariadne-rbac.yaml"
]
},
{
"id": "KSV-0056",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml",
"infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml",
"services/jenkins/serviceaccount.yaml",
"services/maintenance/disable-k3s-traefik-rbac.yaml",
"services/maintenance/k3s-traefik-cleanup-rbac.yaml"
]
},
{
"id": "KSV-0114",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml"
]
},
{
"id": "KSV-0118",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml",
"infrastructure/core/coredns-deployment.yaml",
"infrastructure/core/ntp-sync-daemonset.yaml",
"infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml",
"infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml",
"infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml",
"infrastructure/longhorn/core/vault-sync-deployment.yaml",
"infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml",
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"infrastructure/postgres/statefulset.yaml",
"infrastructure/vault-csi/vault-csi-provider.yaml",
"services/ai-llm/deployment.yaml",
"services/bstein-dev-home/backend-deployment.yaml",
"services/bstein-dev-home/chat-ai-gateway-deployment.yaml",
"services/bstein-dev-home/frontend-deployment.yaml",
"services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml",
"services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml",
"services/bstein-dev-home/vault-sync-deployment.yaml",
"services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml",
"services/comms/atlasbot-deployment.yaml",
"services/comms/coturn.yaml",
"services/comms/element-call-deployment.yaml",
"services/comms/guest-name-job.yaml",
"services/comms/livekit-token-deployment.yaml",
"services/comms/livekit.yaml",
"services/comms/mas-deployment.yaml",
"services/comms/oneoffs/bstein-force-leave-job.yaml",
"services/comms/oneoffs/comms-secrets-ensure-job.yaml",
"services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml",
"services/comms/oneoffs/mas-db-ensure-job.yaml",
"services/comms/oneoffs/mas-local-users-ensure-job.yaml",
"services/comms/oneoffs/othrys-kick-numeric-job.yaml",
"services/comms/oneoffs/synapse-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-signingkey-ensure-job.yaml",
"services/comms/oneoffs/synapse-user-seed-job.yaml",
"services/comms/pin-othrys-job.yaml",
"services/comms/reset-othrys-room-job.yaml",
"services/comms/seed-othrys-room.yaml",
"services/comms/vault-sync-deployment.yaml",
"services/comms/wellknown.yaml",
"services/crypto/monerod/deployment.yaml",
"services/crypto/wallet-monero-temp/deployment.yaml",
"services/crypto/xmr-miner/deployment.yaml",
"services/crypto/xmr-miner/vault-sync-deployment.yaml",
"services/crypto/xmr-miner/xmrig-daemonset.yaml",
"services/finance/firefly-cronjob.yaml",
"services/finance/firefly-deployment.yaml",
"services/finance/firefly-user-sync-cronjob.yaml",
"services/finance/oneoffs/finance-secrets-ensure-job.yaml",
"services/gitea/deployment.yaml",
"services/harbor/vault-sync-deployment.yaml",
"services/health/wger-admin-ensure-cronjob.yaml",
"services/health/wger-deployment.yaml",
"services/health/wger-user-sync-cronjob.yaml",
"services/jellyfin/loader.yaml",
"services/jenkins/deployment.yaml",
"services/jenkins/vault-sync-deployment.yaml",
"services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/ldap-federation-job.yaml",
"services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/mas-secrets-ensure-job.yaml",
"services/keycloak/oneoffs/metis-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/metis-ssh-keys-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-e2e-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml",
"services/keycloak/oneoffs/portal-e2e-target-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml",
"services/keycloak/oneoffs/quality-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/realm-settings-job.yaml",
"services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/user-overrides-job.yaml",
"services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml",
"services/keycloak/vault-sync-deployment.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/logging/oauth2-proxy.yaml",
"services/logging/oneoffs/opensearch-dashboards-setup-job.yaml",
"services/logging/oneoffs/opensearch-ism-job.yaml",
"services/logging/oneoffs/opensearch-observability-setup-job.yaml",
"services/logging/opensearch-prune-cronjob.yaml",
"services/logging/vault-sync-deployment.yaml",
"services/mailu/mailu-sync-cronjob.yaml",
"services/mailu/mailu-sync-listener.yaml",
"services/mailu/oneoffs/mailu-sync-job.yaml",
"services/mailu/vault-sync-deployment.yaml",
"services/mailu/vip-controller.yaml",
"services/maintenance/ariadne-deployment.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-k3s-token-sync-cronjob.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oauth2-proxy-metis.yaml",
"services/maintenance/oauth2-proxy-soteria.yaml",
"services/maintenance/oneoffs/ariadne-migrate-job.yaml",
"services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/maintenance/pod-cleaner-cronjob.yaml",
"services/maintenance/soteria-deployment.yaml",
"services/maintenance/vault-sync-deployment.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml",
"services/monitoring/oneoffs/grafana-org-bootstrap.yaml",
"services/monitoring/oneoffs/grafana-user-dedupe-job.yaml",
"services/monitoring/platform-quality-gateway-deployment.yaml",
"services/monitoring/platform-quality-suite-probe-cronjob.yaml",
"services/monitoring/postmark-exporter-deployment.yaml",
"services/monitoring/vault-sync-deployment.yaml",
"services/nextcloud/collabora.yaml",
"services/oauth2-proxy/deployment.yaml",
"services/openldap/statefulset.yaml",
"services/outline/deployment.yaml",
"services/outline/redis-deployment.yaml",
"services/pegasus/vault-sync-deployment.yaml",
"services/quality/oauth2-proxy-sonarqube.yaml",
"services/quality/sonarqube-deployment.yaml",
"services/quality/sonarqube-exporter-deployment.yaml",
"services/sui-metrics/base/deployment.yaml",
"services/sui-metrics/overlays/atlas/patch-node-selector.yaml",
"services/typhon/deployment.yaml",
"services/typhon/vault-sync-deployment.yaml",
"services/vault/k8s-auth-config-cronjob.yaml",
"services/vault/oidc-config-cronjob.yaml",
"services/vaultwarden/deployment.yaml"
]
},
{
"id": "KSV-0121",
"targets": [
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml"
]
}
]
}

View File

@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/atlasbot

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: atlasbot
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/atlasbot
targetNamespace: ai
timeout: 2m
dependsOn:
- name: ai-llm

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: main
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update"
push:
branch: main
branch: feature/atlasbot
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/comms/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: comms
namespace: comms
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(comms): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/comms

View File

@ -13,8 +13,4 @@ spec:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: longhorn
- name: vault
- name: postgres
wait: true

View File

@ -16,6 +16,3 @@ spec:
wait: false
dependsOn:
- name: core
- name: longhorn
- name: vault
- name: postgres

View File

@ -25,4 +25,3 @@ spec:
name: jenkins
namespace: jenkins
wait: false
timeout: 20m

View File

@ -12,8 +12,4 @@ spec:
name: flux-system
path: ./services/keycloak
targetNamespace: sso
dependsOn:
- name: longhorn
- name: vault
- name: postgres
timeout: 2m

View File

@ -6,6 +6,9 @@ resources:
- vault/kustomization.yaml
- vaultwarden/kustomization.yaml
- comms/kustomization.yaml
- comms/image-automation.yaml
- atlasbot/kustomization.yaml
- atlasbot/image-automation.yaml
- crypto/kustomization.yaml
- monerod/kustomization.yaml
- pegasus/kustomization.yaml
@ -21,12 +24,10 @@ resources:
- sui-metrics/kustomization.yaml
- openldap/kustomization.yaml
- keycloak/kustomization.yaml
- quality/kustomization.yaml
- oauth2-proxy/kustomization.yaml
- mailu/kustomization.yaml
- jenkins/kustomization.yaml
- ai-llm/kustomization.yaml
- typhon/kustomization.yaml
- nextcloud/kustomization.yaml
- nextcloud-mail-sync/kustomization.yaml
- outline/kustomization.yaml

View File

@ -16,4 +16,4 @@ spec:
dependsOn:
- name: crypto
wait: true
timeout: 15m
timeout: 5m

View File

@ -1,35 +0,0 @@
# clusters/atlas/flux-system/applications/quality/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: quality
namespace: flux-system
spec:
interval: 10m
path: ./services/quality
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: quality
dependsOn:
- name: traefik
- name: cert-manager
- name: keycloak
- name: vault
- name: postgres
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: sonarqube
namespace: quality
- apiVersion: apps/v1
kind: Deployment
name: sonarqube-exporter
namespace: quality
- apiVersion: apps/v1
kind: Deployment
name: oauth2-proxy-sonarqube
namespace: quality
wait: false
timeout: 20m

View File

@ -1,29 +0,0 @@
# clusters/atlas/flux-system/applications/typhon/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: typhon
namespace: flux-system
spec:
interval: 10m
path: ./services/typhon
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: climate
dependsOn:
- name: vault
- name: vault-csi
- name: monitoring
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: typhon
namespace: climate
- apiVersion: v1
kind: Service
name: typhon
namespace: climate
wait: false
timeout: 20m

View File

@ -15,5 +15,4 @@ spec:
prune: true
wait: true
dependsOn:
- name: longhorn
- name: helm

View File

@ -17,4 +17,3 @@ spec:
- name: crypto
- name: monerod
wait: true
timeout: 30m

View File

@ -9,7 +9,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: main
branch: feature/atlasbot
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -16,5 +16,6 @@ resources:
- longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml
- postgres/kustomization.yaml
- nats/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml
- ../platform/vault-injector/kustomization.yaml

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: main
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: main
branch: feature/atlasbot
update:
strategy: Setters
path: services/maintenance

View File

@ -0,0 +1,21 @@
# clusters/atlas/flux-system/platform/nats/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: nats
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/nats
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: nats
healthChecks:
- apiVersion: apps/v1
kind: StatefulSet
name: nats
namespace: nats
wait: true

View File

@ -14,7 +14,6 @@ spec:
name: flux-system
targetNamespace: postgres
dependsOn:
- name: longhorn
- name: vault
- name: vault-csi
healthChecks:

View File

@ -1,12 +0,0 @@
FROM debian:bookworm-slim
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bash \
ca-certificates \
curl \
util-linux \
zstd \
&& rm -rf /var/lib/apt/lists/*
CMD ["/bin/sh"]

View File

@ -2,8 +2,4 @@ FROM python:3.11-slim
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
RUN pip install --no-cache-dir requests psycopg2-binary \
&& groupadd --system guest-tools \
&& useradd --system --uid 65532 --gid guest-tools --home-dir /nonexistent --shell /usr/sbin/nologin guest-tools
USER guest-tools
RUN pip install --no-cache-dir requests psycopg2-binary

View File

@ -1,8 +1,16 @@
# Use the mirrored Harbor artifact so CI does not depend on Docker Hub egress.
FROM registry.bstein.dev/streaming/data-prepper@sha256:32ac6ad42e0f12da08bebee307e290b17d127b30def9b06eeaffbcbbc5033e83
FROM --platform=$BUILDPLATFORM opensearchproject/data-prepper:2.8.0 AS source
FROM --platform=$TARGETPLATFORM eclipse-temurin:17-jre
ENV DATA_PREPPER_PATH=/usr/share/data-prepper
RUN useradd -u 10001 -M -U -d / -s /usr/sbin/nologin data_prepper \
&& mkdir -p /var/log/data-prepper
COPY --from=source /usr/share/data-prepper /usr/share/data-prepper
RUN chown -R 10001:10001 /usr/share/data-prepper /var/log/data-prepper
USER 10001
WORKDIR /usr/share/data-prepper
CMD ["bin/data-prepper"]

View File

@ -1,13 +1,10 @@
FROM ghcr.io/element-hq/lk-jwt-service:0.3.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates \
&& addgroup -S livekit-token \
&& adduser -S -D -H -u 65532 -G livekit-token livekit-token
RUN apk add --no-cache ca-certificates
COPY --from=base /lk-jwt-service /lk-jwt-service
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER livekit-token
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/lk-jwt-service"]

View File

@ -29,12 +29,10 @@ FROM ${DEBIAN_IMAGE}
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends ca-certificates; \
update-ca-certificates; rm -rf /var/lib/apt/lists/*; \
groupadd --system p2pool; \
useradd --system --uid 65532 --gid p2pool --home-dir /nonexistent --shell /usr/sbin/nologin p2pool
update-ca-certificates; rm -rf /var/lib/apt/lists/*
COPY --from=fetch /out/p2pool /usr/local/bin/p2pool
RUN /usr/local/bin/p2pool --version || true
EXPOSE 3333
USER p2pool
ENTRYPOINT ["/usr/local/bin/p2pool"]

View File

@ -26,12 +26,9 @@ RUN set -eux; \
curl -fsSL "$URL" -o /opt/monero/monero.tar.bz2; \
tar -xjf /opt/monero/monero.tar.bz2 -C /opt/monero --strip-components=1; \
install -m 0755 /opt/monero/monero-wallet-rpc /usr/local/bin/monero-wallet-rpc; \
rm -f /opt/monero/monero.tar.bz2; \
groupadd --system monero; \
useradd --system --uid 1000 --gid monero --home-dir /nonexistent --shell /usr/sbin/nologin monero
rm -f /opt/monero/monero.tar.bz2
ENV PATH="/usr/local/bin:/usr/bin:/bin"
RUN /usr/local/bin/monero-wallet-rpc --version || true
EXPOSE 18083
USER monero

View File

@ -23,14 +23,10 @@ RUN set -eux; \
mkdir -p /opt/monero; \
tar -xjf /tmp/monero.tar.bz2 -C /opt/monero --strip-components=1; \
rm -f /tmp/monero.tar.bz2; \
groupadd --system monero; \
useradd --system --uid 1000 --gid monero --home-dir /nonexistent --shell /usr/sbin/nologin monero; \
mkdir -p /data; \
chown monero:monero /data; \
chmod 0770 /data
ENV LD_LIBRARY_PATH=/opt/monero:/opt/monero/lib \
PATH="/opt/monero:${PATH}"
USER monero
CMD ["/opt/monero/monerod", "--version"]

View File

@ -1,13 +1,10 @@
FROM quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates \
&& addgroup -S oauth2-proxy \
&& adduser -S -D -H -u 65532 -G oauth2-proxy oauth2-proxy
RUN apk add --no-cache ca-certificates
COPY --from=base /bin/oauth2-proxy /bin/oauth2-proxy
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER oauth2-proxy
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/bin/oauth2-proxy"]

View File

@ -1,13 +1,10 @@
FROM registry.bstein.dev/streaming/pegasus:1.2.32 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates \
&& addgroup -S pegasus \
&& adduser -S -D -H -u 65532 -G pegasus pegasus
RUN apk add --no-cache ca-certificates
COPY --from=base /pegasus /pegasus
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER pegasus
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/pegasus"]

View File

@ -1,48 +0,0 @@
# dockerfiles/Dockerfile.quality-tools
FROM debian:bookworm-slim
ARG SONAR_SCANNER_VERSION=8.0.1.6346
ARG TRIVY_VERSION=0.70.0
ENV TRIVY_CACHE_DIR=/opt/trivy-cache
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bash \
ca-certificates \
curl \
git \
jq \
unzip \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd --system quality-tools \
&& useradd --system --uid 65532 --gid quality-tools --home-dir /nonexistent --shell /usr/sbin/nologin quality-tools
RUN set -eux; \
scanner_zip="sonar-scanner-cli-${SONAR_SCANNER_VERSION}-linux-aarch64.zip"; \
base_url="https://binaries.sonarsource.com/Distribution/sonar-scanner-cli"; \
curl -fsSL "${base_url}/${scanner_zip}" -o "/tmp/${scanner_zip}"; \
curl -fsSL "${base_url}/${scanner_zip}.sha256" -o "/tmp/${scanner_zip}.sha256"; \
printf '%s %s\n' "$(cat "/tmp/${scanner_zip}.sha256")" "/tmp/${scanner_zip}" | sha256sum -c -; \
unzip -q "/tmp/${scanner_zip}" -d /opt; \
ln -s "/opt/sonar-scanner-${SONAR_SCANNER_VERSION}-linux-aarch64/bin/sonar-scanner" /usr/local/bin/sonar-scanner; \
rm -f "/tmp/${scanner_zip}" "/tmp/${scanner_zip}.sha256"
RUN set -eux; \
trivy_tgz="trivy_${TRIVY_VERSION}_Linux-ARM64.tar.gz"; \
curl -fsSL "https://github.com/aquasecurity/trivy/releases/download/v${TRIVY_VERSION}/${trivy_tgz}" -o "/tmp/${trivy_tgz}"; \
tar -C /usr/local/bin -xzf "/tmp/${trivy_tgz}" trivy; \
rm -f "/tmp/${trivy_tgz}"; \
trivy --version; \
sonar-scanner -v
RUN set -eux; \
mkdir -p "${TRIVY_CACHE_DIR}"; \
trivy image --download-db-only --cache-dir "${TRIVY_CACHE_DIR}"; \
chmod -R a+rX "${TRIVY_CACHE_DIR}"; \
mkdir -p /workspace; \
chown quality-tools:quality-tools /workspace
WORKDIR /workspace
USER quality-tools

View File

@ -0,0 +1,3 @@
FROM python:3.11-slim
RUN pip install --no-cache-dir psycopg2-binary bcrypt

View File

@ -27,42 +27,10 @@ spec:
timeout: 10m
values:
installCRDs: true
extraArgs:
- --acme-http01-solver-nameservers=1.1.1.1:53,8.8.8.8:53
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
@ -76,36 +44,6 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
@ -119,36 +57,6 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:

View File

@ -26,7 +26,7 @@ spec:
spec:
containers:
- name: coredns
image: registry.k8s.io/coredns/coredns:v1.12.1
image: registry.bstein.dev/infra/coredns:1.12.1
imagePullPolicy: IfNotPresent
args:
- -conf

View File

@ -6,6 +6,7 @@ resources:
- ../modules/profiles/atlas-ha
- coredns-custom.yaml
- coredns-deployment.yaml
- longhorn-node-taints.yaml
- ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -0,0 +1,40 @@
# infrastructure/core/longhorn-node-taints.yaml
apiVersion: v1
kind: Node
metadata:
name: titan-13
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-15
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-17
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-19
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule

View File

@ -0,0 +1,10 @@
# infrastructure/longhorn/core/backup-target.yaml
apiVersion: longhorn.io/v1beta2
kind: BackupTarget
metadata:
name: default
namespace: longhorn-system
spec:
backupTargetURL: "s3://atlas-soteria@us-west-004/"
credentialSecret: longhorn-backup-b2
pollInterval: 5m0s

View File

@ -6,6 +6,39 @@ metadata:
namespace: longhorn-system
spec:
interval: 30m
postRenderers:
- kustomize:
patches:
- target:
kind: Service
name: longhorn-conversion-webhook
namespace: longhorn-system
patch: |
- op: add
path: /spec/publishNotReadyAddresses
value: true
- target:
kind: Service
name: longhorn-admission-webhook
namespace: longhorn-system
patch: |
- op: add
path: /spec/publishNotReadyAddresses
value: true
- target:
kind: DaemonSet
name: longhorn-manager
namespace: longhorn-system
patch: |
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/path
value: /v1/healthz
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/port
value: 9500
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/scheme
value: HTTP
chart:
spec:
chart: longhorn
@ -26,9 +59,6 @@ spec:
cleanupOnFail: true
timeout: 15m
values:
global:
nodeSelector:
longhorn-host: "true"
service:
ui:
type: NodePort
@ -37,7 +67,7 @@ spec:
createSecret: false
registrySecret: longhorn-registry
image:
pullPolicy: Always
pullPolicy: IfNotPresent
longhorn:
engine:
repository: registry.bstein.dev/infra/longhorn-engine
@ -80,13 +110,4 @@ spec:
repository: registry.bstein.dev/infra/longhorn-livenessprobe
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: Always
longhornManager:
nodeSelector:
longhorn-host: "true"
longhornDriver:
nodeSelector:
longhorn-host: "true"
longhornUI:
nodeSelector:
longhorn-host: "true"
systemManagedPodsImagePullPolicy: IfNotPresent

View File

@ -6,17 +6,14 @@ resources:
- vault-serviceaccount.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- backup-target.yaml
- helmrelease.yaml
- longhorn-settings-ensure-job.yaml
- longhorn-disk-tags-ensure-job.yaml
configMapGenerator:
- name: longhorn-settings-ensure-script
files:
- longhorn_settings_ensure.sh=scripts/longhorn_settings_ensure.sh
- name: longhorn-disk-tags-ensure-script
files:
- longhorn_disk_tags_ensure.py=scripts/longhorn_disk_tags_ensure.py
generatorOptions:
disableNameSuffixHash: true

View File

@ -1,36 +0,0 @@
# infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-disk-tags-ensure-1
namespace: longhorn-system
spec:
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
spec:
serviceAccountName: longhorn-service-account
restartPolicy: Never
volumes:
- name: longhorn-disk-tags-ensure-script
configMap:
name: longhorn-disk-tags-ensure-script
defaultMode: 0555
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
- key: node-role.kubernetes.io/worker
operator: Exists
containers:
- name: apply
image: python:3.12.9-alpine3.20
command: ["python", "/scripts/longhorn_disk_tags_ensure.py"]
volumeMounts:
- name: longhorn-disk-tags-ensure-script
mountPath: /scripts
readOnly: true

View File

@ -2,11 +2,10 @@
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-settings-ensure-7
name: longhorn-settings-ensure-4
namespace: longhorn-system
spec:
backoffLimit: 0
activeDeadlineSeconds: 240
ttlSecondsAfterFinished: 3600
template:
spec:

View File

@ -1,100 +0,0 @@
#!/usr/bin/env python3
"""Reconcile Longhorn disk tags for the Titan longhorn storage classes.
The astreae/asteria storageclasses select Longhorn disks by tag. The current
nodes already have the right disk paths, but the tag fields can drift to empty
after node recovery. This job patches the live Longhorn Node CRs back to the
expected tags so PVC provisioning keeps working.
"""
from __future__ import annotations
import json
import os
import ssl
import urllib.request
LONGHORN_NS = "longhorn-system"
LONGHORN_API = "/apis/longhorn.io/v1beta2/namespaces/{namespace}/nodes"
DESIRED_TAGS = {
"/mnt/astreae": "astreae",
"/mnt/asteria": "asteria",
}
def api_base() -> str:
host = os.environ.get("KUBERNETES_SERVICE_HOST")
port = os.environ.get("KUBERNETES_SERVICE_PORT", "443")
if not host:
raise SystemExit("missing KUBERNETES_SERVICE_HOST")
return f"https://{host}:{port}"
def token() -> str:
path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
with open(path, "r", encoding="utf-8") as fh:
return fh.read().strip()
def ca_context() -> ssl.SSLContext:
cafile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
return ssl.create_default_context(cafile=cafile)
def request_json(method: str, path: str, body: dict | None = None) -> dict:
req = urllib.request.Request(
f"{api_base()}{path}",
method=method,
headers={
"Authorization": f"Bearer {token()}",
"Content-Type": "application/merge-patch+json",
"Accept": "application/json",
},
data=None if body is None else json.dumps(body).encode("utf-8"),
)
with urllib.request.urlopen(req, context=ca_context(), timeout=20) as resp:
payload = resp.read()
return json.loads(payload) if payload else {}
def list_nodes() -> list[dict]:
data = request_json("GET", LONGHORN_API.format(namespace=LONGHORN_NS))
return data.get("items", [])
def patch_disk_tags(node_name: str, disk_name: str, desired_tag: str) -> None:
body = {"spec": {"disks": {disk_name: {"tags": [desired_tag]}}}}
request_json(
"PATCH",
f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}",
body=body,
)
def main() -> int:
changed = 0
skipped = 0
for node in list_nodes():
name = node.get("metadata", {}).get("name", "")
spec_disks = node.get("spec", {}).get("disks", {}) or {}
for disk_name, disk in spec_disks.items():
disk_path = disk.get("path")
desired_tag = DESIRED_TAGS.get(disk_path)
if not desired_tag:
continue
current_tags = disk.get("tags") or []
if current_tags == [desired_tag]:
skipped += 1
continue
print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {[desired_tag]!r}")
patch_disk_tags(name, disk_name, desired_tag)
changed += 1
print(f"done: changed={changed} skipped={skipped}")
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -4,12 +4,11 @@ set -eu
# Longhorn blocks direct CR patches for some settings; use the internal API instead.
api_base="http://longhorn-backend.longhorn-system.svc:9500/v1/settings"
curl_opts="-fsS --connect-timeout 3 --max-time 15"
wait_for_api() {
attempts=30
while [ "${attempts}" -gt 0 ]; do
if curl ${curl_opts} "${api_base}" >/dev/null 2>&1; then
if curl -fsS "${api_base}" >/dev/null 2>&1; then
return 0
fi
attempts=$((attempts - 1))
@ -23,14 +22,14 @@ update_setting() {
name="$1"
value="$2"
current="$(curl ${curl_opts} "${api_base}/${name}" || true)"
current="$(curl -fsS "${api_base}/${name}" || true)"
if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
echo "Setting ${name} already set."
return 0
fi
echo "Setting ${name} -> ${value}"
curl ${curl_opts} -X PUT \
curl -fsS -X PUT \
-H "Content-Type: application/json" \
-d "{\"value\":\"${value}\"}" \
"${api_base}/${name}" >/dev/null
@ -41,7 +40,3 @@ update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v
update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"
# Keep storage-heavy nodes from getting hammered by rebuild storms and skew.
update_setting replica-auto-balance "best-effort"
update_setting concurrent-replica-rebuild-per-node-limit "2"
update_setting node-down-pod-deletion-policy "delete-both-statefulset-and-deployment-pod"

View File

@ -13,13 +13,13 @@ spec:
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
- objectName: "longhorn-backup-b2__AWS_ACCESS_KEY_ID"
- objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ACCESS_KEY_ID"
- objectName: "longhorn-backup-b2__AWS_SECRET_ACCESS_KEY"
- objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_SECRET_ACCESS_KEY"
- objectName: "longhorn-backup-b2__AWS_ENDPOINTS"
- objectName: "longhorn_backup__AWS_ENDPOINTS"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ENDPOINTS"
secretObjects:
@ -31,9 +31,9 @@ spec:
- secretName: longhorn-backup-b2
type: Opaque
data:
- objectName: longhorn-backup-b2__AWS_ACCESS_KEY_ID
- objectName: longhorn_backup__AWS_ACCESS_KEY_ID
key: AWS_ACCESS_KEY_ID
- objectName: longhorn-backup-b2__AWS_SECRET_ACCESS_KEY
- objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
- objectName: longhorn-backup-b2__AWS_ENDPOINTS
- objectName: longhorn_backup__AWS_ENDPOINTS
key: AWS_ENDPOINTS

View File

@ -26,16 +26,6 @@ spec:
- key: hardware
operator: In
values: ["rpi5", "rpi4"]
- weight: 90
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
containers:
- name: sync
image: alpine:3.20

View File

@ -78,7 +78,6 @@ spec:
- --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
- --http-address=0.0.0.0:4180
- --skip-provider-button=true
- --approval-prompt=auto
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=longhorn.bstein.dev

View File

@ -0,0 +1,17 @@
# ConfigMap holding the NATS server configuration; mounted by the nats
# StatefulSet at /etc/nats and loaded via `-c /etc/nats/nats.conf`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: nats-config
  namespace: nats
  labels:
    app: nats
    component: config
  annotations:
    description: "NATS JetStream configuration"
data:
  # JetStream persistence: /data is backed by the StatefulSet's PVC;
  # memory/file store caps bound resource usage on small nodes.
  nats.conf: |
    jetstream {
      store_dir: /data
      max_mem_store: 128MB
      max_file_store: 1GB
    }

View File

@ -0,0 +1,7 @@
# Kustomize entry point for the NATS stack; namespace is listed first so it
# exists before the namespaced resources are applied.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - configmap.yaml
  - service.yaml
  - statefulset.yaml

View File

@ -0,0 +1,4 @@
# Dedicated namespace for the NATS JetStream deployment.
apiVersion: v1
kind: Namespace
metadata:
  name: nats

View File

@ -0,0 +1,17 @@
# ClusterIP Service exposing the NATS client protocol (4222) and the HTTP
# monitoring endpoint (8222) to in-cluster consumers.
apiVersion: v1
kind: Service
metadata:
  name: nats
  namespace: nats
  labels:
    app: nats
spec:
  selector:
    app: nats
  ports:
    - name: client
      port: 4222
      targetPort: 4222
    - name: monitoring
      port: 8222
      targetPort: 8222

View File

@ -0,0 +1,54 @@
# Single-replica NATS server with JetStream persistence.
# Config comes from the nats-config ConfigMap; /data is a per-replica PVC
# matching the jetstream store_dir in nats.conf.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: nats
  namespace: nats
  labels:
    app: nats
spec:
  serviceName: nats
  replicas: 1
  selector:
    matchLabels:
      app: nats
  template:
    metadata:
      labels:
        app: nats
    spec:
      containers:
        - name: nats
          image: nats:2.10.18
          # Load the rendered server config from the ConfigMap mount.
          args:
            - "-c"
            - "/etc/nats/nats.conf"
          ports:
            - name: client
              containerPort: 4222
            - name: monitoring
              containerPort: 8222
          volumeMounts:
            - name: config
              mountPath: /etc/nats
            - name: data
              mountPath: /data
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        - name: config
          configMap:
            name: nats-config
  # Persistent volume for the JetStream file store (sized above the 1GB
  # max_file_store cap in nats.conf).
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 2Gi

View File

@ -25,7 +25,6 @@ spec:
serviceAccountName: postgres-vault
nodeSelector:
node-role.kubernetes.io/worker: "true"
hardware: rpi5
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -36,17 +35,7 @@ spec:
values: ["true"]
- key: hardware
operator: In
values: ["rpi5"]
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-06"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values: ["titan-05", "titan-07", "titan-08", "titan-11"]
values: ["rpi4", "rpi5"]
containers:
- name: postgres
image: postgres:15

View File

@ -2,7 +2,7 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
rules:
- apiGroups:
- ""

View File

@ -2,12 +2,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
subjects:
- kind: ServiceAccount
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
namespace: traefik

View File

@ -70,42 +70,10 @@ items:
dnsPolicy: ClusterFirst
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: atlas-traefik-ingress-controller
serviceAccountName: atlas-traefik-ingress-controller
serviceAccount: traefik-ingress-controller
serviceAccountName: traefik-ingress-controller
terminationGracePeriodSeconds: 30
kind: List
metadata: {}

View File

@ -1,9 +0,0 @@
# infrastructure/traefik/ingressclass.yaml
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
name: traefik
annotations:
ingressclass.kubernetes.io/is-default-class: "true"
spec:
controller: traefik.io/ingress-controller

View File

@ -6,7 +6,6 @@ metadata:
namespace: flux-system
resources:
- crds.yaml
- ingressclass.yaml
- deployment.yaml
- serviceaccount.yaml
- clusterrole.yaml

View File

@ -2,5 +2,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
namespace: traefik

View File

@ -41,12 +41,3 @@ spec:
failurePolicy: Ignore
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13", "titan-15", "titan-17", "titan-19"]

View File

@ -1,152 +0,0 @@
Atlas Cluster Power Recovery (Graceful Shutdown/Startup)
Purpose
- Provide a safe operator flow for planned power events and cold-boot recovery.
- Avoid the Flux/Gitea bootstrap deadlock by using a local bootstrap fallback path.
- Break the Harbor self-hosting deadlock by seeding Harbor runtime images from a control-host bundle.
- Refuse bootstrap when UPS charge is too low, and fall back to fast shutdown if a second outage hits mid-recovery.
Bootstrapping risk to remember
- Flux source is Git over SSH to `scm.bstein.dev` (Gitea).
- Gitea itself is a Flux-managed workload and depends on storage + database.
- Harbor is also critical, but it is not part of the first recovery stage because Harbor serves its own runtime images.
- On cold boot, if Flux cannot fetch source before Gitea is up, reconciliation can stall.
- Recovery path: bring control plane and workers up, then locally apply minimal platform stack (`core -> helm -> longhorn -> metallb -> traefik -> vault-csi -> vault-injector -> vault -> postgres -> gitea`), then seed Harbor images onto the Harbor node from a control-host bundle, then resume/reconcile Flux. Harbor is a later recovery stage after storage, Vault, Postgres, and Gitea are back.
Script
- `scripts/cluster_power_recovery.sh`
- `scripts/cluster_power_console.sh`
- Modes:
- `prepare`
- `shutdown`
- `harbor-seed`
- `startup`
- `status`
- Default is dry-run. Add `--execute` to actually perform actions.
Dry-run examples
- Shutdown preview:
- `scripts/cluster_power_recovery.sh shutdown --skip-etcd-snapshot --skip-drain`
- Startup preview:
- `scripts/cluster_power_recovery.sh startup`
- Harbor seed preview:
- `scripts/cluster_power_recovery.sh harbor-seed`
Execute examples
- Prepare helper image on every node:
- `scripts/cluster_power_recovery.sh prepare --execute`
- Seed Harbor runtime images onto `titan-05` from the control-host bundle:
- `scripts/cluster_power_recovery.sh harbor-seed --execute`
- Planned shutdown:
- `scripts/cluster_power_recovery.sh shutdown --execute`
- Planned startup (canonical branch):
- `scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main`
Manual remote console examples
- Canonical operator hosts:
- `titan-db`
- `tethys` (`titan-24`)
- Both hosts now have:
- `~/ananke-tools/cluster_power_recovery.sh`
- `~/ananke-tools/cluster_power_console.sh`
- `~/ananke-tools/bootstrap/recovery-config.env`
- `~/ananke-tools/bootstrap/harbor-bootstrap-images.txt`
- `~/ananke-tools/kubeconfig`
- `~/ananke-cluster-power`
- `~/bin/ananke-cluster-power`
- `~/ananke-repo/{infrastructure,services,scripts}`
- Both hosts also keep the Harbor bootstrap bundle at:
- `~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`
- Remote usage:
- `ssh titan-db`
- `~/ananke-cluster-power status`
- `~/ananke-cluster-power prepare --execute`
- `~/ananke-cluster-power shutdown --execute`
- `~/ananke-cluster-power startup --execute --force-flux-branch main`
- `ssh tethys`
- `~/ananke-cluster-power status`
- `~/ananke-cluster-power prepare --execute`
- `~/ananke-cluster-power shutdown --execute`
- `~/ananke-cluster-power startup --execute --force-flux-branch main`
Useful options
- `--shutdown-mode host-poweroff|cluster-only`
- `--expected-flux-branch main`
- `--expected-flux-url ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git`
- `--force-flux-url ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git`
- `--force-flux-branch main`
- `--allow-flux-source-mutation` (required with `--force-flux-url`; breakglass only)
- `--skip-local-bootstrap` (not recommended for cold-start recovery)
- `--skip-harbor-bootstrap` (skip the Harbor recovery stage if you know Harbor should stay deferred)
- `--skip-harbor-seed` (skip bundle import if Harbor images are already cached on the target node)
- `--skip-helper-prewarm`
- `--min-startup-battery 35`
- `--ups-host pyrphoros@localhost`
- `--require-ups-battery`
- `--drain-timeout 180`
- `--emergency-drain-timeout 45`
- `--flux-ready-timeout 1200`
- `--startup-checklist-timeout 900`
- `--startup-stability-window 180`
- `--startup-stability-timeout 900`
- `--recovery-state-file ~/.local/share/ananke/cluster_power_recovery.state`
- `--harbor-bundle-file ~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`
Controlled drill checklist (recommended)
- Operator host: use `titan-db` as canonical control host for the drill.
- On-site coordination:
- Have on-site operator ready before shutdown starts.
- Confirm they will manually power cluster nodes back on after shutdown completes.
- Confirm who will announce "all nodes powered on" to resume startup.
- Preflight on `titan-db`:
- `mkdir -p ~/ananke-logs`
- `~/ananke-cluster-power status` and verify:
- `ups_host=pyrphoros@localhost`
- `ups_battery` is numeric
- `flux_source_ready=True`
- Warm helper image just before shutdown:
- `~/ananke-cluster-power prepare --execute`
- Run in a persistent shell and capture logs:
- `tmux new -s ananke-drill`
- `script -q -a ~/ananke-logs/ananke-drill-$(date +%Y%m%d-%H%M%S).log`
- Execute controlled shutdown with telemetry enforcement:
- `~/ananke-cluster-power shutdown --execute --require-ups-battery`
- After on-site power-on confirmation, execute startup:
- `~/ananke-cluster-power startup --execute --force-flux-branch main --require-ups-battery`
- Post-check:
- `~/ananke-cluster-power status`
- Verify critical services (`longhorn`, `vault`, `postgres`, `gitea`, `harbor`, `pegasus`) and no widespread pull/crash failures.
Operational notes
- The flow suspends Flux Kustomizations/HelmReleases during shutdown to prevent churn.
- Shutdown behavior is explicit:
- `host-poweroff` schedules host poweroff after service stop.
- `cluster-only` stops `k3s`/`k3s-agent` without powering hosts off.
- Worker drain is no longer best-effort. If the configured timeout is exhausted, the script escalates from a normal drain, to `--force`, and finally to `--disable-eviction`.
- Startup fails fast if Flux source URL/branch drift from expected values (unless branch override is explicitly requested with `--force-flux-branch`).
- Flux desired-state source remains `titan-iac.git`. Ananke orchestrates runtime recovery and should not be used as the normal Flux source repo.
- During startup, if Flux source is not `Ready`, local bootstrap fallback is applied first using the repo snapshot under `~/ananke-repo`.
- Longhorn is reconciled before Vault/Postgres/Gitea so storage-backed services are not racing the volume layer.
- Harbor is reconciled after the first critical stateful services.
- Harbor bootstrap is now designed around a control-host bundle:
- Build the Harbor bundle locally with `scripts/build_harbor_bootstrap_bundle.sh`.
- Stage it on the operator host at `~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`.
- Use `harbor-seed --execute` or a full `startup --execute` to stream/import that bundle onto `titan-05`.
- The Harbor bundle remains arm64-only because Harbor is pinned to arm64 nodes. The node-helper image is multi-arch because Ananke uses it across both arm64 and amd64 nodes during prepare/shutdown operations.
- Ananke uses a temporary privileged helper pod for host-side operations. The helper image is prewarmed with `prepare --execute` so later shutdown/startup steps do not stall on image pulls.
- The script persists outage state in `~/.local/share/ananke/cluster_power_recovery.state` by default. If startup is attempted during an outage window and power becomes unstable again, rerunning startup with insufficient UPS charge will flip into the emergency shutdown path instead of continuing to bootstrap.
- Startup completion is strict now:
- all non-optional Flux kustomizations must be `Ready=True`
- external service checklist must pass (defaults include Gitea, Grafana, Harbor)
- generated ingress reachability checks must pass (default accepted codes: `200,301,302,307,308,401,403,404`)
- stability soak must pass with no crashloop/pull-failure churn
- If Flux hits immutable one-off Job drift during reconcile, Ananke now attempts self-heal by pruning failed Flux-managed Jobs and retrying reconcile.
- In dry-run mode, the script now skips the live API wait step so preview runs do not stall on an offline cluster.
- Dry-run mode no longer mutates outage recovery state.
- `harbor-seed --execute` was validated by:
- prewarming the helper image across all nodes
- streaming the Harbor bootstrap bundle to `titan-05`
- importing Harbor runtime images into host `containerd`
- successfully running a Harbor-backed canary pod (`harbor-canary-ok`)
- After bootstrap, Flux resources are resumed and reconciled.
- Keep this runbook aligned with `clusters/atlas/flux-system/gotk-sync.yaml`.

View File

@ -1,3 +0,0 @@
[pytest]
addopts = -ra
norecursedirs = .git .venv .venv-ci __pycache__ tmp

View File

@ -1,9 +0,0 @@
# Harbor cold-start bootstrap images.
registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
registry.bstein.dev/infra/harbor-portal:v2.14.1-arm64
registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64
registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
registry.bstein.dev/infra/harbor-redis:v2.14.1-arm64
registry.bstein.dev/infra/harbor-nginx:v2.14.1-arm64
registry.bstein.dev/infra/harbor-prepare:v2.14.1-arm64

View File

@ -1,36 +0,0 @@
# Ananke cluster power-recovery defaults. Sourced by the recovery tooling;
# most values can be overridden per-run with CLI flags (see the runbook).

# Operator host and Flux source-of-truth expectations.
CANONICAL_CONTROL_HOST="titan-db"
DEFAULT_FLUX_BRANCH="main"
EXPECTED_FLUX_URL="ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
SHUTDOWN_MODE="host-poweroff"
STATE_SUBDIR=".local/share/ananke"

# Harbor cold-start bootstrap: bundle streamed/imported onto the Harbor node.
HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
HARBOR_TARGET_NODE=""
HARBOR_CANARY_NODE=""
HARBOR_HOST_LABEL_KEY="ananke.bstein.dev/harbor-bootstrap"
HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"

# Privileged node-helper pod used for host-side operations during prepare/shutdown.
NODE_HELPER_IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0"
NODE_HELPER_NAMESPACE="maintenance"
NODE_HELPER_SERVICE_ACCOUNT="default"
REGISTRY_PULL_SECRET="harbor-regcred"
BUNDLE_HTTP_PORT="8877"

# UPS telemetry endpoint and the key queried for charge level.
# NOTE(review): looks like a NUT-style ups@host target with battery.charge as
# a percentage — confirm against the query logic in cluster_power_recovery.sh.
UPS_HOST="pyrphoros@localhost"
UPS_BATTERY_KEY="battery.charge"

# Startup gating knobs: Flux readiness, external-service checklist, workload
# readiness, and the post-startup stability soak. All *_SECONDS are seconds;
# *_REGEX values are matchers for exclusions (empty = no exclusion).
FLUX_READY_TIMEOUT_SECONDS="1200"
FLUX_READY_POLL_SECONDS="10"
STARTUP_CHECKLIST_TIMEOUT_SECONDS="900"
STARTUP_CHECKLIST_POLL_SECONDS="10"
STARTUP_WORKLOAD_TIMEOUT_SECONDS="900"
STARTUP_WORKLOAD_POLL_SECONDS="10"
STARTUP_STABILITY_WINDOW_SECONDS="180"
STARTUP_STABILITY_TIMEOUT_SECONDS="900"
STARTUP_STABILITY_POLL_SECONDS="10"
STARTUP_OPTIONAL_KUSTOMIZATIONS=""
STARTUP_IGNORE_PODS_REGEX=""
STARTUP_IGNORE_WORKLOADS_REGEX=""
STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="^(kube-system|kube-public|kube-node-lease|flux-system)$"
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="10"
STARTUP_INCLUDE_INGRESS_CHECKS="1"
STARTUP_INGRESS_ALLOWED_STATUSES="200,301,302,307,308,401,403,404"
STARTUP_IGNORE_INGRESS_HOSTS_REGEX=""
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="10"
# Semicolon-separated checklist entries; fields appear to be
# name|url|allowed-codes|required-substring|... — verify against the parser
# in cluster_power_recovery.sh before editing.
STARTUP_SERVICE_CHECKLIST='gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||;grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||;harbor|https://registry.bstein.dev/v2/|200,401|||'

View File

@ -1,56 +0,0 @@
#!/usr/bin/env bash
# Build and push the multi-arch Ananke node-helper image via docker buildx.
# Optional flags override the image tag, docker config dir, target platform
# list, and buildx builder name.
set -euo pipefail

IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0"
DOCKER_CONFIG_PATH=""
PLATFORMS="linux/amd64,linux/arm64"
BUILDER_NAME="ananke-node-helper-builder"

# Each long option takes a mandatory value; ${2:?...} aborts with a message
# when the value is missing.
while [[ $# -gt 0 ]]; do
  flag="$1"
  case "${flag}" in
    --image)         IMAGE="${2:?missing image}";                      shift 2 ;;
    --docker-config) DOCKER_CONFIG_PATH="${2:?missing docker config path}"; shift 2 ;;
    --platforms)     PLATFORMS="${2:?missing platforms}";              shift 2 ;;
    --builder)       BUILDER_NAME="${2:?missing builder}";             shift 2 ;;
    -h|--help)
      cat <<USAGE
Usage: scripts/build_ananke_node_helper.sh [--image <image>] [--docker-config <path>] [--platforms <csv>] [--builder <name>]
USAGE
      exit 0
      ;;
    *)
      echo "Unknown option: ${flag}" >&2
      exit 1
      ;;
  esac
done

# Point docker at an alternate credential store when requested.
[[ -z "${DOCKER_CONFIG_PATH}" ]] || export DOCKER_CONFIG="${DOCKER_CONFIG_PATH}"

# Select the dedicated buildx builder, creating it on first use.
if docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
  docker buildx use "${BUILDER_NAME}" >/dev/null
else
  docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use >/dev/null
fi
docker buildx inspect --bootstrap >/dev/null

# Cross-build for every requested platform and push in one step.
docker buildx build \
  --platform "${PLATFORMS}" \
  -f dockerfiles/Dockerfile.ananke-node-helper \
  -t "${IMAGE}" \
  --push \
  .

View File

@ -1,58 +0,0 @@
#!/usr/bin/env bash
# Build the Harbor cold-start bootstrap bundle: pull the pinned Harbor runtime
# images for the target platform and pack them into a zstd-compressed tarball
# that the recovery tooling streams onto the Harbor node.
#
# Fixes over the previous revision:
#  - preflight checks for docker/zstd and a clear error for a missing images
#    file (a missing file previously surfaced as the misleading "No images
#    found" because the grep failure was swallowed by process substitution)
#  - `zstd -f` so re-runs overwrite a stale bundle instead of aborting
set -euo pipefail

IMAGES_FILE="scripts/bootstrap/harbor-bootstrap-images.txt"
BUNDLE_FILE="artifacts/harbor-bootstrap-v2.14.1-arm64.tar.zst"
DOCKER_CONFIG_PATH=""
PLATFORM="linux/arm64"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --images-file)
      IMAGES_FILE="${2:?missing images file}"
      shift 2
      ;;
    --bundle-file)
      BUNDLE_FILE="${2:?missing bundle file}"
      shift 2
      ;;
    --docker-config)
      DOCKER_CONFIG_PATH="${2:?missing docker config path}"
      shift 2
      ;;
    --platform)
      PLATFORM="${2:?missing platform}"
      shift 2
      ;;
    -h|--help)
      cat <<USAGE
Usage: scripts/build_harbor_bootstrap_bundle.sh [--images-file <path>] [--bundle-file <path>] [--docker-config <path>] [--platform <linux/arm64>]
USAGE
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Fail fast with actionable messages before any slow pulls start.
command -v docker >/dev/null 2>&1 || { echo "docker is required" >&2; exit 1; }
command -v zstd >/dev/null 2>&1 || { echo "zstd is required" >&2; exit 1; }
if [[ ! -f "${IMAGES_FILE}" ]]; then
  echo "Images file not found: ${IMAGES_FILE}" >&2
  exit 1
fi

if [[ -n "${DOCKER_CONFIG_PATH}" ]]; then
  export DOCKER_CONFIG="${DOCKER_CONFIG_PATH}"
fi

# Load image refs, dropping comment lines and blank lines.
mapfile -t IMAGES < <(grep -v '^[[:space:]]*#' "${IMAGES_FILE}" | sed '/^[[:space:]]*$/d')
if [[ ${#IMAGES[@]} -eq 0 ]]; then
  echo "No images found in ${IMAGES_FILE}" >&2
  exit 1
fi

mkdir -p "$(dirname "${BUNDLE_FILE}")"
for image in "${IMAGES[@]}"; do
  echo "Pulling ${image}" >&2
  docker pull --platform "${PLATFORM}" "${image}" >/dev/null
done

# -T0: use all cores; -19: high compression; -f: overwrite an existing bundle.
docker save "${IMAGES[@]}" | zstd -T0 -19 -f -o "${BUNDLE_FILE}"
echo "Wrote ${BUNDLE_FILE}" >&2

View File

@ -1,87 +0,0 @@
#!/usr/bin/env bash
# Friendly manual entrypoint for running the cluster power-recovery script
# from a remote console. Runs the recovery script locally when one is usable,
# otherwise delegates the whole invocation over SSH to a control host
# (titan-db by default). Quoting of delegated args is load-bearing; see the
# printf '%q' usage below.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
usage() {
  cat <<'USAGE'
Usage:
  scripts/cluster_power_console.sh [--repo-dir <path>] [--delegate-host <host>] <shutdown|startup> [recovery-script-options...]
Purpose:
  Friendly manual entrypoint for running Ananke from a remote console.
  Canonical control host is titan-db by default so bundle/state handling stays in one place.
Defaults:
  --repo-dir $HOME/Development/ananke (fallback: $HOME/Development/titan-iac)
  --delegate-host titan-db
Examples:
  scripts/cluster_power_console.sh shutdown --execute
  scripts/cluster_power_console.sh startup --execute --force-flux-branch main
  scripts/cluster_power_console.sh --delegate-host titan-24 shutdown --execute
USAGE
}
# Prefer the ananke checkout; fall back to the titan-iac checkout.
if [[ -d "${HOME}/Development/ananke" ]]; then
  REPO_DIR="${HOME}/Development/ananke"
else
  REPO_DIR="${HOME}/Development/titan-iac"
fi
DELEGATE_HOST="titan-db"
REMOTE_REPO_DIR="${ANANKE_REMOTE_REPO_DIR:-}"
# Parse our own options; the first unrecognized word (the mode) and everything
# after it is passed through to the recovery script untouched.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --repo-dir)
      REPO_DIR="${2:-}"
      shift 2
      ;;
    --delegate-host)
      DELEGATE_HOST="${2:-}"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      break
      ;;
  esac
done
if [[ $# -lt 1 ]]; then
  usage
  exit 1
fi
# Local execution path: use the sibling script next to this one, or the repo
# copy, but only when kubectl is available on this host.
SIBLING_SCRIPT="${SCRIPT_DIR}/cluster_power_recovery.sh"
REPO_SCRIPT="${REPO_DIR}/scripts/cluster_power_recovery.sh"
LOCAL_SCRIPT=""
if [[ -x "${SIBLING_SCRIPT}" ]]; then
  LOCAL_SCRIPT="${SIBLING_SCRIPT}"
elif [[ -x "${REPO_SCRIPT}" ]]; then
  LOCAL_SCRIPT="${REPO_SCRIPT}"
fi
if [[ -n "${LOCAL_SCRIPT}" ]] && command -v kubectl >/dev/null 2>&1; then
  exec "${LOCAL_SCRIPT}" "$@"
fi
if [[ -z "${DELEGATE_HOST}" ]]; then
  echo "cluster-power-console: no usable local recovery script found and no delegate host configured" >&2
  exit 1
fi
# Delegation path: shell-quote every argument so the remote command survives
# the ssh round-trip intact, then try the staged tools copy first and the
# repo checkout second on the remote host.
quoted_args="$(printf '%q ' "$@")"
quoted_repo_dir="$(printf '%q' "${REPO_DIR}")"
remote_cmd=""
if [[ -n "${REMOTE_REPO_DIR}" ]]; then
  remote_cmd+="ANANKE_REPO_DIR=$(printf '%q' "${REMOTE_REPO_DIR}") "
fi
remote_cmd+="if [ -x ~/ananke-tools/cluster_power_recovery.sh ]; then ~/ananke-tools/cluster_power_recovery.sh ${quoted_args}; elif [ -x ${quoted_repo_dir}/scripts/cluster_power_recovery.sh ]; then ${quoted_repo_dir}/scripts/cluster_power_recovery.sh ${quoted_args}; else echo 'cluster-power-console: remote recovery script not found' >&2; exit 1; fi"
# BatchMode avoids interactive auth prompts hanging an unattended run.
exec ssh -o BatchMode=yes -o ConnectTimeout=8 "${DELEGATE_HOST}" "${remote_cmd}"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -539,9 +539,9 @@ def main() -> int:
help="Write generated files (otherwise just print a summary).",
)
ap.add_argument(
"--sync-comms",
"--sync-atlasbot",
action="store_true",
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
)
args = ap.parse_args()
@ -632,10 +632,10 @@ def main() -> int:
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
if args.sync_comms:
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
_sync_tree(out_dir, comms_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
if args.sync_atlasbot:
atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
_sync_tree(out_dir, atlasbot_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
return 0

View File

@ -1,163 +0,0 @@
#!/usr/bin/env bash
# Prepare a node for recovery/replacement: snapshot its labels, taints, and
# scheduled pods into an artifacts directory, emit replayable restore scripts,
# then (optionally) cordon/drain and delete the Node object. Drain escalates
# from a normal drain, to --force, to --disable-eviction.
set -euo pipefail
usage() {
  cat <<USAGE
Usage: scripts/node_recover.sh <node-name> [options]
Options:
  --yes Skip confirmation prompt
  --skip-drain Do not cordon/drain; only capture recovery artifacts
  --delete-node Delete Node object after drain (for hard-dead node replacement)
  --out-dir <dir> Recovery artifact directory (default: ./artifacts/node-recovery)
  -h, --help Show this help
USAGE
}
# Hard prerequisites for both artifact capture and the jq-based restore scripts.
if ! command -v kubectl >/dev/null 2>&1; then
  echo "kubectl is required" >&2
  exit 1
fi
if ! command -v jq >/dev/null 2>&1; then
  echo "jq is required" >&2
  exit 1
fi
if [ "$#" -lt 1 ]; then
  usage
  exit 1
fi
node=""
assume_yes="false"
skip_drain="false"
delete_node="false"
out_dir="./artifacts/node-recovery"
# First bare argument is the node name; anything after it is an error.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --yes)
      assume_yes="true"
      shift
      ;;
    --skip-drain)
      skip_drain="true"
      shift
      ;;
    --delete-node)
      delete_node="true"
      shift
      ;;
    --out-dir)
      out_dir="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    -*)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
    *)
      if [ -z "${node}" ]; then
        node="$1"
      else
        echo "Unexpected argument: $1" >&2
        usage
        exit 1
      fi
      shift
      ;;
  esac
done
if [ -z "${node}" ]; then
  echo "Node name is required" >&2
  usage
  exit 1
fi
if ! kubectl get node "${node}" >/dev/null 2>&1; then
  echo "Node ${node} not found in cluster API" >&2
  exit 1
fi
# Destructive-operation guard: operator must retype the exact node name.
if [ "${assume_yes}" != "true" ]; then
  echo "About to prepare recovery workflow for node: ${node}"
  echo "skip_drain=${skip_drain} delete_node=${delete_node}"
  read -r -p "Type the node name to continue: " confirm
  if [ "${confirm}" != "${node}" ]; then
    echo "Confirmation did not match node name; aborting."
    exit 1
  fi
fi
# Timestamped artifact directory so repeated runs never clobber each other.
timestamp="$(date +%Y%m%d-%H%M%S)"
artifacts_dir="${out_dir}/${node}-${timestamp}"
mkdir -p "${artifacts_dir}"
echo "Saving node and workload artifacts to ${artifacts_dir}"
kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
# Generate a restore script for operator-applied labels only: hostname,
# instance-type, and the kubernetes.io/beta/node namespaces are node-identity
# labels managed by kubelet and must not be replayed onto a replacement.
jq -r '
.metadata.labels
| to_entries[]
| select(
.key != "kubernetes.io/hostname"
and .key != "beta.kubernetes.io/hostname"
and .key != "node.kubernetes.io/instance-type"
and .key != "beta.kubernetes.io/instance-type"
and (.key | startswith("kubernetes.io/") | not)
and (.key | startswith("beta.kubernetes.io/") | not)
and (.key | startswith("node.kubernetes.io/") | not)
)
| "kubectl label node <replacement-node> " + .key + "=" + .value + " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
# Generate a matching taint-restore script; taint values are optional so the
# "=value" part is emitted only when present.
jq -r '
(.spec.taints // [])[]
| "kubectl taint node <replacement-node> "
+ .key
+ (if .value then "=" + .value else "" end)
+ ":"
+ .effect
+ " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
if [ "${skip_drain}" != "true" ]; then
  echo "Cordoning ${node}"
  # Cordon is best-effort: a failure here should not block the drain attempt.
  kubectl cordon "${node}" || true
  echo "Draining ${node}"
  # Escalating drain: normal -> --force (unmanaged pods) -> --disable-eviction
  # (bypass PodDisruptionBudgets). The final attempt is allowed to fail the
  # script under set -e.
  if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
    echo "Standard drain failed; retrying with --force"
    if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then
      echo "Force drain failed; retrying with --disable-eviction"
      kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction
    fi
  fi
fi
if [ "${delete_node}" = "true" ]; then
  echo "Deleting node object ${node}"
  kubectl delete node "${node}" || true
fi
cat <<NEXT
Recovery prep complete for ${node}.
Artifacts: ${artifacts_dir}
Next steps:
1) Reimage/reprovision replacement host.
2) Rejoin k3s and wait for node Ready.
3) Reapply labels: ${artifacts_dir}/restore-labels.sh
4) Reapply taints: ${artifacts_dir}/restore-taints.sh
5) Validate pods and uncordon replacement when ready.
NEXT

View File

@ -4,21 +4,13 @@ import pathlib
def load_module():
path = pathlib.Path(__file__).resolve().parents[1] / "dashboards_render_atlas.py"
spec = importlib.util.spec_from_file_location("scripts.dashboards_render_atlas", path)
spec = importlib.util.spec_from_file_location("dashboards_render_atlas", path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def flatten_panels(panels):
flat = []
for panel in panels:
flat.append(panel)
flat.extend(panel.get("panels", []))
return flat
def test_table_panel_options_and_filterable():
mod = load_module()
panel = mod.table_panel(
@ -50,18 +42,6 @@ def test_node_filter_and_expr_helpers():
assert "node_memory_MemAvailable_bytes" in mem_expr
def test_overview_availability_panel_uses_recorded_365d_rollup():
mod = load_module()
dashboard = mod.build_overview()
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
assert panel["title"] == "Atlas Availability (365d)"
assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])'
assert panel["targets"][0]["instant"] is True
assert "precomputed" in panel["description"]
assert "scrape gaps are ignored" in panel["description"]
def test_render_configmap_writes(tmp_path):
mod = load_module()
mod.DASHBOARD_DIR = tmp_path / "dash"
@ -76,93 +56,3 @@ def test_render_configmap_writes(tmp_path):
content = (tmp_path / "cm.yaml").read_text()
assert "kind: ConfigMap" in content
assert f"{uid}.json" in content
def test_testing_suite_variable_uses_canonical_values_only():
mod = load_module()
variable = mod.testing_suite_variable()
canonical_matcher = "|".join(mod.PLATFORM_TEST_SUITE_NAMES)
legacy_names = {"bstein-home", "data-prepper", "titan-iac", "pegasus-health"}
assert variable["allValue"] == canonical_matcher
assert not any(alias in variable["query"] for alias in legacy_names)
assert not any(alias in variable["allValue"] for alias in legacy_names)
assert [option["value"] for option in variable["options"]] == mod.PLATFORM_TEST_SUITE_NAMES
def test_jobs_dashboard_separates_current_gate_health_from_reliability():
mod = load_module()
dashboard = mod.build_jobs_dashboard()
panels_by_title = {panel["title"]: panel for panel in flatten_panels(dashboard["panels"])}
assert "Current Gate Health by Suite" in panels_by_title
assert "Run Reliability by Suite (24h)" in panels_by_title
assert "Run Reliability History by Suite" in panels_by_title
assert "Failures by Suite (24h)" not in panels_by_title
assert "Success Rate by Suite (24h)" not in panels_by_title
current_gate_expr = panels_by_title["Current Gate Health by Suite"]["targets"][0]["expr"]
assert 'check)' in current_gate_expr
assert 'result=~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
reliability_panel = panels_by_title["Run Reliability by Suite (24h)"]
reliability_expr = reliability_panel["targets"][0]["expr"]
assert "platform_quality_gate_runs_total" in reliability_expr
assert "> 0" in reliability_expr
assert "- 1" in reliability_expr
assert reliability_panel["fieldConfig"]["defaults"]["mappings"] == [
{"type": "value", "options": {"-1": {"text": "no runs"}}}
]
def test_jobs_dashboard_bar_gauges_use_solid_threshold_colors():
mod = load_module()
dashboard = mod.build_jobs_dashboard()
panels = flatten_panels(dashboard["panels"])
bar_gauges = [panel for panel in panels if panel["type"] == "bargauge"]
assert bar_gauges
assert all(panel["options"]["displayMode"] == "basic" for panel in bar_gauges)
assert all(
panel["fieldConfig"]["defaults"]["color"]["mode"] == "thresholds"
for panel in bar_gauges
)
reliability_panel = next(
panel for panel in panels if panel["title"] == "Run Reliability by Suite (24h)"
)
threshold_steps = reliability_panel["fieldConfig"]["defaults"]["thresholds"]["steps"]
assert {"color": "yellow", "value": 93} in threshold_steps
assert {"color": "blue", "value": 100} in threshold_steps
def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint():
mod = load_module()
dashboard = mod.build_jobs_dashboard()
panels = dashboard["panels"]
rows = [panel for panel in panels if panel["type"] == "row"]
visible_query_panels = [panel for panel in panels if panel["type"] != "row"]
nested_panels_by_title = {
child["title"]: child
for row in rows
for child in row.get("panels", [])
}
assert len(panels) == 16
assert len(visible_query_panels) == 11
assert sum(len(panel.get("targets", [])) for panel in visible_query_panels) == 11
assert [row["title"] for row in rows] == [
"Reliability And Run History",
"Failure Trends By Check",
"Success Trends By Check",
"Test Drilldowns And Problem Tests",
"Telemetry Completeness, SonarQube, And Branches",
]
assert all(row["collapsed"] for row in rows)
assert "Failure Trend: Coverage" in nested_panels_by_title
assert "Success Trend: Supply Chain" in nested_panels_by_title
assert "Selected Test Pass Rate History" in nested_panels_by_title
assert "Missing Coverage Metrics by Suite" in nested_panels_by_title
assert "SonarQube API Up" in nested_panels_by_title

View File

@ -1,7 +1,5 @@
import importlib.util
import pathlib
import sys
import types
import pytest
@ -22,26 +20,6 @@ def load_sync_module(monkeypatch):
}
for k, v in env.items():
monkeypatch.setenv(k, v)
fake_psycopg2 = types.ModuleType("psycopg2")
fake_psycopg2.Error = Exception
fake_psycopg2.connect = lambda **kwargs: None
fake_psycopg2_extras = types.ModuleType("psycopg2.extras")
fake_psycopg2_extras.RealDictCursor = object
fake_passlib = types.ModuleType("passlib")
fake_passlib_hash = types.ModuleType("passlib.hash")
class _FakeBcryptSha256:
@staticmethod
def hash(password):
return f"stub:{password}"
fake_passlib_hash.bcrypt_sha256 = _FakeBcryptSha256
fake_passlib.hash = fake_passlib_hash
monkeypatch.setitem(sys.modules, "psycopg2", fake_psycopg2)
monkeypatch.setitem(sys.modules, "psycopg2.extras", fake_psycopg2_extras)
monkeypatch.setitem(sys.modules, "passlib", fake_passlib)
monkeypatch.setitem(sys.modules, "passlib.hash", fake_passlib_hash)
module_path = (
pathlib.Path(__file__).resolve().parents[2]
/ "services"
@ -138,100 +116,6 @@ def test_kc_get_users_paginates(monkeypatch):
assert sync.SESSION.calls == 1
def test_kc_get_users_fetches_second_page_after_full_batch(monkeypatch):
    """A full first page (200 users) triggers a second paginated request."""
    sync = load_sync_module(monkeypatch)

    class _PagedSession:
        # Fake HTTP session: first call yields a full batch, second a single user.
        def __init__(self):
            self.calls = 0
            self.first_params = []

        def get(self, *_, **kwargs):
            self.calls += 1
            self.first_params.append(kwargs["params"]["first"])
            if self.calls == 1:
                return _FakeResponse([{"id": f"u{i}"} for i in range(200)])
            return _FakeResponse([{"id": "last"}])

    sync.SESSION = _PagedSession()
    fetched = sync.kc_get_users("tok")
    assert len(fetched) == 201
    assert sync.SESSION.first_params == [0, 200]
def test_get_kc_token_posts_client_credentials(monkeypatch):
    """Token retrieval posts a client_credentials grant and returns the token."""
    sync = load_sync_module(monkeypatch)
    recorded = []

    class _TokenSession:
        def post(self, url, data, timeout):
            recorded.append((url, data, timeout))
            return _FakeResponse({"access_token": "tok"})

    sync.SESSION = _TokenSession()
    token = sync.get_kc_token()
    assert token == "tok"
    assert recorded[0][1]["grant_type"] == "client_credentials"
def test_retry_request_retries_then_succeeds(monkeypatch):
    """One transient RequestException is retried after a 2-second backoff."""
    sync = load_sync_module(monkeypatch)
    call_log = []
    sleep_log = []

    def _flaky():
        call_log.append(1)
        if len(call_log) == 1:
            raise sync.requests.RequestException("temporary")
        return "ok"

    monkeypatch.setattr(sync.time, "sleep", sleep_log.append)
    assert sync.retry_request("request", _flaky, attempts=2) == "ok"
    assert sleep_log == [2]
def test_retry_request_reraises_final_error(monkeypatch):
    """When every attempt fails, the final RequestException propagates."""
    sync = load_sync_module(monkeypatch)
    monkeypatch.setattr(sync.time, "sleep", lambda _seconds: None)

    def _always_fail():
        raise sync.requests.RequestException("nope")

    with pytest.raises(sync.requests.RequestException):
        sync.retry_request("request", _always_fail, attempts=1)
def test_retry_db_connect_retries_then_succeeds(monkeypatch):
    """A single psycopg2 error is retried after a 2-second backoff."""
    sync = load_sync_module(monkeypatch)
    connect_calls = []
    sleep_log = []

    def _connect(**kwargs):
        connect_calls.append(kwargs)
        if len(connect_calls) == 1:
            raise sync.psycopg2.Error("not yet")
        return "conn"

    monkeypatch.setattr(sync.psycopg2, "connect", _connect)
    monkeypatch.setattr(sync.time, "sleep", sleep_log.append)
    assert sync.retry_db_connect(attempts=2) == "conn"
    assert sleep_log == [2]
def test_retry_db_connect_reraises_final_error(monkeypatch):
    """When the database never comes up, the final psycopg2.Error propagates."""
    sync = load_sync_module(monkeypatch)

    def _always_down(**_kwargs):
        raise sync.psycopg2.Error("down")

    monkeypatch.setattr(sync.psycopg2, "connect", _always_down)
    monkeypatch.setattr(sync.time, "sleep", lambda _seconds: None)
    with pytest.raises(sync.psycopg2.Error):
        sync.retry_db_connect(attempts=1)
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
sync = load_sync_module(monkeypatch)
executed = []
@ -260,87 +144,6 @@ def test_ensure_mailu_user_upserts(monkeypatch):
assert captured["password"] != "pw"
def test_attribute_and_email_helpers(monkeypatch):
    """Attribute extraction, mailu enablement, and email resolution helpers."""
    sync = load_sync_module(monkeypatch)

    # First element of a list wins; empty lists resolve to None; scalars pass through.
    assert sync.get_attribute_value({"x": ["first", "second"]}, "x") == "first"
    assert sync.get_attribute_value({"x": []}, "x") is None
    assert sync.get_attribute_value({"x": "value"}, "x") == "value"

    assert sync.mailu_enabled({"mailu_email": ["legacy@example.com"]}) is True
    assert sync.mailu_enabled({"mailu_enabled": ["off"]}) is False

    local_user = {"username": "fallback", "email": "user@example.com"}
    foreign_user = {"username": "fallback", "email": "user@other.com"}
    # Foreign-domain addresses fall back to username@<local domain>.
    assert sync.resolve_mailu_email(local_user, {}) == "user@example.com"
    assert sync.resolve_mailu_email(foreign_user, {}) == "fallback@example.com"
def test_safe_update_payload_filters_fields(monkeypatch):
    """Only whitelisted fields survive; malformed values are sanitized."""
    sync = load_sync_module(monkeypatch)

    raw = {
        "username": "user",
        "enabled": True,
        "email": "user@example.com",
        "emailVerified": False,
        "firstName": "User",
        "lastName": "Example",
        "requiredActions": ["UPDATE_PASSWORD", 7],
        "attributes": "not-a-dict",
        "ignored": "value",
    }
    expected = {
        "username": "user",
        "enabled": True,
        "email": "user@example.com",
        "emailVerified": False,
        "firstName": "User",
        "lastName": "Example",
        # Non-string entries are dropped; non-dict attributes collapse to {}.
        "requiredActions": ["UPDATE_PASSWORD"],
        "attributes": {},
    }
    assert sync._safe_update_payload(raw) == expected
def test_ensure_system_mailboxes_handles_configurations(monkeypatch, capsys):
    """A missing password only warns; otherwise each mailbox is ensured and
    per-mailbox failures are logged without aborting the batch."""
    sync = load_sync_module(monkeypatch)
    ensured = []
    monkeypatch.setattr(
        sync, "MAILU_SYSTEM_USERS", ["postmaster@example.com", "abuse"]
    )

    # Without a system password the helper only prints a warning.
    monkeypatch.setattr(sync, "MAILU_SYSTEM_PASSWORD", "")
    sync.ensure_system_mailboxes(object())
    assert "MAILU_SYSTEM_PASSWORD is missing" in capsys.readouterr().out

    def _ensure(cursor, email, password, display_name):
        ensured.append((email, password, display_name))
        if email == "abuse":
            raise RuntimeError("boom")

    monkeypatch.setattr(sync, "MAILU_SYSTEM_PASSWORD", "pw")
    monkeypatch.setattr(sync, "ensure_mailu_user", _ensure)
    sync.ensure_system_mailboxes(object())

    captured = capsys.readouterr().out
    assert ensured == [
        ("postmaster@example.com", "pw", "postmaster"),
        ("abuse", "pw", "abuse"),
    ]
    assert "Ensured system mailbox for postmaster@example.com" in captured
    assert "Failed to ensure system mailbox abuse" in captured
def test_main_exits_without_users_or_system_mailboxes(monkeypatch, capsys):
    """main() exits early and says so when Keycloak returns no users."""
    sync = load_sync_module(monkeypatch)
    monkeypatch.setattr(sync, "MAILU_SYSTEM_USERS", [])
    monkeypatch.setattr(sync, "get_kc_token", lambda: "tok")
    monkeypatch.setattr(sync, "kc_get_users", lambda token: [])

    sync.main()

    assert "No users found; exiting." in capsys.readouterr().out
def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")

View File

@ -1,134 +0,0 @@
import importlib.util
import io
import pathlib
import types
def load_listener_module(monkeypatch):
    """Import the mailu sync listener script as an isolated test module.

    Sets a zero wait timeout so tests never block on sync completion.
    """
    monkeypatch.setenv("MAILU_SYNC_WAIT_TIMEOUT_SEC", "0")
    script_path = (
        pathlib.Path(__file__).resolve().parents[2]
        / "services"
        / "mailu"
        / "scripts"
        / "mailu_sync_listener.py"
    )
    spec = importlib.util.spec_from_file_location(
        "mailu_sync_listener_testmod", script_path
    )
    loaded = importlib.util.module_from_spec(spec)
    assert spec.loader is not None
    spec.loader.exec_module(loaded)
    return loaded
def _handler_for(listener, body):
    """Build a bare Handler wired to a fake request body.

    Bypasses __init__ (which would try to serve a real socket) and replaces
    the response-writing methods with recorders the tests can assert on.
    """
    handler = listener.Handler.__new__(listener.Handler)
    payload = body.encode() if isinstance(body, str) else body
    handler.headers = {"Content-Length": str(len(payload))}
    handler.rfile = io.BytesIO(payload)
    handler.responses = []
    handler.headers_ended = 0
    handler.send_response = lambda code: handler.responses.append(code)
    handler.end_headers = lambda: setattr(
        handler, "headers_ended", handler.headers_ended + 1
    )
    return handler
def test_listener_run_sync_blocking_updates_state(monkeypatch):
    """_run_sync_blocking records the rc and timestamp and signals completion."""
    listener = load_listener_module(monkeypatch)
    monkeypatch.setattr(listener, "time", lambda: 42.0)
    monkeypatch.setattr(
        listener.subprocess,
        "run",
        lambda command, check: types.SimpleNamespace(returncode=3),
    )

    assert listener._run_sync_blocking() == 3
    assert listener.last_rc == 3
    assert listener.last_run == 42.0
    assert listener.sync_done.is_set()

    # While a sync is flagged as running, another call returns 0
    # (presumably a no-op guard — confirmed only by the asserted value).
    listener.sync_running = True
    assert listener._run_sync_blocking() == 0
def test_listener_trigger_sync_async_honors_running_and_debounce(monkeypatch):
    """Triggering is refused while a sync runs or inside the debounce window;
    force=True bypasses the debounce and starts a daemon thread."""
    listener = load_listener_module(monkeypatch)
    started = []

    class _RecordingThread:
        def __init__(self, target, daemon):
            self.target = target
            self.daemon = daemon

        def start(self):
            started.append((self.target, self.daemon))

    monkeypatch.setattr(listener.threading, "Thread", _RecordingThread)
    monkeypatch.setattr(listener, "time", lambda: 100.0)

    listener.sync_running = True
    assert listener._trigger_sync_async() is False

    listener.sync_running = False
    listener.last_run = 95.0
    assert listener._trigger_sync_async() is False

    assert listener._trigger_sync_async(force=True) is True
    assert started and started[0][1] is True
def test_listener_post_rejects_invalid_json(monkeypatch):
    """A malformed JSON body yields a single 400 response."""
    listener = load_listener_module(monkeypatch)
    handler = _handler_for(listener, b"{not-json")

    handler.do_POST()

    assert handler.responses == [400]
    assert handler.headers_ended == 1
def test_listener_post_triggers_async_without_wait(monkeypatch):
    """A non-waiting POST fires an async sync and answers 202 Accepted."""
    listener = load_listener_module(monkeypatch)
    forwarded = []
    monkeypatch.setattr(
        listener,
        "_trigger_sync_async",
        lambda force=False: forwarded.append(force) or True,
    )

    handler = _handler_for(listener, '{"force": true}')
    handler.do_POST()

    assert forwarded == [True]
    assert handler.responses == [202]
def test_listener_post_wait_returns_success_or_failure(monkeypatch):
    """Waiting POSTs map the last sync rc to 200 (ok) or 500 (failure)."""
    listener = load_listener_module(monkeypatch)
    forwarded = []
    monkeypatch.setattr(
        listener,
        "_trigger_sync_async",
        lambda force=False: forwarded.append(force) or True,
    )
    listener.sync_running = False

    listener.last_rc = 0
    ok_handler = _handler_for(listener, '{"wait": true, "force": true}')
    ok_handler.do_POST()
    assert forwarded == [True]
    assert ok_handler.responses == [200]

    listener.last_rc = 2
    failing_handler = _handler_for(listener, '{"wait": true}')
    failing_handler.do_POST()
    assert failing_handler.responses == [500]
def test_listener_post_wait_keeps_running_request_successful(monkeypatch):
    """A waiting POST made while a sync is still in flight reports 200."""
    listener = load_listener_module(monkeypatch)
    listener.sync_running = True

    handler = _handler_for(listener, '{"wait": true}')
    handler.do_POST()

    assert handler.responses == [200]
def test_listener_log_message_is_quiet(monkeypatch):
    """Handler.log_message is silenced (returns None, emits nothing)."""
    listener = load_listener_module(monkeypatch)
    handler = listener.Handler.__new__(listener.Handler)
    assert handler.log_message("ignored %s", "value") is None

View File

@ -1,73 +0,0 @@
#!/usr/bin/env bash
# Verify the Ariadne Jenkins-workspace-cleanup rollout end to end:
# reconcile Flux, wait for the deployment, check env wiring, scrape
# /metrics through a port-forward, and surface recent cleanup activity.
#
# Usage: $0 [dry-run|active]   (default: dry-run)
# Env overrides: KUSTOMIZATION, NAMESPACE, DEPLOYMENT, LOCAL_METRICS_PORT
set -euo pipefail

MODE="${1:-dry-run}"
if [[ "$MODE" != "dry-run" && "$MODE" != "active" ]]; then
  echo "usage: $0 [dry-run|active]" >&2
  exit 2
fi

# Map the CLI mode onto the expected env value and Prometheus mode label.
EXPECTED_DRY_RUN="true"
PROM_MODE="dry_run"
if [[ "$MODE" == "active" ]]; then
  EXPECTED_DRY_RUN="false"
  PROM_MODE="delete"
fi

KUSTOMIZATION="${KUSTOMIZATION:-maintenance}"
NAMESPACE="${NAMESPACE:-maintenance}"
DEPLOYMENT="${DEPLOYMENT:-ariadne}"
LOCAL_METRICS_PORT="${LOCAL_METRICS_PORT:-18080}"

for cmd in flux kubectl curl grep awk; do
  if ! command -v "$cmd" >/dev/null 2>&1; then
    echo "missing required command: $cmd" >&2
    exit 2
  fi
done

echo "[1/5] reconcile Flux kustomization: ${KUSTOMIZATION}"
flux reconcile kustomization "$KUSTOMIZATION" --namespace flux-system --with-source

echo "[2/5] wait for deployment rollout"
kubectl -n "$NAMESPACE" rollout status "deployment/$DEPLOYMENT" --timeout=5m

echo "[3/5] verify ariadne env wiring"
# Each grep aborts the script (set -e) if an expected env var is missing.
ENV_DUMP="$(kubectl -n "$NAMESPACE" get deployment "$DEPLOYMENT" -o jsonpath='{range .spec.template.spec.containers[0].env[*]}{.name}={.value}{"\n"}{end}')"
echo "$ENV_DUMP" | grep -F "ARIADNE_SCHEDULE_JENKINS_WORKSPACE_CLEANUP=45 */6 * * *"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_NAMESPACE=jenkins"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_PVC_PREFIX=pvc-workspace-"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_MIN_AGE_HOURS=24"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_DRY_RUN=${EXPECTED_DRY_RUN}"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_MAX_DELETIONS_PER_RUN=20"

echo "[4/5] scrape /metrics and confirm cleanup metrics are exported"
PF_LOG="$(mktemp)"
METRICS_FILE="$(mktemp)"
cleanup() {
  if [[ -n "${PF_PID:-}" ]]; then
    kill "$PF_PID" >/dev/null 2>&1 || true
    wait "$PF_PID" 2>/dev/null || true
  fi
  rm -f -- "$PF_LOG" "$METRICS_FILE"
}
trap cleanup EXIT

kubectl -n "$NAMESPACE" port-forward "deployment/$DEPLOYMENT" "${LOCAL_METRICS_PORT}:8080" >"$PF_LOG" 2>&1 &
PF_PID=$!

# Poll instead of a fixed sleep: port-forward readiness is not instant and a
# single blind 2s wait raced it on slow nodes.
attempt=0
until curl -fsS "http://127.0.0.1:${LOCAL_METRICS_PORT}/metrics" >"$METRICS_FILE" 2>/dev/null; do
  attempt=$((attempt + 1))
  if [[ "$attempt" -ge 15 ]]; then
    echo "metrics endpoint never became reachable on port ${LOCAL_METRICS_PORT}" >&2
    exit 1
  fi
  sleep 1
done

grep -F "# HELP ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE"
grep -F "# HELP ariadne_jenkins_workspace_cleanup_objects_total" "$METRICS_FILE"

echo "[5/5] show recent cleanup signal"
# Look for an actual sample line (not the "# HELP"/"# TYPE" comments) carrying
# the expected mode label. The previous version tested only that the metric
# name appeared anywhere in the file, which the HELP check above had already
# guaranteed — its "no sample yet" branch was unreachable.
if grep -- "ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE" \
    | grep -v '^#' | grep -q "mode=\"${PROM_MODE}\""; then
  grep -- "ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE" \
    | grep -- "mode=\"${PROM_MODE}\"" || true
else
  echo "No run counter sample yet for mode=${PROM_MODE}; wait for schedule window and re-run." >&2
fi

echo "Recent cleanup logs (if any):"
kubectl -n "$NAMESPACE" logs "deployment/$DEPLOYMENT" --tail=500 | grep -i "jenkins workspace cleanup" | tail -n 20 || true
echo "verification complete for mode=${MODE}"

View File

@ -5,7 +5,7 @@ metadata:
name: ollama
namespace: ai
spec:
replicas: 0
replicas: 1
revisionHistoryLimit: 2
strategy:
type: RollingUpdate
@ -21,7 +21,7 @@ spec:
app: ollama
annotations:
ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-20/21)
ai.bstein.dev/gpu: GPU pool (titan-22/24)
ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
spec:
affinity:
@ -32,13 +32,13 @@ spec:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- titan-22
- titan-24
runtimeClassName: nvidia
volumes:
- name: models
persistentVolumeClaim:
claimName: ollama-models-asteria
claimName: ollama-models
initContainers:
- name: warm-model
image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d

View File

@ -2,12 +2,12 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ollama-models-asteria
name: ollama-models
namespace: ai
spec:
accessModes:
- ReadWriteMany
- ReadWriteOnce
resources:
requests:
storage: 30Gi
storageClassName: asteria
storageClassName: astreae

View File

@ -3,7 +3,7 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: atlasbot
namespace: comms
namespace: ai
labels:
app: atlasbot
spec:
@ -16,9 +16,9 @@ spec:
labels:
app: atlasbot
annotations:
checksum/atlasbot-configmap: manual-atlasbot-103
checksum/atlasbot-configmap: manual-atlasbot-101
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms"
vault.hashicorp.com/role: "ai"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
vault.hashicorp.com/agent-inject-template-turn-secret: |
{{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
@ -67,17 +67,17 @@ spec:
hardware: rpi5
containers:
- name: atlasbot
image: python:3.11-slim
image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
command: ["/bin/sh","-c"]
args:
- |
. /vault/scripts/comms_vault_env.sh
exec python /app/bot.py
. /vault/scripts/atlasbot_vault_env.sh
exec python -m atlasbot.main
env:
- name: MATRIX_BASE
value: http://othrys-synapse-matrix-synapse:8008
value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
- name: AUTH_BASE
value: http://matrix-authentication-service:8080
value: http://matrix-authentication-service.comms.svc.cluster.local:8080
- name: KB_DIR
value: /kb
- name: VM_URL
@ -93,7 +93,7 @@ spec:
- name: BOT_USER_GENIUS
value: atlas-genius
- name: BOT_MENTIONS
value: atlas-quick,atlas-smart,atlas-genius,atlas_quick,atlas_smart,atlas_genius
value: atlas-quick,atlas-smart,atlas-genius
- name: OLLAMA_URL
value: http://ollama.ai.svc.cluster.local:11434
- name: OLLAMA_MODEL
@ -104,26 +104,50 @@ spec:
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_GENIUS
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_DEEP
value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_FALLBACK_MODEL
value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_TIMEOUT_SEC
value: "600"
- name: OLLAMA_RETRIES
value: "0"
- name: ATLASBOT_THINKING_INTERVAL_SEC
value: "30"
- name: ATLASBOT_QUICK_TIME_BUDGET_SEC
value: "15"
- name: ATLASBOT_SMART_TIME_BUDGET_SEC
value: "45"
- name: ATLASBOT_GENIUS_TIME_BUDGET_SEC
value: "180"
- name: ATLASBOT_OLLAMA_RETRIES
value: "0"
- name: ATLASBOT_THINKING_INTERVAL_SEC
value: "30"
- name: ATLASBOT_SNAPSHOT_TTL_SEC
value: "30"
- name: ATLASBOT_HTTP_PORT
value: "8090"
- name: ATLASBOT_STATE_DB
value: /data/atlasbot_state.db
- name: ATLASBOT_QUEUE_ENABLED
value: "false"
- name: ATLASBOT_DEBUG_PIPELINE
value: "true"
- name: ATLASBOT_NATS_URL
value: nats://nats.nats.svc.cluster.local:4222
- name: ATLASBOT_NATS_STREAM
value: atlasbot
- name: ATLASBOT_NATS_SUBJECT
value: atlasbot.requests
- name: ATLASBOT_FAST_MAX_ANGLES
value: "2"
- name: ATLASBOT_SMART_MAX_ANGLES
value: "5"
- name: ATLASBOT_FAST_MAX_CANDIDATES
value: "2"
- name: ATLASBOT_SMART_MAX_CANDIDATES
value: "6"
- name: ATLASBOT_FAST_LLM_CALLS_MAX
value: "8"
- name: ATLASBOT_SMART_LLM_CALLS_MAX
value: "24"
- name: ATLASBOT_GENIUS_LLM_CALLS_MAX
value: "72"
ports:
- name: http
containerPort: 8090
@ -135,19 +159,15 @@ spec:
cpu: 500m
memory: 512Mi
volumeMounts:
- name: code
mountPath: /app/bot.py
subPath: bot.py
- name: kb
mountPath: /kb
readOnly: true
- name: vault-scripts
mountPath: /vault/scripts
readOnly: true
- name: atlasbot-state
mountPath: /data
volumes:
- name: code
configMap:
name: atlasbot
- name: kb
configMap:
name: atlas-kb
@ -166,5 +186,7 @@ spec:
path: diagrams/atlas-http.mmd
- name: vault-scripts
configMap:
name: comms-vault-env
name: atlasbot-vault-env
defaultMode: 0555
- name: atlasbot-state
emptyDir: {}

View File

@ -3,7 +3,9 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: atlasbot
namespace: comms
namespace: ai
imagePullSecrets:
- name: harbor-regcred
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
@ -43,5 +45,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: atlasbot
namespace: comms
namespace: ai

View File

@ -2,7 +2,7 @@ apiVersion: v1
kind: Service
metadata:
name: atlasbot
namespace: comms
namespace: ai
labels:
app: atlasbot
spec:

View File

@ -0,0 +1,26 @@
# services/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
name: flux-bot
email: ops@bstein.dev
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
path: services/atlasbot
strategy: Setters

View File

@ -0,0 +1,23 @@
# services/atlasbot/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: atlasbot
namespace: ai
spec:
image: registry.bstein.dev/bstein/atlasbot
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: atlasbot
namespace: ai
spec:
imageRepositoryRef:
name: atlasbot
policy:
semver:
range: ">=0.1.0-0"

View File

@ -0,0 +1,22 @@
Atlas Knowledge Base (KB)
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
- Accurate (grounded in GitOps + read-only cluster tools)
- Maintainable (small docs + deterministic generators)
- Safe (no secrets; refer to Secret/Vault paths by name only)
Layout
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
Regeneration
- Update manifests/docs, then regenerate generated artifacts:
- `python scripts/knowledge_render_atlas.py --write`
Authoring rules
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
- Keep each runbook small; one topic per file; use headings.
- When in doubt, link to the exact file path in this repo that configures the behavior.

View File

@ -0,0 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,234 @@
flowchart LR
host_auth_bstein_dev["auth.bstein.dev"]
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
host_auth_bstein_dev --> svc_sso_oauth2_proxy
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
host_bstein_dev["bstein.dev"]
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
host_bstein_dev --> svc_comms_matrix_wellknown
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
wl_comms_element_call["comms/element-call (Deployment)"]
svc_comms_element_call --> wl_comms_element_call
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
host_ci_bstein_dev["ci.bstein.dev"]
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
host_ci_bstein_dev --> svc_jenkins_jenkins
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
svc_jenkins_jenkins --> wl_jenkins_jenkins
host_cloud_bstein_dev["cloud.bstein.dev"]
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
svc_comms_livekit["comms/livekit (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
host_mail_bstein_dev["mail.bstein.dev"]
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
svc_nextcloud_collabora --> wl_nextcloud_collabora
host_pegasus_bstein_dev["pegasus.bstein.dev"]
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
host_scm_bstein_dev["scm.bstein.dev"]
svc_gitea_gitea["gitea/gitea (Service)"]
host_scm_bstein_dev --> svc_gitea_gitea
wl_gitea_gitea["gitea/gitea (Deployment)"]
svc_gitea_gitea --> wl_gitea_gitea
host_secret_bstein_dev["secret.bstein.dev"]
svc_vault_vault["vault/vault (Service)"]
host_secret_bstein_dev --> svc_vault_vault
wl_vault_vault["vault/vault (StatefulSet)"]
svc_vault_vault --> wl_vault_vault
host_sso_bstein_dev["sso.bstein.dev"]
svc_sso_keycloak["sso/keycloak (Service)"]
host_sso_bstein_dev --> svc_sso_keycloak
wl_sso_keycloak["sso/keycloak (Deployment)"]
svc_sso_keycloak --> wl_sso_keycloak
host_stream_bstein_dev["stream.bstein.dev"]
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
subgraph bstein_dev_home[bstein-dev-home]
svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend
svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend
svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway
end
subgraph comms[comms]
svc_comms_matrix_wellknown
wl_comms_matrix_wellknown
svc_comms_element_call
wl_comms_element_call
svc_comms_livekit_token_service
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
svc_jellyfin_jellyfin
wl_jellyfin_jellyfin
end
subgraph jenkins[jenkins]
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
end
subgraph mailu_mailserver[mailu-mailserver]
svc_mailu_mailserver_mailu_front
end
subgraph nextcloud[nextcloud]
svc_nextcloud_nextcloud
wl_nextcloud_nextcloud
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy
svc_sso_keycloak
wl_sso_keycloak
end
subgraph vault[vault]
svc_vault_vault
wl_vault_vault
end
subgraph vaultwarden[vaultwarden]
svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden
end

View File

@ -0,0 +1,29 @@
# services/atlasbot/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
- atlasbot-deployment.yaml
- atlasbot-service.yaml
- atlasbot-rbac.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- image.yaml
- image-automation.yaml
images:
- name: registry.bstein.dev/bstein/atlasbot
newTag: 0.1.2-106 # {"$imagepolicy": "ai:atlasbot:tag"}
configMapGenerator:
- name: atlasbot-vault-env
files:
- atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
options:
disableNameSuffixHash: true
- name: atlas-kb
files:
- INDEX.md=knowledge/INDEX.md
- atlas.json=knowledge/catalog/atlas.json
- atlas-summary.json=knowledge/catalog/atlas-summary.json
- metrics.json=knowledge/catalog/metrics.json
- runbooks.json=knowledge/catalog/runbooks.json
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -0,0 +1,44 @@
#!/usr/bin/env sh
# Export atlasbot runtime secrets from Vault-agent-injected files under
# /vault/secrets. Sourced by the container entrypoint before the bot starts.
set -eu

vault_dir="/vault/secrets"

# Print a required secret, stripping CR/LF. A missing file makes tr fail,
# which aborts the sourcing shell under `set -e`.
read_secret() {
  tr -d '\r\n' < "${vault_dir}/$1"
}

# Print an optional secret, or the empty string when the file is absent.
read_optional() {
  if [ -f "${vault_dir}/$1" ]; then
    tr -d '\r\n' < "${vault_dir}/$1"
  else
    printf ''
  fi
}

# Assign before exporting: `export VAR="$(cmd)"` masks cmd's exit status
# (the export itself succeeds), so `set -e` would NOT abort on a missing
# required secret (ShellCheck SC2155). Plain-expansion exports are safe.
TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_STATIC_AUTH_SECRET
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"

LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_API_SECRET
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"

BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS
BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_QUICK
BOT_PASS_SMART="$(read_optional bot-smart-pass)"
BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
# Tier fallbacks: smart falls back to the base password, genius to smart.
if [ -z "${BOT_PASS_SMART}" ]; then
  BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
  BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export BOT_PASS_SMART BOT_PASS_GENIUS

SEEDER_PASS="$(read_secret seeder-pass)"
export SEEDER_PASS
CHAT_API_KEY="$(read_secret chat-matrix)"
export CHAT_API_KEY
CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"
export CHAT_API_HOMEPAGE
# MAS admin secret is consumed as a file path, not a value.
export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"
PGPASSWORD="$(read_secret synapse-db-pass)"
export PGPASSWORD
MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
export MAS_DB_PASSWORD
MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
export MATRIX_SHARED_SECRET
KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"
export KEYCLOAK_CLIENT_SECRET

View File

@ -1,14 +1,14 @@
# services/typhon/secretproviderclass.yaml
# services/atlasbot/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
name: typhon-vault
namespace: climate
name: atlasbot-vault
namespace: ai
spec:
provider: vault
parameters:
vaultAddress: "http://vault.vault.svc.cluster.local:8200"
roleName: "typhon"
roleName: "ai"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"

Some files were not shown because too many files have changed in this diff Show More