Compare commits


694 Commits

Author SHA1 Message Date
cc51eb6d1e Merge pull request 'feature/ariadne' (#11) from feature/ariadne into main
Reviewed-on: #11
2026-01-28 14:05:38 +00:00
aa608fbf0f atlasbot: improve fact parsing and fallback answers 2026-01-28 11:02:10 -03:00
436e56c5de atlasbot: favor factual fallback in fast mode 2026-01-28 04:10:31 -03:00
dda943ce16 atlasbot: expand full-pack triggers and strip inline confidence 2026-01-28 04:06:24 -03:00
043d1cbab3 atlasbot: clean fact labels and non-cluster confidence 2026-01-28 04:00:13 -03:00
da94cc6f97 atlasbot: improve fast fallback and usage filtering 2026-01-28 03:56:26 -03:00
7c0a25a0eb atlasbot: expand fast context for quantitative prompts 2026-01-28 03:51:37 -03:00
7194cad0a8 atlasbot: refine fast fact selection and prompts 2026-01-28 03:46:06 -03:00
eb567fda06 atlasbot: fix fallback fact parsing 2026-01-28 03:35:02 -03:00
a9d74a066f atlasbot: prefer fact fallback for quantitative prompts 2026-01-28 03:32:17 -03:00
19b52ac5e3 atlasbot: add fact-pack fallback for fast 2026-01-28 03:29:21 -03:00
885e7b6489 comms: use 14b model for atlasbot quick 2026-01-28 03:23:54 -03:00
8316e5dd15 atlasbot: fix tag detection for workload queries 2026-01-28 03:20:28 -03:00
be82109d4e atlasbot: enforce fast answer body 2026-01-28 03:17:46 -03:00
971848558a atlasbot: prioritize fact selection for quick answers 2026-01-28 03:14:12 -03:00
980c2cf1cc atlasbot: enrich fact pack summaries 2026-01-28 03:09:34 -03:00
08ac598181 atlasbot: streamline quick answers 2026-01-28 02:53:43 -03:00
349a46ceab comms: tune atlasbot quick model 2026-01-28 02:43:24 -03:00
666dcb3faa atlasbot: rework reasoning pipeline 2026-01-28 02:21:42 -03:00
769d3f41bf comms: roll atlasbot config 2026-01-28 01:58:23 -03:00
62e0a565f5 atlasbot: tighten fast facts 2026-01-28 01:58:07 -03:00
2a2179a138 comms: roll atlasbot config 2026-01-28 01:52:40 -03:00
c1e94d56c8 atlasbot: simplify fast path 2026-01-28 01:52:23 -03:00
244578cc01 chore: organize one-off jobs 2026-01-28 01:48:32 -03:00
0146e3dc95 maintenance: suspend ariadne migrate job 2026-01-28 01:35:34 -03:00
48c379dc88 comms: roll atlasbot config 2026-01-28 01:07:26 -03:00
6001876409 atlasbot: add per-hardware extremes 2026-01-28 01:07:13 -03:00
2fe3d5b932 atlasbot: roll config 2026-01-28 01:02:32 -03:00
474c472b1d atlasbot: enrich fact pack and selection 2026-01-28 01:02:14 -03:00
6578a8b08a atlasbot: roll config 2026-01-28 00:24:13 -03:00
44c22e3d00 atlasbot: improve multi-pass synthesis 2026-01-28 00:22:32 -03:00
2af817b9db atlasbot: speed up fast mode 2026-01-27 23:57:36 -03:00
2d90005076 atlasbot: fix insight scoring 2026-01-27 23:49:28 -03:00
a10050e4c7 atlasbot: overhaul reasoning pipeline 2026-01-27 23:45:08 -03:00
b34f2abefd monitoring: fix grafana alert exec state 2026-01-27 23:34:11 -03:00
9409c037c9 monitoring: restart grafana for alerting reload 2026-01-27 23:29:46 -03:00
3a2bb1bac9 chore: bump atlasbot checksum 2026-01-27 23:24:46 -03:00
f43acaa554 atlasbot: fix bottom ops and pod queries 2026-01-27 23:24:12 -03:00
c5a7eece35 monitoring: tune cpu and maintenance alerts 2026-01-27 23:23:42 -03:00
19d10ce585 chore: bump atlasbot checksum 2026-01-27 23:17:23 -03:00
7b1c891e70 atlasbot: improve metric detection and counts 2026-01-27 23:16:53 -03:00
67ca0d451d chore: bump atlasbot checksum 2026-01-27 23:02:22 -03:00
4b468b0f97 atlasbot: fix word boundary detection 2026-01-27 23:01:51 -03:00
380aae3b2c chore: bump atlasbot checksum 2026-01-27 22:55:24 -03:00
b9b25565a2 atlasbot: tighten scoring and readiness logic 2026-01-27 22:55:00 -03:00
24b0ac78c4 chore: bump atlasbot config checksum 2026-01-27 22:45:17 -03:00
23533e08ee atlasbot: refine cluster intent handling 2026-01-27 22:44:49 -03:00
fc10eed704 atlasbot: fix score formatting 2026-01-27 22:32:25 -03:00
ca7a08e791 monitoring: fix grafana smtp from address 2026-01-27 22:28:37 -03:00
868075426c atlasbot: overhaul open-ended reasoning 2026-01-27 22:22:50 -03:00
029e4d4ca6 monitoring: send grafana alerts via postmark 2026-01-27 22:00:19 -03:00
e97aaafed9 atlasbot: refine open-ended reasoning 2026-01-27 21:52:07 -03:00
38c8d08ab4 monitoring: fix gpu idle label 2026-01-27 21:46:58 -03:00
ba16f5119b monitoring: unify gpu namespace usage 2026-01-27 21:43:37 -03:00
2fe763189d atlasbot: roll pod after metric parsing update 2026-01-27 21:27:52 -03:00
832d5acf68 atlasbot: improve metric parsing and cluster intent 2026-01-27 21:27:19 -03:00
27e8a77044 atlasbot: add model fallback and rollout 2026-01-27 21:16:47 -03:00
65e50d1923 atlasbot: bump rollout checksum 2026-01-27 21:11:58 -03:00
e486245aaf atlasbot: guard open-ended LLM calls 2026-01-27 21:09:48 -03:00
34c91c6d08 atlasbot: refine open-ended reasoning pipeline 2026-01-27 21:02:20 -03:00
9e06d7afc8 atlasbot: route subjective queries to LLM 2026-01-27 20:02:09 -03:00
18e543d95a atlasbot: refine insight tone and status 2026-01-27 19:42:04 -03:00
20364a262c atlasbot: strengthen subjective insights 2026-01-27 19:37:20 -03:00
8842662239 atlasbot: refine node and postgres query handling 2026-01-27 19:13:31 -03:00
12fa7d02aa atlasbot: expand hardware and entity detection 2026-01-27 19:10:02 -03:00
9bf822ec36 atlasbot: answer hardware mix queries 2026-01-27 19:06:44 -03:00
ea8eda2c73 atlasbot: treat hardware prompts as cluster queries 2026-01-27 19:04:29 -03:00
243d3112ce atlasbot: prefer hardware for general interest 2026-01-27 19:01:16 -03:00
4bab34eae1 atlasbot: keep coolest answers opinionated 2026-01-27 18:58:59 -03:00
8bd4d9fc7a atlasbot: prioritize hardware for subjective prompts 2026-01-27 18:56:14 -03:00
69d121aa07 atlasbot: use hottest node labels for insights 2026-01-27 18:54:05 -03:00
79650616f1 atlasbot: make insights sound more human 2026-01-27 18:51:00 -03:00
c4ad82f122 atlasbot: add more opinionated hardware insight 2026-01-27 18:48:35 -03:00
4e51cf6b6c atlasbot: tighten insight phrasing 2026-01-27 18:45:49 -03:00
51bf01a8fd monitoring: keep idle label in gpu share 2026-01-27 18:44:58 -03:00
4e6d4f43b2 atlasbot: improve insight voice and avoid repeats 2026-01-27 18:43:03 -03:00
58dab1ca79 comms: roll atlasbot after history update 2026-01-27 18:32:54 -03:00
113bcdeded atlasbot: use history for subjective follow-ups 2026-01-27 18:32:27 -03:00
e05a949b9f comms: roll atlasbot for insight updates 2026-01-27 18:18:06 -03:00
0a10a2d861 atlasbot: add narrative insights 2026-01-27 18:17:29 -03:00
flux-bot 6fead623fa chore(bstein-dev-home): automated image update 2026-01-27 21:11:27 +00:00
flux-bot ad01659cc4 chore(bstein-dev-home): automated image update 2026-01-27 21:11:24 +00:00
b04092b63c comms: roll atlasbot after bot updates 2026-01-27 18:10:30 -03:00
e87fa4369c atlasbot: make cluster answers more narrative 2026-01-27 18:08:19 -03:00
1b04e6cb00 monitoring: fix gpu idle share 2026-01-27 17:51:13 -03:00
5f32dff73b monitoring: fix tegrastats regexes 2026-01-27 16:44:00 -03:00
dfb295e5f0 monitoring: expose jetson scrape line length 2026-01-27 16:38:09 -03:00
a7f3d49fea monitoring: read tegrastats per scrape 2026-01-27 16:34:31 -03:00
246ed6617e monitoring: read jetson stats on demand 2026-01-27 16:27:45 -03:00
1951291090 monitoring: refresh jetson stats on scrape 2026-01-27 16:23:23 -03:00
62a423f32c monitoring: fix jetson gpu metrics 2026-01-27 16:19:54 -03:00
flux-bot dedf566993 chore(maintenance): automated image update 2026-01-27 18:57:30 +00:00
354275f3ad atlasbot: avoid namespace-only workload matches 2026-01-27 15:45:18 -03:00
3f159c6c83 atlasbot: improve workload matching and fallbacks 2026-01-27 15:42:31 -03:00
631bd09778 atlasbot: return structured cluster summaries 2026-01-27 15:36:08 -03:00
b7792d30f1 atlasbot: answer cluster queries without llm 2026-01-27 15:30:43 -03:00
241a8889ee atlasbot: send snapshot as explicit context 2026-01-27 15:12:47 -03:00
864f1cab20 atlasbot: fix prompt formatting 2026-01-27 15:10:03 -03:00
dea70df209 atlasbot: strengthen cluster disambiguation 2026-01-27 15:07:28 -03:00
f649a6a9a2 atlasbot: force cluster intent in prompts 2026-01-27 15:04:10 -03:00
ca3cfaf1fc atlasbot: tighten cluster intent and snapshot framing 2026-01-27 15:00:55 -03:00
flux-bot 1682ccfb25 chore(bstein-dev-home): automated image update 2026-01-27 17:58:11 +00:00
flux-bot 18a4c58338 chore(bstein-dev-home): automated image update 2026-01-27 17:58:07 +00:00
92f4137e9c atlasbot: simplify cluster gating and context 2026-01-27 14:54:09 -03:00
cb7141dfb6 comms: roll atlasbot for mention stripping 2026-01-27 14:38:35 -03:00
cd45b7faba atlasbot: ignore mentions and gate cluster context 2026-01-27 14:38:35 -03:00
flux-bot d03c846779 chore(bstein-dev-home): automated image update 2026-01-27 17:12:07 +00:00
flux-bot a00bab5ee7 chore(bstein-dev-home): automated image update 2026-01-27 17:12:03 +00:00
975783a6b9 portal: allow longer atlasbot responses 2026-01-27 14:09:23 -03:00
c3b2c0cebb comms: roll atlasbot after answer tweaks 2026-01-27 13:18:01 -03:00
d2ade61d88 atlasbot: refine ready/pod counts 2026-01-27 13:17:33 -03:00
d74277a8bd comms: roll atlasbot after script update 2026-01-27 13:15:13 -03:00
31fbe48ca3 atlasbot: fix metric detection and role counts 2026-01-27 13:13:20 -03:00
70feb1ef85 atlasbot: refine role and hardware filters 2026-01-27 13:02:23 -03:00
159c9cfe68 atlasbot: use structured answers before LLM 2026-01-27 12:59:11 -03:00
b7f454b790 atlasbot: enrich snapshot facts and pod metrics 2026-01-27 12:53:17 -03:00
41b131c347 atlasbot: preserve response text with confidence 2026-01-27 12:47:28 -03:00
3b1e74d278 atlasbot: call ollama chat directly 2026-01-27 12:33:56 -03:00
d8ae9c5901 comms: restore atlasbot gateway URL 2026-01-27 12:23:05 -03:00
32851ca057 comms: point atlasbot to ollama and raise gateway memory 2026-01-27 12:20:50 -03:00
32125d7bab comms: bump atlasbot configmap checksum 2026-01-27 11:05:30 -03:00
a442ea6d5d atlasbot: strengthen facts context and replies 2026-01-27 11:03:55 -03:00
c0dd00c93d atlasbot: shrink facts context to avoid truncation 2026-01-27 06:45:18 -03:00
446115f07a atlasbot: enrich facts summary for LLM 2026-01-27 06:34:37 -03:00
a2f4c51e1d atlasbot: shift to facts context and upgrade model 2026-01-27 06:28:26 -03:00
flux-bot 4fcecc4707 chore(maintenance): automated image update 2026-01-27 09:00:40 +00:00
flux-bot 1459027abc chore(maintenance): automated image update 2026-01-27 08:50:29 +00:00
89935a579a atlasbot: use cluster snapshot + model update 2026-01-27 05:42:28 -03:00
flux-bot b1aad04f3e chore(maintenance): automated image update 2026-01-27 08:14:36 +00:00
2dc208e919 comms: retain synapse admin ensure logs 2026-01-27 05:02:02 -03:00
292d513e10 comms: ensure synapse admin token 2026-01-27 04:58:13 -03:00
11ba37a4b2 comms: restart atlasbot for scoped hottest 2026-01-27 04:53:44 -03:00
d6b9d64e70 atlasbot: scope overall hottest node to atlas inventory 2026-01-27 04:53:33 -03:00
67b9babc0e comms: restart atlasbot for knowledge summaries 2026-01-27 04:51:33 -03:00
c219019ad5 atlasbot: add knowledge summaries and better fallback 2026-01-27 04:51:20 -03:00
0ef14c67fd comms: add synapse admin ensure job 2026-01-27 04:48:44 -03:00
39fd7adb55 comms: restart atlasbot for metrics formatting 2026-01-27 03:56:47 -03:00
600c124ef2 atlasbot: clarify scoped metrics and format percent values 2026-01-27 03:56:17 -03:00
flux-bot 5e4a974733 chore(maintenance): automated image update 2026-01-27 06:51:28 +00:00
f7fc152439 comms: rerun synapse seeder admin ensure 2026-01-27 01:22:02 -03:00
bab914c58f comms: rerun mas local user ensure 2026-01-27 01:19:43 -03:00
e24ff4782c comms: rerun ensure jobs and fix vault oidc env 2026-01-27 01:14:42 -03:00
9ecdf054d3 vault: bootstrap k8s auth config with root token 2026-01-27 01:04:57 -03:00
flux-bot d9c8632b8d chore(bstein-dev-home): automated image update 2026-01-27 02:53:50 +00:00
flux-bot d325111f34 chore(bstein-dev-home): automated image update 2026-01-27 02:52:49 +00:00
adc711be62 comms: rerun synapse user seed 2026-01-26 22:54:43 -03:00
66ce0caaf4 comms: restart atlasbot for op priority 2026-01-26 22:52:49 -03:00
9ea338b121 monitoring: restart jetson exporter 2026-01-26 22:51:41 -03:00
270dc93966 atlasbot: prioritize top queries over list 2026-01-26 22:51:04 -03:00
0331e7ea99 monitoring: fix jetson metrics newlines 2026-01-26 22:50:33 -03:00
flux-bot f08d740d83 chore(bstein-dev-home): automated image update 2026-01-27 01:47:47 +00:00
flux-bot 328241b7ac chore(bstein-dev-home): automated image update 2026-01-27 01:47:43 +00:00
c8662a624e atlasbot: add internal endpoint and portal wiring 2026-01-26 22:43:58 -03:00
689bf10995 comms: restart atlasbot for generic planner 2026-01-26 22:39:01 -03:00
37a203509b atlasbot: replace targeted handlers with generic planner 2026-01-26 22:38:37 -03:00
flux-bot 6c413d4a50 chore(maintenance): automated image update 2026-01-27 01:27:02 +00:00
1616994b19 monitoring: unify jetson gpu metrics 2026-01-26 22:26:24 -03:00
ec834b7e0f vault: allow ariadne to use vault-admin role 2026-01-26 22:26:13 -03:00
8c90e0e527 comms: restart atlasbot for hottest node fix 2026-01-26 22:13:53 -03:00
6432472be7 atlasbot: answer hottest node queries via metrics 2026-01-26 22:13:04 -03:00
72bd22e912 monitoring: map dcgm to shared gpu resources 2026-01-26 20:58:06 -03:00
flux-bot 879a751429 chore(maintenance): automated image update 2026-01-26 23:54:53 +00:00
b0abb9bd6e ariadne: reduce comms noise, fix gpu labels 2026-01-26 20:54:33 -03:00
b27c80d5c0 atlasbot: improve node inventory reasoning 2026-01-26 19:53:11 -03:00
a61091c052 atlasbot: reload structured answers 2026-01-26 19:34:42 -03:00
16d0a22163 atlasbot: generalize inventory answers 2026-01-26 19:34:19 -03:00
2d09e7f965 atlasbot: reload inventory answers 2026-01-26 19:31:07 -03:00
bf2d4cff90 atlasbot: answer from live inventory 2026-01-26 19:29:26 -03:00
3e4351ef19 atlasbot: reload for live inventory 2026-01-26 19:24:03 -03:00
ff04341559 atlasbot: use live node inventory context 2026-01-26 19:22:28 -03:00
d666e6a156 atlasbot: roll deployment 2026-01-26 19:02:54 -03:00
b6e8c01e99 atlasbot: improve missing node inference 2026-01-26 19:01:26 -03:00
0d5e19e11a atlasbot: infer worker expected count from metrics 2026-01-26 18:50:23 -03:00
dfa13e22cc atlasbot: clarify worker count limits 2026-01-26 18:21:17 -03:00
65781aaca7 atlasbot: improve worker node answers 2026-01-26 18:18:42 -03:00
7bb1bd96fc atlasbot: improve worker readiness and metrics replies 2026-01-26 18:16:14 -03:00
be7846572f atlasbot: recognize prefix mentions 2026-01-26 15:54:00 -03:00
0ac0f920ca atlasbot: load metrics index and answer in rooms 2026-01-26 15:34:52 -03:00
33b5e2b678 atlasbot: add metrics kb and long timeout 2026-01-26 14:08:11 -03:00
fff00dbe95 atlasbot: ground node inventory and soften llm failures 2026-01-26 12:36:51 -03:00
53e4b4036b comms: bump atlasbot config checksum 2026-01-26 12:08:33 -03:00
28570a1f5c atlasbot: answer jetson nodes from knowledge 2026-01-26 12:06:48 -03:00
2c3ffdbf95 ai-llm: tighten gpu placement and resources 2026-01-26 11:44:28 -03:00
fec7713049 comms: bump atlasbot configmap checksum 2026-01-26 09:38:38 -03:00
352d4991f4 comms: handle arch node counts and extend LLM timeout 2026-01-26 09:36:08 -03:00
14d18048d5 comms: fix duplicate chat key annotations 2026-01-26 09:29:28 -03:00
7fd71f4bab comms: inject chat ai keys for atlasbot 2026-01-26 09:23:21 -03:00
flux-bot f14be5d7ef chore(maintenance): automated image update 2026-01-26 06:33:26 +00:00
10003ca0d7 comms: sync atlas knowledge and use ariadne state 2026-01-26 03:32:17 -03:00
5aac018a7b comms: answer node name queries 2026-01-26 01:35:47 -03:00
36f7de76e9 comms: fix atlasbot node count matcher 2026-01-26 01:32:01 -03:00
5f0bc3832d comms: answer node count queries 2026-01-26 01:07:49 -03:00
cd6eaff7cb comms: normalize atlasbot replies 2026-01-26 00:52:35 -03:00
83b8e13661 ai: restart ollama deployment 2026-01-25 16:19:15 -03:00
ec6b51cfd2 comms: route atlasbot to chat gateway 2026-01-25 15:59:34 -03:00
flux-bot 04465407d2 chore(bstein-dev-home): automated image update 2026-01-25 18:06:59 +00:00
flux-bot 5a994f4d42 chore(bstein-dev-home): automated image update 2026-01-25 18:04:59 +00:00
flux-bot af9fcdeae9 chore(bstein-dev-home): automated image update 2026-01-25 17:40:57 +00:00
flux-bot 39df6ff039 chore(bstein-dev-home): automated image update 2026-01-25 17:39:57 +00:00
flux-bot 70e79f25b0 chore(bstein-dev-home): automated image update 2026-01-25 00:07:26 +00:00
flux-bot f471a30499 chore(bstein-dev-home): automated image update 2026-01-25 00:06:26 +00:00
ee154f1494 vaultwarden: bump to 1.35.2 2026-01-24 14:16:59 -03:00
flux-bot d0c69cd480 chore(bstein-dev-home): automated image update 2026-01-24 14:46:38 +00:00
flux-bot 6e4e2bdc0c chore(bstein-dev-home): automated image update 2026-01-24 14:44:38 +00:00
flux-bot 0b7d87cef4 chore(bstein-dev-home): automated image update 2026-01-24 14:32:37 +00:00
flux-bot a27bb0e198 chore(bstein-dev-home): automated image update 2026-01-24 14:31:37 +00:00
flux-bot cf2d0c5eff chore(bstein-dev-home): automated image update 2026-01-24 10:16:15 +00:00
flux-bot 00eb4be529 chore(bstein-dev-home): automated image update 2026-01-24 10:15:15 +00:00
flux-bot 8b1b824a29 chore(maintenance): automated image update 2026-01-24 10:13:43 +00:00
flux-bot a7f5a60190 chore(maintenance): automated image update 2026-01-24 09:29:39 +00:00
flux-bot eeb84e8e70 chore(bstein-dev-home): automated image update 2026-01-24 02:07:32 +00:00
flux-bot 82312d0fbf chore(bstein-dev-home): automated image update 2026-01-24 02:05:32 +00:00
292ec7359b keycloak: rerun realm settings job 2026-01-23 22:41:41 -03:00
flux-bot 473bebaf52 chore(bstein-dev-home): automated image update 2026-01-24 01:33:33 +00:00
flux-bot d07f14826b chore(bstein-dev-home): automated image update 2026-01-24 01:33:29 +00:00
e7d18be4ed keycloak: add vaultwarden_grandfathered flag 2026-01-23 22:31:10 -03:00
flux-bot 437281f6a5 chore(bstein-dev-home): automated image update 2026-01-23 23:53:21 +00:00
flux-bot 67643e3fad chore(bstein-dev-home): automated image update 2026-01-23 23:52:21 +00:00
flux-bot 38d2dad28f chore(bstein-dev-home): automated image update 2026-01-23 23:28:28 +00:00
flux-bot 82fceb11a4 chore(bstein-dev-home): automated image update 2026-01-23 23:28:20 +00:00
flux-bot 8e6d9e1c37 chore(bstein-dev-home): automated image update 2026-01-23 23:19:21 +00:00
flux-bot a603b3726f chore(bstein-dev-home): automated image update 2026-01-23 23:19:18 +00:00
flux-bot e43340f2a1 chore(bstein-dev-home): automated image update 2026-01-23 22:40:15 +00:00
flux-bot 115f86907f chore(bstein-dev-home): automated image update 2026-01-23 22:39:15 +00:00
flux-bot aaef2b7ab5 chore(bstein-dev-home): automated image update 2026-01-23 22:25:15 +00:00
flux-bot c24f2dafc1 chore(bstein-dev-home): automated image update 2026-01-23 22:24:13 +00:00
flux-bot d9c3ff8195 chore(maintenance): automated image update 2026-01-23 22:21:43 +00:00
b94b016b0f flux: force apply migrations 2026-01-23 18:58:33 -03:00
flux-bot 5ec4bb9c61 chore(maintenance): automated image update 2026-01-23 21:44:40 +00:00
flux-bot e2501bd3d0 chore(bstein-dev-home): automated image update 2026-01-23 21:28:08 +00:00
flux-bot bc2e1058d6 chore(bstein-dev-home): automated image update 2026-01-23 21:27:08 +00:00
flux-bot 45352f79ba chore(bstein-dev-home): automated image update 2026-01-23 20:51:05 +00:00
flux-bot 7b336c76a1 chore(bstein-dev-home): automated image update 2026-01-23 20:50:05 +00:00
flux-bot 0127c62f51 chore(bstein-dev-home): automated image update 2026-01-23 20:48:05 +00:00
flux-bot ee6ef74982 chore(bstein-dev-home): automated image update 2026-01-23 20:47:05 +00:00
d521c66d60 maintenance: rotate ariadne migrate job name 2026-01-23 17:21:37 -03:00
flux-bot c28444a233 chore(bstein-dev-home): automated image update 2026-01-23 20:00:01 +00:00
flux-bot 8bdf60542d chore(bstein-dev-home): automated image update 2026-01-23 19:58:00 +00:00
flux-bot 0758c2e06d chore(maintenance): automated image update 2026-01-23 19:56:31 +00:00
flux-bot 00bcc0d4c2 chore(bstein-dev-home): automated image update 2026-01-23 19:13:56 +00:00
flux-bot 60840d1171 chore(bstein-dev-home): automated image update 2026-01-23 19:11:58 +00:00
3338efa58e finance: allow actual user creation 2026-01-23 14:07:52 -03:00
a988af3262 monitoring: alert on VM outage 2026-01-23 11:51:28 -03:00
flux-bot ef42dac97b chore(bstein-dev-home): automated image update 2026-01-23 06:45:19 +00:00
flux-bot df3f4a0c0b chore(bstein-dev-home): automated image update 2026-01-23 06:44:18 +00:00
fda986ab3d bstein-dev-home: separate portal migrations 2026-01-23 03:28:49 -03:00
flux-bot ca47e03953 chore(bstein-dev-home): automated image update 2026-01-23 06:14:16 +00:00
flux-bot 3d4208f877 chore(bstein-dev-home): automated image update 2026-01-23 06:13:15 +00:00
3d2e0ead1c portal: bump migrate job name 2026-01-23 03:11:42 -03:00
18ac46d4b8 keycloak: bump realm settings job 2026-01-23 02:09:53 -03:00
3cacbad4c0 comms/keycloak: add mailu email claim 2026-01-23 02:04:51 -03:00
3d633a5627 comms: enable MSC4108 under experimental_features 2026-01-23 01:46:03 -03:00
58d9cb616f comms: enable MSC4108 rendezvous in synapse 2026-01-23 01:35:43 -03:00
flux-bot 3474df40d4 chore(bstein-dev-home): automated image update 2026-01-23 03:39:02 +00:00
flux-bot 4c66b538a7 chore(bstein-dev-home): automated image update 2026-01-23 03:38:02 +00:00
flux-bot 2475d4ca9d chore(bstein-dev-home): automated image update 2026-01-23 03:11:03 +00:00
flux-bot 1d39015d33 chore(bstein-dev-home): automated image update 2026-01-23 03:10:59 +00:00
flux-bot e0bf10cad9 chore(bstein-dev-home): automated image update 2026-01-23 03:02:59 +00:00
flux-bot 72e6a09bd0 chore(bstein-dev-home): automated image update 2026-01-23 03:01:59 +00:00
flux-bot b1fa40acc1 chore(bstein-dev-home): automated image update 2026-01-23 02:47:58 +00:00
flux-bot e3247f606f chore(bstein-dev-home): automated image update 2026-01-23 02:46:57 +00:00
flux-bot 2dc680b8f8 chore(bstein-dev-home): automated image update 2026-01-23 01:52:53 +00:00
flux-bot 8dedefb4b4 chore(bstein-dev-home): automated image update 2026-01-23 01:51:53 +00:00
flux-bot a18f7e98a2 chore(bstein-dev-home): automated image update 2026-01-23 01:42:52 +00:00
flux-bot 62d16ae388 chore(bstein-dev-home): automated image update 2026-01-23 01:32:51 +00:00
flux-bot d3d680383b chore(bstein-dev-home): automated image update 2026-01-23 01:14:49 +00:00
flux-bot 8545f2bc50 chore(bstein-dev-home): automated image update 2026-01-23 01:12:49 +00:00
flux-bot 5ca247f143 chore(bstein-dev-home): automated image update 2026-01-23 01:08:49 +00:00
flux-bot 4d566a7388 chore(bstein-dev-home): automated image update 2026-01-23 01:07:49 +00:00
flux-bot 8913c5a5f2 chore(bstein-dev-home): automated image update 2026-01-22 22:16:37 +00:00
flux-bot 25c4f3e07b chore(bstein-dev-home): automated image update 2026-01-22 22:16:34 +00:00
flux-bot 8b7e21f0cc chore(bstein-dev-home): automated image update 2026-01-22 22:08:37 +00:00
flux-bot 301909f92e chore(bstein-dev-home): automated image update 2026-01-22 22:08:33 +00:00
flux-bot 0c27b48a1c chore(bstein-dev-home): automated image update 2026-01-22 21:53:32 +00:00
flux-bot 71996fb199 chore(bstein-dev-home): automated image update 2026-01-22 21:51:32 +00:00
flux-bot 7c9ee41180 chore(maintenance): automated image update 2026-01-22 21:41:04 +00:00
ce5b1d1353 monitoring: add postgres metrics and update overview 2026-01-22 18:23:26 -03:00
820e624a0b jenkins: set timezone to America/Chicago 2026-01-22 18:23:26 -03:00
flux-bot cca3a756b3 chore(maintenance): automated image update 2026-01-22 21:02:01 +00:00
flux-bot 1e815ce011 chore(bstein-dev-home): automated image update 2026-01-22 21:00:34 +00:00
flux-bot e5281ad4c0 chore(bstein-dev-home): automated image update 2026-01-22 21:00:29 +00:00
flux-bot 1e8a67904c chore(bstein-dev-home): automated image update 2026-01-22 18:48:16 +00:00
flux-bot 0290a5f715 chore(bstein-dev-home): automated image update 2026-01-22 18:47:16 +00:00
9b5d8ac45c jobs: force recreate migrate jobs 2026-01-22 15:39:57 -03:00
flux-bot 05c7642f5c chore(bstein-dev-home): automated image update 2026-01-22 18:35:20 +00:00
flux-bot efa893b134 chore(bstein-dev-home): automated image update 2026-01-22 18:35:15 +00:00
flux-bot 7eba40a889 chore(bstein-dev-home): automated image update 2026-01-22 18:34:08 +00:00
flux-bot 8b90b44dfd chore(maintenance): automated image update 2026-01-22 18:33:48 +00:00
flux-bot 21800290ec chore(maintenance): automated image update 2026-01-22 18:33:30 +00:00
ec5e4ec4a3 images: auth image scan and bump tags 2026-01-22 15:33:08 -03:00
flux-bot af024aa16a chore(maintenance): automated image update 2026-01-22 18:29:24 +00:00
flux-bot da32ba1680 chore(bstein-dev-home): automated image update 2026-01-22 18:29:01 +00:00
8788d40dc6 ops: bump portal and ariadne image tags 2026-01-22 15:28:26 -03:00
d509dfaa22 ops: restore portal/ariadne and add postgres panels 2026-01-22 15:23:23 -03:00
156effebe3 ops: pause portal/ariadne and add migrate jobs 2026-01-22 14:09:39 -03:00
8e3fe266aa flux: temporarily drop harbor health checks 2026-01-22 13:38:06 -03:00
3fc17b0c7c harbor: fix ingress patch placement 2026-01-22 13:31:12 -03:00
d9695d32f6 harbor: route v2 ingress to registry 2026-01-22 13:26:38 -03:00
0697d7b1b3 keycloak: allow harbor direct grants 2026-01-22 12:41:58 -03:00
d2f118ed32 jenkins: pin vault sync to worker nodes 2026-01-22 10:56:27 -03:00
5e35b5f7a2 vault: unsuspend k8s auth config cronjob 2026-01-22 04:47:50 -03:00
94953ab0fe jenkins: sync harbor pull secret from vault 2026-01-22 04:45:24 -03:00
ba2b9acbcc jenkins: use shared harbor creds when present 2026-01-22 03:15:38 -03:00
flux-bot 955bbcf58f chore(bstein-dev-home): automated image update 2026-01-22 05:41:20 +00:00
flux-bot 62c0e32bc4 chore(bstein-dev-home): automated image update 2026-01-22 05:40:21 +00:00
flux-bot 6dcbdcf704 chore(bstein-dev-home): automated image update 2026-01-22 05:38:20 +00:00
flux-bot c84af0b8df chore(bstein-dev-home): automated image update 2026-01-22 05:37:20 +00:00
flux-bot 3891f1d063 chore(maintenance): automated image update 2026-01-22 00:59:59 +00:00
flux-bot beb923cf0e chore(maintenance): automated image update 2026-01-22 00:48:58 +00:00
flux-bot aa3db22eaf chore(bstein-dev-home): automated image update 2026-01-22 00:17:42 +00:00
flux-bot 592435f760 chore(bstein-dev-home): automated image update 2026-01-22 00:16:42 +00:00
flux-bot d54115df55 chore(bstein-dev-home): automated image update 2026-01-21 23:48:39 +00:00
flux-bot 75e2c745f7 chore(bstein-dev-home): automated image update 2026-01-21 23:47:39 +00:00
flux-bot 71122fc200 chore(bstein-dev-home): automated image update 2026-01-21 23:24:40 +00:00
flux-bot 41d38033b5 chore(bstein-dev-home): automated image update 2026-01-21 23:24:37 +00:00
flux-bot 067134fa1b chore(bstein-dev-home): automated image update 2026-01-21 22:56:34 +00:00
flux-bot eb5256e6bc chore(bstein-dev-home): automated image update 2026-01-21 22:55:34 +00:00
flux-bot d3b1a925b8 chore(maintenance): automated image update 2026-01-21 22:52:46 +00:00
flux-bot 6f4e5dbfe7 chore(bstein-dev-home): automated image update 2026-01-21 22:32:31 +00:00
flux-bot d9cda5b6af chore(bstein-dev-home): automated image update 2026-01-21 22:30:31 +00:00
flux-bot 30b86a693f chore(maintenance): automated image update 2026-01-21 22:23:44 +00:00
flux-bot da16998d2e chore(bstein-dev-home): automated image update 2026-01-21 22:07:29 +00:00
flux-bot 3a48569330 chore(bstein-dev-home): automated image update 2026-01-21 22:05:29 +00:00
flux-bot 3a987c29ff chore(bstein-dev-home): automated image update 2026-01-21 20:34:18 +00:00
flux-bot 66cb72947f chore(bstein-dev-home): automated image update 2026-01-21 20:33:18 +00:00
flux-bot 1039590b14 chore(bstein-dev-home): automated image update 2026-01-21 20:05:15 +00:00
flux-bot 298d261146 chore(bstein-dev-home): automated image update 2026-01-21 20:04:15 +00:00
4721d44a33 monitoring: enforce sorted job lists 2026-01-21 15:12:53 -03:00
db4c3b7c51 monitoring: tighten jobs/overview ordering 2026-01-21 15:01:02 -03:00
b0996e9a4f monitoring: refine jobs/overview panels 2026-01-21 14:31:11 -03:00
flux-bot 2138b93242 chore(maintenance): automated image update 2026-01-21 16:40:09 +00:00
8b35ab0292 monitoring: refresh jobs dashboards 2026-01-21 13:37:36 -03:00
2e407e1962 monitoring: reschedule grafana user dedupe 2026-01-21 12:31:54 -03:00
5ae6b4b00c monitoring: harden grafana user dedupe 2026-01-21 12:30:08 -03:00
ae1fd5b661 monitoring: fix grafana user dedupe job 2026-01-21 12:25:53 -03:00
4e65f02fba monitoring: prepopulate vault for dedupe job 2026-01-21 12:18:57 -03:00
88de0f7cee monitoring: wire vault sa for dedupe job 2026-01-21 12:16:26 -03:00
08716c6be6 monitoring: use python dedupe job 2026-01-21 12:15:03 -03:00
a0caeb407c monitoring: dedupe grafana user via api 2026-01-21 12:11:28 -03:00
6eeb551239 monitoring: add grafana user dedupe job 2026-01-21 12:08:23 -03:00
98b063f2dd grafana: allow email-based oauth user lookup 2026-01-21 11:45:11 -03:00
698b2fd96b monitoring: refresh testing dashboard 2026-01-21 11:29:48 -03:00
flux-bot a9f6b04baa chore(maintenance): automated image update 2026-01-21 14:04:54 +00:00
flux-bot d8a3b5250e chore(bstein-dev-home): automated image update 2026-01-21 13:36:39 +00:00
flux-bot 4484fed039 chore(maintenance): automated image update 2026-01-21 13:35:55 +00:00
7cf5e7e39d flux: simplify image automation messages 2026-01-21 10:35:29 -03:00
4de4630911 flux: fix image automation templates 2026-01-21 10:34:25 -03:00
6ac3b41b30 flux: align image automation namespaces 2026-01-21 10:33:06 -03:00
810e4c0efb flux: align imagepolicy tag setters 2026-01-21 10:20:53 -03:00
5e4ed17942 maintenance: bump ariadne image tag 2026-01-21 05:03:26 -03:00
a41ac1548c maintenance: fix ariadne comms endpoints and exec RBAC 2026-01-21 04:05:41 -03:00
b87fe4899c maintenance: bump ariadne image tag 2026-01-21 03:53:34 -03:00
0efc1ed6c4 ariadne: split portal and ariadne db secrets 2026-01-21 03:39:17 -03:00
439d824300 vault: allow ariadne to read needed secrets 2026-01-21 03:21:01 -03:00
80a7ec26e2 rbac: allow ariadne to read cronjobs 2026-01-21 03:05:53 -03:00
0d4f14c397 keycloak: bump realm settings job name 2026-01-21 03:03:32 -03:00
fb6ddce0c7 glue: centralize sync tasks in ariadne 2026-01-21 02:57:40 -03:00
1fedb5ecbe maintenance: wire ariadne db and dashboards 2026-01-20 23:03:39 -03:00
0bb45bca83 jenkins: fix dark theme injection 2026-01-20 18:13:49 -03:00
c846d2c1ba ci: add root Jenkinsfile and update keycloak ldap job 2026-01-20 18:11:13 -03:00
163f98c594 jenkins: inline dark theme css 2026-01-20 18:00:36 -03:00
954d0d36b9 jenkins: mount init scripts into home 2026-01-20 17:54:47 -03:00
6db7521114 jenkins: add local dark theme css 2026-01-20 17:43:23 -03:00
13891e794a jenkins: rotate cache/plugin pvcs 2026-01-20 17:32:27 -03:00
1522b7a019 jenkins: keep cache/plugin pvc sizes to avoid shrink 2026-01-20 17:21:42 -03:00
5c40efdbcc jenkins: right-size pvc requests 2026-01-20 17:19:58 -03:00
9ac66919d5 jenkins: expand pvc sizes and move /tmp to memory 2026-01-20 17:09:23 -03:00
c80f26625d jenkins: move agent workspace off node disk 2026-01-20 17:04:24 -03:00
f5eec19e11 jenkins: automate notifyCommit token 2026-01-20 11:54:15 -03:00
b54da8e3e0 jenkins: fix scmTrigger spec field 2026-01-20 11:23:06 -03:00
9f6824ad56 jenkins: use scmTrigger for pipeline polls 2026-01-20 11:14:29 -03:00
0d3c5eb976 jenkins: use pollSCM for pipeline triggers 2026-01-20 11:07:54 -03:00
9cdf244d98 jenkins: drop legacy cleanup and update triggers 2026-01-20 10:59:51 -03:00
36ae49f1fc jenkins: clean legacy quality-gate job 2026-01-20 10:37:57 -03:00
b8d8240383 jenkins: fix webhook trigger DSL 2026-01-20 10:31:30 -03:00
fe30570b62 jenkins: pin oic-auth for core 2.528.3 2026-01-20 10:23:08 -03:00
8e9db51f9d jenkins: restore multibranch + webhook token 2026-01-20 10:15:33 -03:00
ea6e600007 jenkins: drop removed multibranch plugin 2026-01-20 09:45:33 -03:00
b8f2d00547 jenkins: pin root url for OIDC 2026-01-20 09:37:21 -03:00
132074f0ff gitea: allow jenkins webhook 2026-01-20 09:06:39 -03:00
56b36330b2 glue: preserve keycloak profile updates 2026-01-20 03:59:19 -03:00
557663f524 ci(jenkins): add Ariadne pipeline job 2026-01-20 03:30:48 -03:00
5fe8866623 ci(jenkins): add multibranch quality gate 2026-01-20 03:21:36 -03:00
e2e7e58f32 maintenance: extend Ariadne schedules and RBAC 2026-01-20 03:01:59 -03:00
95a7ac235f mailu: restart postfix after canonical map update 2026-01-20 02:38:04 -03:00
814d1ce211 mailu: keep podop socketmap in canonical maps 2026-01-20 02:37:02 -03:00
d996bda2c1 mailu: restart postfix to load canonical map 2026-01-20 02:32:43 -03:00
2bbbf019ff mailu: rewrite double-bounce to base domain 2026-01-20 02:30:44 -03:00
34fb371270 portal: rerun onboarding e2e job 2026-01-20 01:20:16 -03:00
14864a3b8c jenkins: align quality gate branch 2026-01-20 01:14:30 -03:00
cfcda87f67 jenkins: re-target quality gate and restart 2026-01-20 01:08:51 -03:00
cac8a3cdde mailu: recreate postfix on upgrade 2026-01-20 01:07:01 -03:00
3e0260b945 ci: pin quality gate agents to rpi5 2026-01-20 01:05:06 -03:00
a8be46b422 mailu: prefer postmark smtp token for relay 2026-01-20 01:04:04 -03:00
a86d68ca74 mailu: use postmark server token for relay 2026-01-20 00:58:04 -03:00
f527da9cdb chore(portal): rerun onboarding e2e 2026-01-20 00:09:49 -03:00
8be01698a9 chore(maintenance): bump ariadne image tag 2026-01-20 00:07:45 -03:00
278b4541a2 chore(portal): rerun onboarding e2e 2026-01-19 23:58:37 -03:00
7d999cc6c6 fix(mailu): pin sync workloads to arm64 2026-01-19 23:51:55 -03:00
cffe53edbe chore(portal): rerun onboarding e2e 2026-01-19 23:47:24 -03:00
1b2243e2a8 chore(maintenance): bump ariadne image tag 2026-01-19 23:45:48 -03:00
34c42cfb62 core: fix postmark DNS and time sync 2026-01-19 23:45:31 -03:00
84cd05b08a chore(portal): rerun onboarding e2e 2026-01-19 23:31:45 -03:00
9ff88f7f13 fix(mailu): allow forced sync 2026-01-19 23:28:07 -03:00
901f3e797c chore(portal): rerun onboarding e2e 2026-01-19 23:05:46 -03:00
4b0d8fb301 chore(maintenance): bump ariadne image tag 2026-01-19 23:04:59 -03:00
c1f0ea421d fix: extend mailu mailbox wait for ariadne 2026-01-19 22:49:23 -03:00
67e422f56f chore: rerun portal onboarding e2e 2026-01-19 22:42:14 -03:00
c7e81674b0 fix: point portal at ariadne service 2026-01-19 22:38:22 -03:00
cff3ed0759 chore: run portal onboarding e2e job 2026-01-19 22:35:29 -03:00
7171e5a9ea fix: unblock keycloak and refresh glue checks 2026-01-19 22:33:34 -03:00
776aea25f5 bstein-dev-home: bump images to 0.1.1-107 2026-01-19 22:11:38 -03:00
fbdf53a9a8 chore: add maintenance image automation 2026-01-19 22:03:50 -03:00
a0c3b9f953 feat: wire portal to ariadne 2026-01-19 19:22:53 -03:00
61619ddf77 fix: allow maintenance vault sync role 2026-01-19 19:07:00 -03:00
ff3ed195ac chore: centralize harbor pull credentials 2026-01-19 19:02:14 -03:00
bb41c219f6 feat: add Ariadne service and glue scheduling 2026-01-19 16:58:02 -03:00
791108723e flux: point atlas to feature/ariadne 2026-01-19 16:16:04 -03:00
c4ce7e3981 Merge pull request 'deploy' (#10) from deploy into main
Reviewed-on: #10
2026-01-19 19:03:59 +00:00
2c546f8eae Merge main into deploy 2026-01-19 16:03:29 -03:00
b09679a812 mailu-sync: bump job 2026-01-19 02:45:19 -03:00
89316a5901 vaultwarden: use mail hostname 2026-01-19 02:31:41 -03:00
35816115f8 vault: allow vaultwarden mailu secret 2026-01-19 02:23:16 -03:00
2802c1e8b6 vaultwarden: use mailu smtp creds 2026-01-19 02:17:16 -03:00
d943359606 mailu-sync: restart listener for update 2026-01-19 01:57:49 -03:00
21899b8a79 portal: tune vaultwarden backoff 2026-01-19 01:53:25 -03:00
bed3563ae6 mailu-sync: cap wait in listener 2026-01-19 01:53:13 -03:00
d5a19ca9c3 portal-e2e: add readiness checks 2026-01-19 01:40:42 -03:00
f4b08b93eb mailu: add portal sender mailbox 2026-01-19 01:40:27 -03:00
aaf7e23603 portal: allow firefly sync jobs 2026-01-19 01:21:56 -03:00
67203d1147 nextcloud-mail-sync: pin to arm64 workers 2026-01-19 01:14:29 -03:00
6935de7a6c portal: use mailu sender mailbox 2026-01-19 01:04:08 -03:00
fe9132e45e portal: use mailu smtp secret 2026-01-19 00:56:07 -03:00
b6609a9706 glue: fix portal smtp host and mail sync export 2026-01-19 00:37:42 -03:00
73c829c81f jenkins: restart to load new jobs 2026-01-18 21:26:05 -03:00
979470eeb8 ci: add glue tests and deploy gate 2026-01-18 21:23:11 -03:00
da200235bb monitoring: fix glue dashboard queries 2026-01-18 12:26:04 -03:00
ae3b0afbff nextcloud-mail-sync: harden auth, bump portal backend 2026-01-18 12:23:50 -03:00
0eb526c907 monitoring: label cronjob metrics and move grafana to arm64 2026-01-18 12:20:45 -03:00
c70054a30e monitoring: add atlas testing dashboard folder 2026-01-18 12:07:45 -03:00
084242746e monitoring: keep postmark exporter off titan-22 2026-01-18 11:52:36 -03:00
a5bec3e543 monitoring: avoid titan-22 for core pods 2026-01-18 11:43:28 -03:00
6e3faeb9fd monitoring: restore grafana persistence 2026-01-18 11:37:01 -03:00
0b15007e2c monitoring: disable grafana persistence to recover 2026-01-18 09:55:28 -03:00
435ed5d426 keycloak: bump jobs for postmark change 2026-01-18 09:27:18 -03:00
1fb3d179ef monitoring: add testing dashboard and switch postmark apikey 2026-01-18 09:21:33 -03:00
d7812623cd monitoring: add glue row and fix mail dns 2026-01-18 08:12:06 -03:00
4874ccda4d vaultwarden: pin to arm64 workers 2026-01-18 03:09:40 -03:00
8b8d2c4aa8 vaultwarden: add retry safeguards and db tuning 2026-01-18 03:00:24 -03:00
343d41ecc7 monitoring: add glue dashboard and tag cronjobs 2026-01-18 02:50:07 -03:00
a6ac0c363e nextcloud-mail-sync: harden keycloak fetch 2026-01-18 02:37:26 -03:00
0d27107411 mailu: backfill mailu_enabled for legacy users 2026-01-18 02:03:13 -03:00
c9cb088198 keycloak: rerun realm settings job 2026-01-18 01:58:17 -03:00
7cd2f3c587 vault: allow portal to read postmark relay 2026-01-18 01:17:52 -03:00
4c4c0867a7 bstein-dev-home: add smtp env for access requests 2026-01-18 01:14:15 -03:00
9c2cb1b037 mailu: preserve keycloak profile fields 2026-01-18 01:08:31 -03:00
418d201da0 mailu: gate sync to approved users 2026-01-18 00:47:38 -03:00
f753f114c7 bstein-dev-home: bump images to 0.1.1-102 2026-01-18 00:44:11 -03:00
74f089dc21 bstein-dev-home: bump images to 0.1.1-101 2026-01-18 00:33:09 -03:00
a9b94c87be comms: route live host login to mas 2026-01-17 20:49:11 -03:00
792b7b1417 comms: rerun mas local users and secrets jobs 2026-01-17 20:30:13 -03:00
0ddbb5ec79 comms: restart mas after db ensure 2026-01-17 20:27:11 -03:00
e64ba4ca3c comms: re-run mas db ensure 2026-01-17 20:23:32 -03:00
758610dff0 core: pin coredns to rpi workers 2026-01-17 20:15:51 -03:00
b576da53c2 comms: pin livekit token hostnames 2026-01-17 19:49:19 -03:00
f91459e55a comms: restart livekit to reload vault keys 2026-01-17 19:32:04 -03:00
e729adc6ef comms: drop livekit token host alias 2026-01-17 19:12:00 -03:00
96b93a1687 comms: use sh for Element host-config script 2026-01-17 18:38:36 -03:00
578ef5e830 comms: add Element host-config entrypoint script 2026-01-17 18:29:42 -03:00
ebb300b939 comms: mount host-specific Element config file 2026-01-17 18:22:36 -03:00
be10e01c2f comms: serve host-specific Element config alias 2026-01-17 18:16:45 -03:00
5f1b61d25e comms: pin guest rename job to rpi5 nodes 2026-01-17 18:04:53 -03:00
0e3c8ef952 comms: add harbor pull secret to vault serviceaccount 2026-01-17 17:57:57 -03:00
6997d5e202 comms: use guest-tools image for guest rename 2026-01-17 17:51:21 -03:00
f9830c6678 comms: prune stale guests after 14 days 2026-01-17 17:30:07 -03:00
1293ffe0a5 comms: pin mas/synapse host aliases for DNS 2026-01-17 17:21:46 -03:00
69d67b39a5 comms: make guest register server threaded 2026-01-17 16:59:57 -03:00
931e41a76f comms: harden guest register provisioning 2026-01-17 16:51:40 -03:00
f15b80872e comms: add default server name to element config 2026-01-17 16:31:53 -03:00
df3a56656d core: route budget and money to traefik 2026-01-17 08:16:57 -03:00
309931f7a5 finance: run firefly entrypoint after vault env 2026-01-17 08:12:14 -03:00
6cf46cf789 core: point internal dns at traefik service 2026-01-17 08:05:33 -03:00
16b7fcd120 finance: let firefly init nginx config 2026-01-17 07:54:27 -03:00
8192dfeebe platform: restore cert-manager and encrypt budget storage 2026-01-17 07:38:38 -03:00
71bab17665 comms: fix matrix login routing and prune guests 2026-01-17 07:32:57 -03:00
356dba3a33 core: add finance hosts to coredns 2026-01-17 06:56:45 -03:00
268a1d9449 sso: retry mas secret lookup 2026-01-17 03:29:36 -03:00
acfab6a150 sso: retry keycloak secret jobs 2026-01-17 03:24:30 -03:00
728f2cd2ee vault: pin cronjobs to service IP 2026-01-17 03:17:36 -03:00
ef5ac62544 vault: make retry helper resilient 2026-01-17 03:09:33 -03:00
ee622cbb0b finance: source firefly env in shell 2026-01-17 03:03:16 -03:00
a9c2d3c5e8 vault: retry vault cli operations 2026-01-17 03:00:25 -03:00
008130f8d0 finance: roll firefly after secrets 2026-01-17 02:59:38 -03:00
376eae3fa1 finance: migrate actual db before bootstrap 2026-01-17 02:55:20 -03:00
ba546bf63f portal: retry vaultwarden cred sync 2026-01-17 02:54:38 -03:00
84fa9e7dbc finance: prepare actual data dirs 2026-01-17 02:50:11 -03:00
9a3c3a3d3e vault: retry status checks in config jobs 2026-01-17 02:49:25 -03:00
36d0df817a finance: roll actual bootstrap 2026-01-17 02:46:16 -03:00
cee565892b finance: harden actual openid bootstrap 2026-01-17 02:43:25 -03:00
b0ac30e719 comms: retry mas local users and rerun 2026-01-17 02:43:15 -03:00
343165b2fa finance: drop dependency gating 2026-01-17 02:39:11 -03:00
3cf34b53e9 finance: bump actual server image 2026-01-17 02:36:08 -03:00
c5b8396bd8 comms: retry mas jobs and rerun 2026-01-17 02:34:36 -03:00
6028d82aa3 finance: expand actual openid env 2026-01-17 02:29:47 -03:00
1cc1b9bea5 comms: rerun mas-dependent jobs 2026-01-17 02:28:21 -03:00
3274b9257c comms: restart mas after db sync 2026-01-17 02:24:50 -03:00
1a3d35094e finance: switch vault seed to python 2026-01-17 02:22:59 -03:00
9047dfa3b5 finance: rerun secrets seed job 2026-01-17 02:17:29 -03:00
9dd2a72063 mailu: retry sync and rerun job 2026-01-17 02:16:13 -03:00
9eedcad520 finance: ensure vault init ordering 2026-01-17 02:10:28 -03:00
64d0a70191 finance: decouple from mailu readiness 2026-01-17 02:06:55 -03:00
cd60ebc982 mailu: bump sync job 2026-01-17 02:01:53 -03:00
928b2a8706 comms: bump mas admin secret job 2026-01-17 02:00:14 -03:00
7b009caf97 keycloak: bump portal admin secret job 2026-01-17 01:54:15 -03:00
86ea701ff0 jobs: bump names after affinity update 2026-01-17 01:52:16 -03:00
6ec0414fcd jobs: prefer arm64 workers 2026-01-17 01:47:53 -03:00
33e35193fb sso: harden keycloak jobs and rerun 2026-01-17 01:41:39 -03:00
1b4f46bb41 sso: rerun realm settings and vault oidc job 2026-01-17 01:36:48 -03:00
5eff31595e maintenance: add k3s agent restart daemonset 2026-01-17 01:28:13 -03:00
622c7acaa4 jobs: rerun keycloak realm + mas db ensure 2026-01-17 01:11:45 -03:00
8f990031f1 finance: fix vault seed job 2026-01-17 01:07:46 -03:00
a9351bc737 jobs: drop apk installs and prefer arm64 2026-01-17 01:02:58 -03:00
f4c6827c8c keycloak: bump realm settings job 2026-01-17 01:00:12 -03:00
62fa6ef371 finance: seed vault secrets 2026-01-17 00:54:49 -03:00
3e3061fe5b finance: add actual budget and firefly 2026-01-16 23:52:56 -03:00
354a803ff4 core: fix coredns tag 2026-01-16 23:27:04 -03:00
368dd81c5e core: use harbor coredns image 2026-01-16 23:25:28 -03:00
e1bd962956 core: manage coredns deployment 2026-01-16 23:16:04 -03:00
d9fabbf353 core: scale coredns replicas 2026-01-16 23:12:56 -03:00
55992ea48f longhorn: make settings job idempotent 2026-01-16 20:15:33 -03:00
42e987f4ee longhorn: apply settings via api job 2026-01-16 20:11:22 -03:00
71a1a55a01 longhorn: ensure settings via job 2026-01-16 20:05:36 -03:00
f8ffa830b7 longhorn: move images to infra project 2026-01-16 20:00:17 -03:00
8535d50faa longhorn: force image pulls during migration 2026-01-16 18:26:29 -03:00
dc62b4998b cert-manager: pin webhook and cainjector to rpi nodes 2026-01-16 18:17:40 -03:00
2f176d5a36 planka: allow project creation for all users 2026-01-16 17:58:20 -03:00
1fb7b27de4 keycloak: rerun realm and user overrides 2026-01-16 17:47:34 -03:00
b07f32e7c8 longhorn: pin vault sync to rpi workers 2026-01-16 17:45:29 -03:00
d9d31f7701 longhorn: allow kustomization to apply without waiting 2026-01-16 17:39:37 -03:00
1eb7d58259 keycloak: enforce bstein group membership 2026-01-16 17:36:07 -03:00
401df4d68c longhorn: use harbor mirrors and vault pull secret 2026-01-16 17:31:29 -03:00
4406724da5 longhorn: add helm repo and adopt workflow 2026-01-16 16:25:40 -03:00
7c3006736c traefik: add CRDs 2026-01-16 11:21:58 -03:00
9f3d2db63d platform: add cert-manager and align postgres vault path 2026-01-16 11:14:48 -03:00
beb646f78f jellyfin: move cache to emptyDir 2026-01-16 09:43:01 -03:00
4faa039a8e maintenance: avoid blocking on k3s traefik cleanup 2026-01-16 09:38:14 -03:00
ef504eea80 maintenance: allow traefik cleanup watch 2026-01-16 09:33:11 -03:00
671d4d5dce maintenance: cleanup k3s traefik and wger attrs 2026-01-16 09:27:22 -03:00
9474ab97f2 maintenance: disable k3s traefik; keycloak portal admin roles 2026-01-16 07:53:04 -03:00
cf5d7dfa00 jellyfin: set traefik tls annotations 2026-01-16 04:01:27 -03:00
5cd196e043 vault/keycloak: restore kv access and wger sync rbac 2026-01-16 03:46:07 -03:00
8ad9f0a664 vault: allow admin kv browse 2026-01-16 03:20:32 -03:00
f5231d282b vault: allow UI mount listing for admins 2026-01-16 02:06:31 -03:00
bb1bf3c017 fix ingress tls routing 2026-01-16 01:40:50 -03:00
b1489a8dd9 fix logging pipeline secret and scheduling 2026-01-16 00:15:58 -03:00
5816d4f399 comms: fix mas vault file paths 2026-01-15 23:56:32 -03:00
d90950b82e gitea: expose ssh via metallb shared IP 2026-01-15 16:39:04 -03:00
66e7e6acc5 core: add bstein.dev coredns overrides 2026-01-15 16:29:32 -03:00
7817248eb9 traefik: wire LB service to custom deployment 2026-01-15 11:26:46 -03:00
9993b501a6 logging: disable wait for data-prepper helmrelease 2026-01-15 04:47:07 -03:00
a2b2c7db9d keycloak: align smtp probe user 2026-01-15 04:44:35 -03:00
8db4b4f0b5 keycloak: rerun execute-actions email e2e 2026-01-15 04:37:12 -03:00
70a52dec06 bstein-dev-home: rerun onboarding e2e job 2026-01-15 04:35:06 -03:00
c759fb1dbb logging: fix data-prepper post-render patch 2026-01-15 04:27:25 -03:00
c0d0e64bc6 keycloak: rerun realm smtp config 2026-01-15 04:24:16 -03:00
5899c9acb3 vault: allow admin policy to update shared secrets 2026-01-15 04:17:14 -03:00
de6665c450 smtp: use mail.bstein.dev for app relays 2026-01-15 04:04:50 -03:00
e6210644c2 smtp: point services at mailu relay 2026-01-15 03:58:03 -03:00
c30f1fc587 vault: allow sso role to read portal admin secret 2026-01-15 03:46:58 -03:00
bf9a24681c fix: bump keycloak and portal e2e job names 2026-01-15 03:44:27 -03:00
69cee91dda vault: fix data-prepper pipeline and portal admin secret job 2026-01-15 03:42:57 -03:00
2ccc33b105 logging: patch data-prepper volume via json 2026-01-15 03:30:16 -03:00
760c9cbe6b logging: drop namespace from data-prepper patch 2026-01-15 03:27:36 -03:00
76151a082c logging: simplify data-prepper patch 2026-01-15 03:25:33 -03:00
c7fa52ab27 logging: use strategic patch for pipeline volume 2026-01-15 03:23:42 -03:00
88f862e18a logging: switch data-prepper volume to configmap 2026-01-15 03:17:07 -03:00
4dba510d6f logging: replace pipeline volume with configmap 2026-01-15 03:14:07 -03:00
9a9ecc4903 logging: patch data-prepper volume to configmap 2026-01-15 03:12:13 -03:00
a7998fc0bf bstein-dev-home: restore image automation setters 2026-01-15 03:11:57 -03:00
72d49f88fe nextcloud: fix cronjob shell flags 2026-01-15 03:08:01 -03:00
fb992f0cff logging: move data-prepper pipeline to configmap 2026-01-15 02:59:21 -03:00
53da4c20ab keycloak: stop writing oauth2-proxy secret 2026-01-15 02:37:04 -03:00
f9fa6dcbb4 crypto: drop wallet rpc bootstrap job 2026-01-15 02:31:31 -03:00
2ecd274f28 crypto: fix wallet rpc image 2026-01-15 02:26:54 -03:00
feb9d6997c vault: prepopulate oidc job 2026-01-15 02:22:52 -03:00
9e6673d02e vault: default oidc claims type 2026-01-15 02:20:53 -03:00
d69545cdb5 vault: harden oidc claims type 2026-01-15 02:18:50 -03:00
756a1af2e6 vault: allow oidc tuning 2026-01-15 02:16:55 -03:00
74a2b3e28d vault: use static token reviewer 2026-01-15 02:14:08 -03:00
84ccf35c44 flux: auto-update portal images on feature branch 2026-01-15 02:12:52 -03:00
e885c7d6ce vault: allow vault-admin token review 2026-01-15 02:09:34 -03:00
86c9951cc4 vault: add admin role for config jobs 2026-01-15 02:06:28 -03:00
85c3d9c2f7 vault: finalize sidecar migration 2026-01-15 01:52:24 -03:00
cd14e70d02 health: run wger sync with python3 2026-01-15 01:13:42 -03:00
f5a3894c2b mailu: use vault sidecar env 2026-01-15 01:02:41 -03:00
511403c4a6 bstein-dev-home: bump portal images 2026-01-15 00:47:51 -03:00
8fed4a08c5 health: allow portal wger sync 2026-01-15 00:41:28 -03:00
7f96daa7b8 comms: move synapse secrets to vault 2026-01-15 00:35:41 -03:00
139ca78c3d bstein-dev-home: bump portal images 2026-01-15 00:28:15 -03:00
836ce605b6 jellyfin: prefer gpu nodes by hostname 2026-01-14 23:56:02 -03:00
88be97d860 health: add nginx main config 2026-01-14 23:55:50 -03:00
35dcc5d66c health: run nginx directly 2026-01-14 23:47:23 -03:00
c1b771298a jellyfin: schedule on nvidia accelerators 2026-01-14 23:37:06 -03:00
e94ea272ce health: fix nginx pid path 2026-01-14 23:35:07 -03:00
81e79fd19a jellyfin: trim vault ldap template 2026-01-14 23:34:39 -03:00
3af97973e0 health: stabilize wger startup 2026-01-14 23:26:07 -03:00
0733127039 vault: sync oidc and wger env 2026-01-14 23:21:39 -03:00
82090c1953 vault: read oidc config from vault 2026-01-14 23:20:04 -03:00
6c8d3b24f2 jellyfin: read LDAP config from vault 2026-01-14 23:15:19 -03:00
d898c71c08 comms: mount synapse signing key 2026-01-14 22:59:11 -03:00
52cc04dee9 comms: mount vault signing key volume 2026-01-14 22:56:30 -03:00
98cdafb162 comms: keep redis env while injecting vault 2026-01-14 22:43:50 -03:00
0b21c8f40d vault: fix hyphenated key templates 2026-01-14 22:37:18 -03:00
e8d004c1b9 comms: fix synapse vault patch 2026-01-14 22:34:02 -03:00
c38f77302f vault: inject comms and grafana secrets 2026-01-14 22:29:27 -03:00
4bb6c7e212 health: fix wger env template newlines 2026-01-14 22:23:48 -03:00
e391a78f25 health: avoid surge rollout for wger 2026-01-14 22:16:36 -03:00
349a6cca3b health: load wger secrets without shell expansion 2026-01-14 22:11:55 -03:00
71f533ca1f harbor: fix vault env templates 2026-01-14 22:07:51 -03:00
9652d9d3cf health: escape wger env vars and fix nginx temp paths 2026-01-14 22:03:40 -03:00
22e3004b0a harbor: preserve required volume mounts 2026-01-14 21:29:40 -03:00
9743064ad3 vault: keep copy loop from clobbering args 2026-01-14 21:24:16 -03:00
8a750ac3ab harbor: fix vault secretKey file path 2026-01-14 21:17:05 -03:00
eeeb69fb7a harbor: mount vault entrypoint script 2026-01-14 21:02:50 -03:00
713fedfe73 harbor: move secrets to vault sidecars 2026-01-14 20:46:46 -03:00
c98d24e91e jenkins: load vault env via env 2026-01-14 17:57:10 -03:00
4ff2f3e889 jenkins: escape vault env values 2026-01-14 17:53:09 -03:00
bb9a4e6d8b longhorn: read oauth2-proxy secrets from vault 2026-01-14 17:48:12 -03:00
fb671865e5 vault: inject remaining services with wrappers 2026-01-14 17:29:09 -03:00
fb9578b624 vault: inject monitoring exporter and health jobs 2026-01-14 14:49:41 -03:00
4f1fb62ab3 vault: bump job names for injector 2026-01-14 14:33:57 -03:00
98d67293bc vault: prepopulate injector for jobs 2026-01-14 14:29:29 -03:00
f6fc250fe1 comms: add vault-secrets emptyDir for mas 2026-01-14 14:24:55 -03:00
393916ded9 comms: shorten vault inject file names 2026-01-14 14:21:58 -03:00
e92cfa7dba vault: move comms and mailu workloads to injector 2026-01-14 14:17:26 -03:00
d559aeb464 keycloak: schedule on arm64 workers 2026-01-14 13:49:37 -03:00
6ba509dbe1 gitea: tolerate oidc init failures 2026-01-14 13:46:34 -03:00
ab50780f49 gitea: trim vault secret newlines 2026-01-14 13:43:56 -03:00
9c16d0fbc0 keycloak: bump job names 2026-01-14 13:42:08 -03:00
89f4b0dbdf vault: stabilize injector templates and add health apps 2026-01-14 13:40:29 -03:00
58c880d9ce keycloak: switch jobs to vault injector 2026-01-14 13:20:57 -03:00
92fbde08eb nextcloud: fix vault template keys 2026-01-14 13:00:21 -03:00
0aa16757e9 gitea: run vault init first 2026-01-14 12:44:49 -03:00
36fb225cbd bstein-dev-home: bump onboarding job 2026-01-14 12:34:02 -03:00
16c62d5a4a vault: move core apps to injector 2026-01-14 12:28:10 -03:00
1add32e683 infra: add vault injector 2026-01-14 11:46:13 -03:00
b1f9df4d83 vault: sync harbor pulls 2026-01-14 10:07:31 -03:00
b8e50bb0a6 monitoring: move grafana smtp to vault 2026-01-14 06:41:34 -03:00
37302664c2 vault: add remaining secret syncs 2026-01-14 06:16:42 -03:00
5683b3f941 jobs: bump names after vault tweaks 2026-01-14 05:47:21 -03:00
9ec08e1dc2 jobs: drop apk in kubectl image 2026-01-14 05:41:01 -03:00
6898641b0a comms: restore livekit token env 2026-01-14 05:35:51 -03:00
35369d53d8 jobs: bump names for immutability 2026-01-14 05:32:07 -03:00
96a7c67674 mailu: bump sync job name 2026-01-14 05:11:27 -03:00
de3db3133b vault(consumption): sync secrets via CSI 2026-01-14 05:07:23 -03:00
8d526e383f vault: send oidc role payload as json 2026-01-14 03:45:03 -03:00
bb2a3ba904 fix(gitea): inline vault secrets 2026-01-14 03:11:53 -03:00
3384533acd fix: resolve gitea mounts and bump portal job 2026-01-14 03:00:10 -03:00
4111fb079f vault: write bound_claims as file 2026-01-14 02:56:29 -03:00
fd2ae6bdd5 vault: wire more services to CSI 2026-01-14 02:54:59 -03:00
8a358832f3 vault: fix oidc scopes parsing 2026-01-14 02:52:51 -03:00
c3541b72c3 vault: run oidc config with sh 2026-01-14 02:28:38 -03:00
55234f8536 vault: align oidc roles with keycloak 2026-01-14 02:24:32 -03:00
50aec198a4 fix: detect vault initialized state correctly 2026-01-14 01:42:28 -03:00
cb5796cb71 fix: make vault k8s auth script posix 2026-01-14 01:38:27 -03:00
5a9ceeab24 fix: run vault k8s auth config with sh 2026-01-14 01:35:06 -03:00
b82195f2d7 feat: start vault consumption for outline and planka 2026-01-14 01:30:41 -03:00
1d894ea80f keycloak: fix harbor oidc job 2026-01-14 01:24:18 -03:00
537d304b36 keycloak: bump harbor oidc job 2026-01-14 01:22:30 -03:00
e776f004c9 keycloak: ensure harbor oidc scope 2026-01-14 01:21:08 -03:00
8fa38268d9 chore: refresh knowledge catalog headers 2026-01-14 01:08:05 -03:00
4a1c4766b8 feat: add harbor/vault oidc automation 2026-01-14 01:07:47 -03:00
bcc15c3e0a monitoring: allow grafana upgrade remediation 2026-01-13 21:18:42 -03:00
0b5dcde3a3 monitoring: align victoria-metrics PVC size 2026-01-13 21:15:10 -03:00
46777f9ec9 comms: restart atlasbot after MAS fixes 2026-01-13 21:09:41 -03:00
98554e5fa4 comms: rerun mas local user seed 2026-01-13 21:06:45 -03:00
b97146f4d1 comms: disable synapse oidc with MAS 2026-01-13 21:04:29 -03:00
928b9379d8 comms: disable synapse password auth with MAS 2026-01-13 21:02:19 -03:00
b710f45e5c comms: fix synapse runtime config injection 2026-01-13 20:59:35 -03:00
e6a3ae5f7b comms: restore MAS and OIDC secrets in synapse 2026-01-13 20:55:36 -03:00
71fd00d845 comms: fix signing key job permissions 2026-01-13 20:49:11 -03:00
fa8ec588a8 comms: add debug logging for signing key job 2026-01-13 20:47:54 -03:00
47f0d1736e comms: retry synapse signing key job 2026-01-13 20:45:14 -03:00
098a06e723 comms: seed synapse signing key for helm 2026-01-13 20:42:30 -03:00
bcef167b50 harbor: enable keycloak oidc settings 2026-01-13 20:42:26 -03:00
fbde129d4c fix(bstein-dev-home): drop invalid image overrides 2026-01-13 20:27:50 -03:00
4332ded0c3 comms: drop legacy synapse configmaps 2026-01-13 20:07:51 -03:00
bbe5ded0a6 comms: bump ensure job names for new images 2026-01-13 20:03:11 -03:00
4602656578 vault: prep helm releases and image pins 2026-01-13 19:29:14 -03:00
8ee7d046d2 ops: prepare vault-consumption branch 2026-01-13 19:01:07 -03:00
b7798db4f1 flux: track main 2026-01-13 17:57:10 -03:00
c2bc8a9512 Merge branch 'feature/postgres-migration' 2026-01-13 17:53:25 -03:00
07fde43749 platform: move postgres to infrastructure 2026-01-13 17:53:04 -03:00
730b9775a3 Merge pull request 'feature/sso-hardening' (#9) from feature/sso-hardening into main
Reviewed-on: #9
2026-01-13 20:23:24 +00:00
407 changed files with 33278 additions and 7816 deletions

2 .gitignore vendored

@@ -6,3 +6,5 @@ __pycache__/
*.py[cod]
.pytest_cache
.venv
.venv-ci
tmp/

77 Jenkinsfile vendored Normal file

@ -0,0 +1,77 @@
// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery.
pipeline {
agent {
kubernetes {
defaultContainer 'python'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: python
image: python:3.12-slim
command:
- cat
tty: true
"""
}
}
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Glue tests') {
steps {
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
steps {
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
}
echo "Flux branch: ${env.FLUX_BRANCH}"
}
}
}
stage('Promote') {
when {
expression {
def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
}
}
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}
}
}

76
ci/Jenkinsfile.titan-iac Normal file
View File

@ -0,0 +1,76 @@
pipeline {
agent {
kubernetes {
defaultContainer 'python'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: python
image: python:3.12-slim
command:
- cat
tty: true
"""
}
}
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Glue tests') {
steps {
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
steps {
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
}
echo "Flux branch: ${env.FLUX_BRANCH}"
}
}
}
stage('Promote') {
when {
expression {
def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
}
}
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}
}
}
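For reference, a structured equivalent of the awk one-liner in the Resolve Flux branch stage, handy when sanity-checking the promote gate locally. It assumes the repo root as the working directory; PyYAML is already pinned in ci/requirements.txt.

import yaml

# Load all documents in the Flux sync manifest and pull spec.ref.branch
# from the GitRepository, which is what the awk '/branch:/' match targets.
with open("clusters/atlas/flux-system/gotk-sync.yaml", encoding="utf-8") as handle:
    docs = list(yaml.safe_load_all(handle))

git_repo = next(doc for doc in docs if doc and doc.get("kind") == "GitRepository")
print(git_repo["spec"]["ref"]["branch"])  # e.g. feature/ariadne, per the gotk-sync.yaml diff below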

4
ci/requirements.txt Normal file
View File

@ -0,0 +1,4 @@
pytest==8.3.4
kubernetes==30.1.0
PyYAML==6.0.2
requests==2.32.3

16
ci/tests/glue/config.yaml Normal file
View File

@ -0,0 +1,16 @@
max_success_age_hours: 48
allow_suspended:
- bstein-dev-home/vaultwarden-cred-sync
- comms/othrys-room-reset
- comms/pin-othrys-invite
- comms/seed-othrys-room
- finance/firefly-user-sync
- health/wger-admin-ensure
- health/wger-user-sync
- mailu-mailserver/mailu-sync-nightly
- nextcloud/nextcloud-mail-sync
ariadne_schedule_tasks:
- schedule.mailu_sync
- schedule.nextcloud_sync
- schedule.vaultwarden_sync
- schedule.wger_admin

View File

@ -0,0 +1,46 @@
from __future__ import annotations

from datetime import datetime, timezone
from pathlib import Path

import yaml
from kubernetes import client, config

CONFIG_PATH = Path(__file__).with_name("config.yaml")


def _load_config() -> dict:
    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
        return yaml.safe_load(handle) or {}


def _load_kube():
    try:
        config.load_incluster_config()
    except config.ConfigException:
        config.load_kube_config()


def test_glue_cronjobs_recent_success():
    cfg = _load_config()
    max_age_hours = int(cfg.get("max_success_age_hours", 48))
    allow_suspended = set(cfg.get("allow_suspended", []))
    _load_kube()
    batch = client.BatchV1Api()
    cronjobs = batch.list_cron_job_for_all_namespaces(label_selector="atlas.bstein.dev/glue=true").items
    assert cronjobs, "No glue cronjobs found with atlas.bstein.dev/glue=true"
    now = datetime.now(timezone.utc)
    for cronjob in cronjobs:
        name = f"{cronjob.metadata.namespace}/{cronjob.metadata.name}"
        if cronjob.spec.suspend:
            assert name in allow_suspended, f"{name} is suspended but not in allow_suspended"
            continue
        last_success = cronjob.status.last_successful_time
        assert last_success is not None, f"{name} has no lastSuccessfulTime"
        age_hours = (now - last_success).total_seconds() / 3600
        assert age_hours <= max_age_hours, f"{name} last success {age_hours:.1f}h ago"

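A CronJob opts into this check via the atlas.bstein.dev/glue=true label. A minimal sketch of labeling an existing job with the Python client; the job name is taken from the allow_suspended list above:

from kubernetes import client, config

config.load_kube_config()  # or load_incluster_config() inside the cluster
client.BatchV1Api().patch_namespaced_cron_job(
    name="mailu-sync-nightly",
    namespace="mailu-mailserver",
    body={"metadata": {"labels": {"atlas.bstein.dev/glue": "true"}}},
)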
View File

@ -0,0 +1,48 @@
from __future__ import annotations

import os
from pathlib import Path

import requests
import yaml

VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
CONFIG_PATH = Path(__file__).with_name("config.yaml")


def _load_config() -> dict:
    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
        return yaml.safe_load(handle) or {}


def _query(promql: str) -> list[dict]:
    response = requests.get(f"{VM_URL}/api/v1/query", params={"query": promql}, timeout=10)
    response.raise_for_status()
    payload = response.json()
    return payload.get("data", {}).get("result", [])


def test_glue_metrics_present():
    series = _query('kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}')
    assert series, "No glue cronjob label series found"


def test_glue_metrics_success_join():
    query = (
        "kube_cronjob_status_last_successful_time "
        'and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
    )
    series = _query(query)
    assert series, "No glue cronjob last success series found"


def test_ariadne_schedule_metrics_present():
    cfg = _load_config()
    expected = cfg.get("ariadne_schedule_tasks", [])
    if not expected:
        return
    series = _query("ariadne_schedule_next_run_timestamp_seconds")
    tasks = {item.get("metric", {}).get("task") for item in series}
    missing = [task for task in expected if task not in tasks]
    assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"

View File

@ -1,13 +0,0 @@
# clusters/atlas/applications/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../services/crypto
- ../../services/gitea
- ../../services/jellyfin
- ../../services/comms
- ../../services/monitoring
- ../../services/logging
- ../../services/pegasus
- ../../services/vault
- ../../services/bstein-dev-home

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: bstein-dev-home-migrations
namespace: flux-system
spec:
interval: 10m
path: ./services/bstein-dev-home/oneoffs/migrations
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: bstein-dev-home
wait: false
suspend: true

View File

@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: bstein-dev-home
namespace: flux-system
namespace: bstein-dev-home
spec:
interval: 1m0s
sourceRef:
@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: main
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
messageTemplate: "chore(bstein-dev-home): automated image update"
push:
branch: main
branch: feature/ariadne
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -1,4 +1,4 @@
# clusters/atlas/flux-system/applications/communication/kustomization.yaml
# clusters/atlas/flux-system/applications/comms/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -0,0 +1,24 @@
# clusters/atlas/flux-system/applications/finance/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: finance
namespace: flux-system
spec:
interval: 10m
path: ./services/finance
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: finance
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: actual-budget
namespace: finance
- apiVersion: apps/v1
kind: Deployment
name: firefly
namespace: finance
wait: false

View File

@ -13,11 +13,6 @@ spec:
kind: GitRepository
name: flux-system
namespace: flux-system
healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: harbor
namespace: harbor
wait: false
dependsOn:
- name: core

View File

@ -0,0 +1,25 @@
# clusters/atlas/flux-system/applications/health/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: health
namespace: flux-system
spec:
interval: 10m
path: ./services/health
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: health
dependsOn:
- name: keycloak
- name: postgres
- name: traefik
- name: vault
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: wger
namespace: health
wait: false

View File

@ -12,10 +12,12 @@ resources:
- pegasus/image-automation.yaml
- bstein-dev-home/kustomization.yaml
- bstein-dev-home/image-automation.yaml
- bstein-dev-home-migrations/kustomization.yaml
- harbor/kustomization.yaml
- harbor/image-automation.yaml
- jellyfin/kustomization.yaml
- xmr-miner/kustomization.yaml
- wallet-monero-temp/kustomization.yaml
- sui-metrics/kustomization.yaml
- openldap/kustomization.yaml
- keycloak/kustomization.yaml
@ -25,6 +27,7 @@ resources:
- ai-llm/kustomization.yaml
- nextcloud/kustomization.yaml
- nextcloud-mail-sync/kustomization.yaml
- postgres/kustomization.yaml
- outline/kustomization.yaml
- planka/kustomization.yaml
- finance/kustomization.yaml
- health/kustomization.yaml

View File

@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: pegasus
namespace: flux-system
namespace: jellyfin
spec:
interval: 1m0s
sourceRef:

View File

@ -0,0 +1,19 @@
# clusters/atlas/flux-system/applications/wallet-monero-temp/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: wallet-monero-temp
namespace: flux-system
spec:
interval: 10m
path: ./services/crypto/wallet-monero-temp
targetNamespace: crypto
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: crypto
- name: xmr-miner
wait: true

View File

@ -1,3 +1,4 @@
# clusters/atlas/flux-system/gotk-components.yaml
---
# This manifest was generated by flux. DO NOT EDIT.
# Flux Version: v2.7.5

View File

@ -1,3 +1,4 @@
# clusters/atlas/flux-system/gotk-sync.yaml
# This manifest was generated by flux. DO NOT EDIT.
---
apiVersion: source.toolkit.fluxcd.io/v1
@ -8,7 +9,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: main
branch: feature/ariadne
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/platform/cert-manager-cleanup/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: cert-manager-cleanup
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/cert-manager/cleanup
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: cert-manager
wait: true

View File

@ -0,0 +1,19 @@
# clusters/atlas/flux-system/platform/cert-manager/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: cert-manager
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/cert-manager
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: cert-manager
dependsOn:
- name: helm
wait: true

View File

@ -4,11 +4,17 @@ kind: Kustomization
resources:
- core/kustomization.yaml
- helm/kustomization.yaml
- cert-manager/kustomization.yaml
- metallb/kustomization.yaml
- traefik/kustomization.yaml
- gitops-ui/kustomization.yaml
- monitoring/kustomization.yaml
- logging/kustomization.yaml
- maintenance/kustomization.yaml
- maintenance/image-automation.yaml
- longhorn-adopt/kustomization.yaml
- longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml
- postgres/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml
- ../platform/vault-injector/kustomization.yaml

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/platform/longhorn-adopt/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: longhorn-adopt
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/longhorn/adopt
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: longhorn-system
wait: true

View File

@ -15,4 +15,5 @@ spec:
namespace: flux-system
dependsOn:
- name: core
- name: longhorn
wait: true

View File

@ -0,0 +1,20 @@
# clusters/atlas/flux-system/platform/longhorn/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: longhorn
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/longhorn/core
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: longhorn-system
dependsOn:
- name: helm
- name: longhorn-adopt
wait: false

View File

@ -0,0 +1,26 @@
# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: maintenance
namespace: maintenance
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: feature/ariadne
update:
strategy: Setters
path: services/maintenance

View File

@ -8,6 +8,7 @@ spec:
interval: 10m
path: ./services/maintenance
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system

View File

@ -1,4 +1,4 @@
# clusters/atlas/flux-system/applications/postgres/kustomization.yaml
# clusters/atlas/flux-system/platform/postgres/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
@ -6,7 +6,7 @@ metadata:
namespace: flux-system
spec:
interval: 10m
path: ./services/postgres
path: ./infrastructure/postgres
prune: true
force: true
sourceRef:

View File

@ -0,0 +1,16 @@
# clusters/atlas/flux-system/platform/vault-injector/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: vault-injector
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/vault-injector
targetNamespace: vault
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true

View File

@ -1,8 +0,0 @@
# clusters/atlas/platform/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../../infrastructure/modules/base
- ../../../infrastructure/modules/profiles/atlas-ha
- ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
- ../../../infrastructure/metallb

View File

@ -0,0 +1,5 @@
FROM python:3.11-slim
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
RUN pip install --no-cache-dir requests psycopg2-binary

View File

@ -0,0 +1,9 @@
FROM registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/harbor/entrypoint.sh"]

View File

@ -0,0 +1,9 @@
FROM registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/harbor/entrypoint.sh"]

View File

@ -0,0 +1,9 @@
FROM registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/home/harbor/entrypoint.sh"]

View File

@ -0,0 +1,9 @@
FROM registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/home/harbor/start.sh"]

View File

@ -0,0 +1,10 @@
FROM ghcr.io/element-hq/lk-jwt-service:0.3.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /lk-jwt-service /lk-jwt-service
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/lk-jwt-service"]

View File

@ -0,0 +1,10 @@
FROM quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /bin/oauth2-proxy /bin/oauth2-proxy
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/bin/oauth2-proxy"]

View File

@ -0,0 +1,10 @@
FROM registry.bstein.dev/streaming/pegasus:1.2.32 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /pegasus /pegasus
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/pegasus"]

View File

@ -0,0 +1,34 @@
#!/bin/sh
set -eu
if [ -n "${VAULT_ENV_FILE:-}" ]; then
if [ -f "${VAULT_ENV_FILE}" ]; then
# shellcheck disable=SC1090
. "${VAULT_ENV_FILE}"
else
echo "Vault env file not found: ${VAULT_ENV_FILE}" >&2
exit 1
fi
fi
if [ -n "${VAULT_COPY_FILES:-}" ]; then
old_ifs="$IFS"
IFS=','
for pair in ${VAULT_COPY_FILES}; do
src="${pair%%:*}"
dest="${pair#*:}"
if [ -z "${src}" ] || [ -z "${dest}" ]; then
echo "Vault copy entry malformed: ${pair}" >&2
exit 1
fi
if [ ! -f "${src}" ]; then
echo "Vault file not found: ${src}" >&2
exit 1
fi
mkdir -p "$(dirname "${dest}")"
cp "${src}" "${dest}"
done
IFS="$old_ifs"
fi
exec "$@"
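The VAULT_COPY_FILES contract is a comma-separated list of src:dest pairs, split on the first colon. A quick Python sketch of the same parsing, useful for unit-testing the convention:

def parse_copy_pairs(spec: str) -> list[tuple[str, str]]:
    pairs = []
    for pair in spec.split(","):
        src, sep, dest = pair.partition(":")  # first-colon split, like ${pair%%:*} / ${pair#*:}
        if not (sep and src and dest):
            raise ValueError(f"Vault copy entry malformed: {pair!r}")
        pairs.append((src, dest))
    return pairs

assert parse_copy_pairs("/vault/secrets/a:/etc/app/a,/vault/secrets/b:/etc/app/b") == [
    ("/vault/secrets/a", "/etc/app/a"),
    ("/vault/secrets/b", "/etc/app/b"),
]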

View File

@ -0,0 +1,40 @@
# infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: cert-manager-cleanup-2
namespace: cert-manager
spec:
backoffLimit: 1
template:
spec:
serviceAccountName: cert-manager-cleanup
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: cleanup
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/cert_manager_cleanup.sh"]
volumeMounts:
- name: script
mountPath: /scripts
readOnly: true
volumes:
- name: script
configMap:
name: cert-manager-cleanup-script
defaultMode: 0555

View File

@ -0,0 +1,58 @@
# infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: cert-manager-cleanup
namespace: cert-manager
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cert-manager-cleanup
rules:
- apiGroups: [""]
resources:
- pods
- services
- endpoints
- configmaps
- secrets
- serviceaccounts
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["apps"]
resources:
- deployments
- daemonsets
- statefulsets
- replicasets
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["batch"]
resources:
- jobs
- cronjobs
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources:
- roles
- rolebindings
- clusterroles
- clusterrolebindings
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- validatingwebhookconfigurations
- mutatingwebhookconfigurations
verbs: ["get", "list", "watch", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cert-manager-cleanup
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cert-manager-cleanup
subjects:
- kind: ServiceAccount
name: cert-manager-cleanup
namespace: cert-manager

View File

@ -0,0 +1,15 @@
# infrastructure/cert-manager/cleanup/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- cert-manager-cleanup-rbac.yaml
- cert-manager-cleanup-job.yaml
configMapGenerator:
- name: cert-manager-cleanup-script
namespace: cert-manager
files:
- cert_manager_cleanup.sh=scripts/cert_manager_cleanup.sh
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,5 @@
# infrastructure/cert-manager/cleanup/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager

View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -euo pipefail
namespace="cert-manager"
selectors=(
"app.kubernetes.io/name=cert-manager"
"app.kubernetes.io/instance=cert-manager"
"app.kubernetes.io/instance=certmanager-prod"
)
delete_namespaced() {
local selector="$1"
kubectl -n "${namespace}" delete deployment,daemonset,statefulset,replicaset \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete pod,service,endpoints,serviceaccount,configmap,secret \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete role,rolebinding \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete job,cronjob \
--selector "${selector}" --ignore-not-found --wait=false
}
delete_cluster_scoped() {
local selector="$1"
kubectl delete clusterrole,clusterrolebinding \
--selector "${selector}" --ignore-not-found --wait=false
kubectl delete mutatingwebhookconfiguration,validatingwebhookconfiguration \
--selector "${selector}" --ignore-not-found --wait=false
}
for selector in "${selectors[@]}"; do
delete_namespaced "${selector}"
delete_cluster_scoped "${selector}"
done
kubectl delete mutatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false
kubectl delete validatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false

View File

@ -0,0 +1,67 @@
# infrastructure/cert-manager/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: cert-manager
namespace: cert-manager
spec:
interval: 30m
chart:
spec:
chart: cert-manager
version: v1.17.0
sourceRef:
kind: HelmRepository
name: jetstack
namespace: flux-system
install:
crds: CreateReplace
remediation: { retries: 3 }
timeout: 10m
upgrade:
crds: CreateReplace
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
installCRDs: true
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
webhook:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
cainjector:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4

View File

@ -0,0 +1,6 @@
# infrastructure/cert-manager/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrelease.yaml

View File

@ -0,0 +1,5 @@
# infrastructure/cert-manager/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager

View File

@ -0,0 +1,47 @@
# infrastructure/core/coredns-custom.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: coredns-custom
namespace: kube-system
data:
bstein-dev.server: |
bstein.dev:53 {
errors
cache 30
hosts {
192.168.22.9 alerts.bstein.dev
192.168.22.9 auth.bstein.dev
192.168.22.9 bstein.dev
10.43.6.87 budget.bstein.dev
192.168.22.9 call.live.bstein.dev
192.168.22.9 cd.bstein.dev
192.168.22.9 chat.ai.bstein.dev
192.168.22.9 ci.bstein.dev
192.168.22.9 cloud.bstein.dev
192.168.22.9 health.bstein.dev
192.168.22.9 kit.live.bstein.dev
192.168.22.9 live.bstein.dev
192.168.22.9 logs.bstein.dev
192.168.22.9 longhorn.bstein.dev
192.168.22.4 mail.bstein.dev
192.168.22.9 matrix.live.bstein.dev
192.168.22.9 metrics.bstein.dev
192.168.22.9 monero.bstein.dev
10.43.6.87 money.bstein.dev
192.168.22.9 notes.bstein.dev
192.168.22.9 office.bstein.dev
192.168.22.9 pegasus.bstein.dev
3.136.224.193 pm-bounces.bstein.dev
3.150.68.49 pm-bounces.bstein.dev
18.189.137.81 pm-bounces.bstein.dev
192.168.22.9 registry.bstein.dev
192.168.22.9 scm.bstein.dev
192.168.22.9 secret.bstein.dev
192.168.22.9 sso.bstein.dev
192.168.22.9 stream.bstein.dev
192.168.22.9 tasks.bstein.dev
192.168.22.9 vault.bstein.dev
fallthrough
}
}

View File

@ -0,0 +1,141 @@
# infrastructure/core/coredns-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: coredns
namespace: kube-system
labels:
k8s-app: kube-dns
kubernetes.io/name: CoreDNS
spec:
progressDeadlineSeconds: 600
replicas: 2
revisionHistoryLimit: 0
selector:
matchLabels:
k8s-app: kube-dns
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 25%
maxUnavailable: 1
template:
metadata:
labels:
k8s-app: kube-dns
spec:
containers:
- name: coredns
image: registry.bstein.dev/infra/coredns:1.12.1
imagePullPolicy: IfNotPresent
args:
- -conf
- /etc/coredns/Corefile
ports:
- containerPort: 53
name: dns
protocol: UDP
- containerPort: 53
name: dns-tcp
protocol: TCP
- containerPort: 9153
name: metrics
protocol: TCP
livenessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
readinessProbe:
httpGet:
path: /ready
port: 8181
scheme: HTTP
periodSeconds: 2
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
resources:
limits:
memory: 170Mi
requests:
cpu: 100m
memory: 70Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- NET_BIND_SERVICE
drop:
- all
readOnlyRootFilesystem: true
volumeMounts:
- name: config-volume
mountPath: /etc/coredns
readOnly: true
- name: custom-config-volume
mountPath: /etc/coredns/custom
readOnly: true
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
- key: node-role.kubernetes.io/worker
operator: In
values:
- "true"
dnsPolicy: Default
nodeSelector:
kubernetes.io/os: linux
priorityClassName: system-cluster-critical
restartPolicy: Always
schedulerName: default-scheduler
serviceAccountName: coredns
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
k8s-app: kube-dns
- maxSkew: 1
topologyKey: topology.kubernetes.io/zone
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
k8s-app: kube-dns
volumes:
- name: config-volume
configMap:
name: coredns
defaultMode: 420
items:
- key: Corefile
path: Corefile
- key: NodeHosts
path: NodeHosts
- name: custom-config-volume
configMap:
name: coredns-custom
optional: true
defaultMode: 420

View File

@ -4,5 +4,8 @@ kind: Kustomization
resources:
- ../modules/base
- ../modules/profiles/atlas-ha
- coredns-custom.yaml
- coredns-deployment.yaml
- ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -0,0 +1,50 @@
# infrastructure/core/ntp-sync-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: ntp-sync
namespace: kube-system
labels:
app: ntp-sync
spec:
selector:
matchLabels:
app: ntp-sync
template:
metadata:
labels:
app: ntp-sync
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: DoesNotExist
- key: node-role.kubernetes.io/master
operator: DoesNotExist
containers:
- name: ntp-sync
image: public.ecr.aws/docker/library/busybox:1.36.1
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
set -eu
while true; do
ntpd -q -p pool.ntp.org || true
sleep 300
done
securityContext:
capabilities:
add: ["SYS_TIME"]
runAsUser: 0
runAsGroup: 0
resources:
requests:
cpu: 10m
memory: 16Mi
limits:
cpu: 50m
memory: 64Mi

View File

@ -0,0 +1,15 @@
# infrastructure/longhorn/adopt/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- longhorn-adopt-rbac.yaml
- longhorn-helm-adopt-job.yaml
configMapGenerator:
- name: longhorn-helm-adopt-script
namespace: longhorn-system
files:
- longhorn_helm_adopt.sh=scripts/longhorn_helm_adopt.sh
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,56 @@
# infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-helm-adopt
namespace: longhorn-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: longhorn-helm-adopt
rules:
- apiGroups: [""]
resources:
- configmaps
- services
- serviceaccounts
- secrets
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["apps"]
resources:
- deployments
- daemonsets
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["batch"]
resources:
- jobs
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources:
- roles
- rolebindings
- clusterroles
- clusterrolebindings
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["apiextensions.k8s.io"]
resources:
- customresourcedefinitions
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["scheduling.k8s.io"]
resources:
- priorityclasses
verbs: ["get", "list", "watch", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: longhorn-helm-adopt
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: longhorn-helm-adopt
subjects:
- kind: ServiceAccount
name: longhorn-helm-adopt
namespace: longhorn-system

View File

@ -0,0 +1,40 @@
# infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-helm-adopt-2
namespace: longhorn-system
spec:
backoffLimit: 1
template:
spec:
serviceAccountName: longhorn-helm-adopt
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: adopt
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/longhorn_helm_adopt.sh"]
volumeMounts:
- name: script
mountPath: /scripts
readOnly: true
volumes:
- name: script
configMap:
name: longhorn-helm-adopt-script
defaultMode: 0555

View File

@ -0,0 +1,5 @@
# infrastructure/longhorn/adopt/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: longhorn-system

View File

@ -0,0 +1,52 @@
#!/usr/bin/env bash
set -euo pipefail
release_name="longhorn"
release_namespace="longhorn-system"
selector="app.kubernetes.io/instance=${release_name}"
annotate_and_label() {
local scope="$1"
local kind="$2"
if [ "${scope}" = "namespaced" ]; then
kubectl -n "${release_namespace}" annotate "${kind}" -l "${selector}" \
meta.helm.sh/release-name="${release_name}" \
meta.helm.sh/release-namespace="${release_namespace}" \
--overwrite >/dev/null 2>&1 || true
kubectl -n "${release_namespace}" label "${kind}" -l "${selector}" \
app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
else
kubectl annotate "${kind}" -l "${selector}" \
meta.helm.sh/release-name="${release_name}" \
meta.helm.sh/release-namespace="${release_namespace}" \
--overwrite >/dev/null 2>&1 || true
kubectl label "${kind}" -l "${selector}" \
app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
fi
}
namespaced_kinds=(
configmap
service
serviceaccount
deployment
daemonset
job
role
rolebinding
)
cluster_kinds=(
clusterrole
clusterrolebinding
customresourcedefinition
priorityclass
)
for kind in "${namespaced_kinds[@]}"; do
annotate_and_label "namespaced" "${kind}"
done
for kind in "${cluster_kinds[@]}"; do
annotate_and_label "cluster" "${kind}"
done
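Helm treats a resource as adopted once it carries the release-name/release-namespace annotations plus the managed-by=Helm label, which is all this script applies in bulk. A single-resource sketch of the same patch via the Python client; the DaemonSet name here is illustrative only:

from kubernetes import client, config

config.load_kube_config()
adopt_patch = {"metadata": {
    "annotations": {
        "meta.helm.sh/release-name": "longhorn",
        "meta.helm.sh/release-namespace": "longhorn-system",
    },
    "labels": {"app.kubernetes.io/managed-by": "Helm"},
}}
# longhorn-manager is a hypothetical example; the script patches every kind
# matching the app.kubernetes.io/instance=longhorn selector.
client.AppsV1Api().patch_namespaced_daemon_set("longhorn-manager", "longhorn-system", adopt_patch)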

View File

@ -0,0 +1,80 @@
# infrastructure/longhorn/core/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: longhorn
namespace: longhorn-system
spec:
interval: 30m
chart:
spec:
chart: longhorn
version: 1.8.2
sourceRef:
kind: HelmRepository
name: longhorn
namespace: flux-system
install:
crds: Skip
remediation: { retries: 3 }
timeout: 15m
upgrade:
crds: Skip
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 15m
values:
service:
ui:
type: NodePort
nodePort: 30824
privateRegistry:
createSecret: false
registrySecret: longhorn-registry
image:
pullPolicy: Always
longhorn:
engine:
repository: registry.bstein.dev/infra/longhorn-engine
tag: v1.8.2
manager:
repository: registry.bstein.dev/infra/longhorn-manager
tag: v1.8.2
ui:
repository: registry.bstein.dev/infra/longhorn-ui
tag: v1.8.2
instanceManager:
repository: registry.bstein.dev/infra/longhorn-instance-manager
tag: v1.8.2
shareManager:
repository: registry.bstein.dev/infra/longhorn-share-manager
tag: v1.8.2
backingImageManager:
repository: registry.bstein.dev/infra/longhorn-backing-image-manager
tag: v1.8.2
supportBundleKit:
repository: registry.bstein.dev/infra/longhorn-support-bundle-kit
tag: v0.0.56
csi:
attacher:
repository: registry.bstein.dev/infra/longhorn-csi-attacher
tag: v4.9.0
provisioner:
repository: registry.bstein.dev/infra/longhorn-csi-provisioner
tag: v5.3.0
nodeDriverRegistrar:
repository: registry.bstein.dev/infra/longhorn-csi-node-driver-registrar
tag: v2.14.0
resizer:
repository: registry.bstein.dev/infra/longhorn-csi-resizer
tag: v1.13.2
snapshotter:
repository: registry.bstein.dev/infra/longhorn-csi-snapshotter
tag: v8.2.0
livenessProbe:
repository: registry.bstein.dev/infra/longhorn-livenessprobe
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: Always

View File

@ -0,0 +1,18 @@
# infrastructure/longhorn/core/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- vault-serviceaccount.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- helmrelease.yaml
- longhorn-settings-ensure-job.yaml
configMapGenerator:
- name: longhorn-settings-ensure-script
files:
- longhorn_settings_ensure.sh=scripts/longhorn_settings_ensure.sh
generatorOptions:
disableNameSuffixHash: true

View File

@ -0,0 +1,36 @@
# infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-settings-ensure-4
namespace: longhorn-system
spec:
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
spec:
serviceAccountName: longhorn-service-account
restartPolicy: Never
volumes:
- name: longhorn-settings-ensure-script
configMap:
name: longhorn-settings-ensure-script
defaultMode: 0555
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
- key: node-role.kubernetes.io/worker
operator: Exists
containers:
- name: apply
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/scripts/longhorn_settings_ensure.sh"]
volumeMounts:
- name: longhorn-settings-ensure-script
mountPath: /scripts
readOnly: true

View File

@ -0,0 +1,5 @@
# infrastructure/longhorn/core/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: longhorn-system

View File

@ -0,0 +1,42 @@
#!/usr/bin/env sh
set -eu
# Longhorn blocks direct CR patches for some settings; use the internal API instead.
api_base="http://longhorn-backend.longhorn-system.svc:9500/v1/settings"
wait_for_api() {
attempts=30
while [ "${attempts}" -gt 0 ]; do
if curl -fsS "${api_base}" >/dev/null 2>&1; then
return 0
fi
attempts=$((attempts - 1))
sleep 2
done
echo "Longhorn API not ready after retries." >&2
return 1
}
update_setting() {
name="$1"
value="$2"
current="$(curl -fsS "${api_base}/${name}" || true)"
if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
echo "Setting ${name} already set."
return 0
fi
echo "Setting ${name} -> ${value}"
curl -fsS -X PUT \
-H "Content-Type: application/json" \
-d "{\"value\":\"${value}\"}" \
"${api_base}/${name}" >/dev/null
}
wait_for_api
update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v1.8.2"
update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"
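The same idempotent read-then-PUT against the Longhorn settings API, sketched in Python for comparison (requests assumed available in the running environment):

import requests

API = "http://longhorn-backend.longhorn-system.svc:9500/v1/settings"

def update_setting(name: str, value: str) -> None:
    current = requests.get(f"{API}/{name}", timeout=10).json()
    if current.get("value") == value:
        return  # already set; skip the write, like the grep guard above
    response = requests.put(f"{API}/{name}", json={"value": value}, timeout=10)
    response.raise_for_status()

update_setting("default-engine-image", "registry.bstein.dev/infra/longhorn-engine:v1.8.2")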

View File

@ -0,0 +1,21 @@
# infrastructure/longhorn/core/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
name: longhorn-vault
namespace: longhorn-system
spec:
provider: vault
parameters:
vaultAddress: "http://vault.vault.svc.cluster.local:8200"
roleName: "longhorn"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson

View File

@ -0,0 +1,6 @@
# infrastructure/longhorn/core/vault-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-vault-sync
namespace: longhorn-system

View File

@ -0,0 +1,45 @@
# infrastructure/longhorn/core/vault-sync-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: longhorn-vault-sync
namespace: longhorn-system
spec:
replicas: 1
selector:
matchLabels:
app: longhorn-vault-sync
template:
metadata:
labels:
app: longhorn-vault-sync
spec:
serviceAccountName: longhorn-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 80
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5", "rpi4"]
containers:
- name: sync
image: alpine:3.20
command: ["/bin/sh", "-c"]
args:
- "sleep infinity"
volumeMounts:
- name: vault-secrets
mountPath: /vault/secrets
readOnly: true
volumes:
- name: vault-secrets
csi:
driver: secrets-store.csi.k8s.io
readOnly: true
volumeAttributes:
secretProviderClass: longhorn-vault

View File

@ -2,6 +2,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- serviceaccount.yaml
- oauth2-proxy-longhorn.yaml
- middleware.yaml
- ingress.yaml
- oauth2-proxy-longhorn.yaml

View File

@ -32,7 +32,18 @@ spec:
metadata:
labels:
app: oauth2-proxy-longhorn
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "longhorn"
vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/longhorn/oauth2-proxy"
vault.hashicorp.com/agent-inject-template-oidc-config: |
{{- with secret "kv/data/atlas/longhorn/oauth2-proxy" -}}
client_id = "{{ .Data.data.client_id }}"
client_secret = "{{ .Data.data.client_secret }}"
cookie_secret = "{{ .Data.data.cookie_secret }}"
{{- end -}}
spec:
serviceAccountName: longhorn-vault
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
@ -50,6 +61,7 @@ spec:
imagePullPolicy: IfNotPresent
args:
- --provider=oidc
- --config=/vault/secrets/oidc-config
- --redirect-url=https://longhorn.bstein.dev/oauth2/callback
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email groups
@ -69,22 +81,6 @@ spec:
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=longhorn.bstein.dev
env:
- name: OAUTH2_PROXY_CLIENT_ID
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_id
- name: OAUTH2_PROXY_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_secret
- name: OAUTH2_PROXY_COOKIE_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: cookie_secret
ports:
- containerPort: 4180
name: http

View File

@ -0,0 +1,6 @@
# infrastructure/longhorn/ui-ingress/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-vault
namespace: longhorn-system

View File

@ -0,0 +1,47 @@
# infrastructure/metallb/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: metallb
namespace: metallb-system
spec:
interval: 30m
chart:
spec:
chart: metallb
version: 0.15.3
sourceRef:
kind: HelmRepository
name: metallb
namespace: flux-system
install:
crds: CreateReplace
remediation: { retries: 3 }
timeout: 10m
upgrade:
crds: CreateReplace
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
loadBalancerClass: metallb
prometheus:
metricsPort: 7472
controller:
logLevel: info
webhookMode: enabled
tlsMinVersion: VersionTLS12
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5
speaker:
logLevel: info

View File

@ -3,8 +3,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- metallb-rendered.yaml
- helmrelease.yaml
- ippool.yaml
patchesStrategicMerge:
- patches/node-placement.yaml
- patches/speaker-loglevel.yaml

File diff suppressed because it is too large

View File

@ -1,27 +0,0 @@
# infrastructure/metallb/patches/node-placement.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: metallb-controller
namespace: metallb-system
spec:
template:
spec:
containers:
- name: controller
args:
- --port=7472
- --log-level=info
- --webhook-mode=enabled
- --tls-min-version=VersionTLS12
- --lb-class=metallb
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5

View File

@ -1,15 +0,0 @@
# infrastructure/metallb/patches/speaker-loglevel.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: metallb-speaker
namespace: metallb-system
spec:
template:
spec:
containers:
- name: speaker
args:
- --port=7472
- --log-level=info
- --lb-class=metallb

View File

@ -0,0 +1,24 @@
# infrastructure/modules/base/storageclass/asteria-encrypted.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: asteria-encrypted
parameters:
diskSelector: asteria
fromBackup: ""
numberOfReplicas: "2"
staleReplicaTimeout: "30"
fsType: "ext4"
replicaAutoBalance: "least-effort"
dataLocality: "disabled"
encrypted: "true"
csi.storage.k8s.io/provisioner-secret-name: ${pvc.name}
csi.storage.k8s.io/provisioner-secret-namespace: ${pvc.namespace}
csi.storage.k8s.io/node-publish-secret-name: ${pvc.name}
csi.storage.k8s.io/node-publish-secret-namespace: ${pvc.namespace}
csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}
csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace}
provisioner: driver.longhorn.io
reclaimPolicy: Retain
allowVolumeExpansion: true
volumeBindingMode: Immediate
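The ${pvc.name}/${pvc.namespace} secret parameters mean every PVC using asteria-encrypted needs a same-named Secret in its own namespace holding the encryption passphrase. A sketch under Longhorn's documented key convention (verify the key names against the deployed 1.8.2 release; all names below are hypothetical):

from kubernetes import client, config

config.load_kube_config()
secret = client.V1Secret(
    # Must match the PVC's name and namespace for the StorageClass templating to resolve.
    metadata=client.V1ObjectMeta(name="demo-data", namespace="demo"),
    string_data={
        "CRYPTO_KEY_VALUE": "example-passphrase",  # placeholder; source from Vault in practice
        "CRYPTO_KEY_PROVIDER": "secret",
    },
)
client.CoreV1Api().create_namespaced_secret("demo", secret)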

View File

@ -3,4 +3,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- asteria.yaml
- asteria-encrypted.yaml
- astreae.yaml

View File

@ -1,4 +1,4 @@
# services/postgres/kustomization.yaml
# infrastructure/postgres/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: postgres

View File

@ -1,4 +1,4 @@
# services/postgres/namespace.yaml
# infrastructure/postgres/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:

View File

@ -1,4 +1,4 @@
# services/postgres/secretproviderclass.yaml
# infrastructure/postgres/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
@ -11,5 +11,5 @@ spec:
roleName: "postgres"
objects: |
- objectName: "postgres_password"
secretPath: "kv/data/postgres"
secretPath: "kv/data/atlas/postgres/postgres-db"
secretKey: "POSTGRES_PASSWORD"

View File

@ -0,0 +1,23 @@
# infrastructure/postgres/service.yaml
apiVersion: v1
kind: Service
metadata:
name: postgres-service
namespace: postgres
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9187"
prometheus.io/path: "/metrics"
spec:
clusterIP: None
ports:
- name: postgres
port: 5432
protocol: TCP
targetPort: 5432
- name: metrics
port: 9187
protocol: TCP
targetPort: 9187
selector:
app: postgres

View File

@ -1,4 +1,4 @@
# services/postgres/serviceaccount.yaml
# infrastructure/postgres/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:

View File

@ -1,4 +1,4 @@
# services/postgres/statefulset.yaml
# infrastructure/postgres/statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
@ -58,6 +58,23 @@ spec:
- name: vault-secrets
mountPath: /mnt/vault
readOnly: true
- name: postgres-exporter
image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0
ports:
- name: metrics
containerPort: 9187
protocol: TCP
env:
- name: DATA_SOURCE_URI
value: "localhost:5432/postgres?sslmode=disable"
- name: DATA_SOURCE_USER
value: postgres
- name: DATA_SOURCE_PASS_FILE
value: /mnt/vault/postgres_password
volumeMounts:
- name: vault-secrets
mountPath: /mnt/vault
readOnly: true
volumes:
- name: vault-secrets
csi:

View File

@ -1,10 +1,11 @@
# infrastructure/sources/cert-manager/letsencrypt-prod.yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
email: brad.stein@gmail.com
email: brad@bstein.dev
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-prod-account-key

View File

@ -1,10 +1,11 @@
# infrastructure/sources/cert-manager/letsencrypt.yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt
spec:
acme:
email: brad.stein@gmail.com
email: brad@bstein.dev
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-account-key

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/ananace.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: ananace
namespace: flux-system
spec:
interval: 1h
url: https://ananace.gitlab.io/charts

View File

@ -2,15 +2,18 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ananace.yaml
- fluent-bit.yaml
- grafana.yaml
- hashicorp.yaml
- jetstack.yaml
- jenkins.yaml
- mailu.yaml
- metallb.yaml
- opentelemetry.yaml
- opensearch.yaml
- harbor.yaml
- longhorn.yaml
- prometheus.yaml
- victoria-metrics.yaml
- secrets-store-csi.yaml

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/longhorn.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: longhorn
namespace: flux-system
spec:
interval: 30m
url: https://charts.longhorn.io

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/metallb.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: metallb
namespace: flux-system
spec:
interval: 1h
url: https://metallb.github.io/metallb

File diff suppressed because it is too large

View File

@ -27,6 +27,8 @@ items:
creationTimestamp: null
labels:
app: traefik
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
spec:
containers:
- args:

View File

@ -5,6 +5,7 @@ metadata:
name: traefik
namespace: flux-system
resources:
- crds.yaml
- deployment.yaml
- serviceaccount.yaml
- clusterrole.yaml

View File

@ -3,9 +3,10 @@ apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: kube-system
namespace: traefik
annotations:
metallb.universe.tf/address-pool: communication-pool
metallb.universe.tf/allow-shared-ip: traefik
spec:
type: LoadBalancer
loadBalancerClass: metallb
@ -20,5 +21,4 @@ spec:
targetPort: websecure
protocol: TCP
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
app: traefik

View File

@ -17,4 +17,5 @@ spec:
values:
syncSecret:
enabled: true
enableSecretRotation: false
enableSecretRotation: true
rotationPollInterval: 2m

View File

@ -0,0 +1,43 @@
# infrastructure/vault-injector/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: vault-injector
namespace: vault
spec:
interval: 30m
chart:
spec:
chart: vault
version: 0.31.0
sourceRef:
kind: HelmRepository
name: hashicorp
namespace: flux-system
install:
remediation: { retries: 3 }
timeout: 10m
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
global:
externalVaultAddr: http://vault.vault.svc.cluster.local:8200
tlsDisable: true
server:
enabled: false
csi:
enabled: false
injector:
enabled: true
replicas: 1
agentImage:
repository: hashicorp/vault
tag: "1.17.6"
webhook:
failurePolicy: Ignore
nodeSelector:
node-role.kubernetes.io/worker: "true"

View File

@ -0,0 +1,5 @@
# infrastructure/vault-injector/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrelease.yaml

View File

@ -1,8 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 7,
"http_endpoints": 35,
"services": 44,
"workloads": 49
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -17,6 +17,11 @@ flowchart LR
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
@ -37,6 +42,11 @@ flowchart LR
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@ -47,15 +57,22 @@ flowchart LR
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
host_live_bstein_dev --> svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
@ -65,21 +82,25 @@ flowchart LR
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
@ -110,6 +131,11 @@ flowchart LR
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
@ -133,23 +159,30 @@ flowchart LR
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web
svc_comms_othrys_synapse_matrix_synapse
wl_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
@ -160,6 +193,10 @@ flowchart LR
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
@ -173,6 +210,14 @@ flowchart LR
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy

View File

@ -70,6 +70,7 @@ WORKER_NODES = [
"titan-13",
"titan-14",
"titan-15",
"titan-16",
"titan-17",
"titan-18",
"titan-19",
@ -85,19 +86,17 @@ WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_NAMESPACES = [
"kube-system",
"longhorn-system",
"metallb-system",
INFRA_PATTERNS = [
"kube-.*",
".*-system",
"traefik",
"monitoring",
"logging",
"cert-manager",
"flux-system",
"traefik",
"maintenance",
"postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
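
The switch from an exact-name list to patterns means any `*-system` namespace is treated as infrastructure automatically, without maintaining individual entries. A minimal sketch of the classification, using Python's re module and namespace names from this cluster:

import re

INFRA_PATTERNS = ["kube-.*", ".*-system", "traefik", "monitoring", "logging",
                  "cert-manager", "flux-system", "maintenance", "postgres"]
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"

# "longhorn-system" and "metallb-system" no longer need their own entries;
# ".*-system" covers them and any future -system namespace.
for ns in ["longhorn-system", "metallb-system", "kube-node-lease", "jellyfin"]:
    print(ns, bool(re.match(INFRA_REGEX, ns)))
# Only jellyfin prints False, i.e. it still counts as a workload namespace.
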
@ -209,7 +208,66 @@ def namespace_ram_raw(scope_var):
def namespace_gpu_usage_instant(scope_var):
return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
return gpu_usage_by_namespace(scope_var)
def jetson_gpu_util_by_node():
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
def dcgm_gpu_util_by_node():
dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
return (
"avg by (node) ("
f"{dcgm_ns} * on(namespace,pod) group_left(node) "
'kube_pod_info{namespace="monitoring"}'
")"
)
def gpu_util_by_node():
return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"
def gpu_util_by_hostname():
return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'
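
The Hostname relabel is presumably there so the merged DCGM+Jetson per-node series can feed panels that previously keyed on DCGM's Hostname label (the GPU Util by Node panel below switches to this helper). A tiny sketch of what the label_replace does to one illustrative series:

# PromQL: label_replace(<expr>, "Hostname", "$1", "node", "(.*)")
series = {"node": "titan-21", "value": 42.0}
series["Hostname"] = series["node"]  # copies node into Hostname; both labels remain
print(series)  # {'node': 'titan-21', 'value': 42.0, 'Hostname': 'titan-21'}
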
def gpu_node_labels():
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
def gpu_requests_by_namespace_node(scope_var):
return (
"sum by (namespace,node) ("
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
"* on(namespace,pod) group_left(node) kube_pod_info "
f"* on(node) group_left() ({gpu_node_labels()})"
")"
)
def gpu_usage_by_namespace(scope_var):
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() ({gpu_util_by_node()})"
")"
)
def jetson_gpu_usage_by_namespace(scope_var):
requests_by_ns = jetson_gpu_requests(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() {jetson_gpu_util_by_node()}"
")"
)
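
Both *_usage_by_namespace helpers split a node's measured utilization across namespaces in proportion to their GPU requests on that node, with clamp_min guarding the empty case. The same arithmetic in plain Python, with made-up numbers:

node_util = 80.0                         # measured GPU util on one node (percent)
requests = {"jellyfin": 1, "comms": 3}   # nvidia.com/gpu requests by namespace
total = max(sum(requests.values()), 1)   # clamp_min(total, 1): no divide-by-zero

usage = {ns: req / total * node_util for ns, req in requests.items()}
print(usage)  # {'jellyfin': 20.0, 'comms': 60.0}
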
def namespace_share_expr(resource_expr):
@ -229,7 +287,7 @@ def namespace_gpu_share_expr(scope_var):
usage = namespace_gpu_usage_instant(scope_var)
total = f"(sum({usage}) or on() vector(0))"
share = f"100 * ({usage}) / clamp_min({total}, 1)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
return f"({share}) or ({idle})"
@ -319,6 +377,76 @@ NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})"
GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})"
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
GLUE_STALE_WINDOW_SEC = 36 * 3600
GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)"
GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)"
GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)"
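
The `unless on(namespace,cronjob)` chains are set subtraction: stale jobs have an old success, missing jobs have no recorded success at all, and suspended jobs are excluded from both before counting. The same logic with plain Python sets and illustrative job names:

labeled     = {"glue/sync-a", "glue/sync-b", "glue/sync-c"}  # kube_cronjob_labels{glue="true"}
has_success = {"glue/sync-a", "glue/sync-b"}                 # last_successful_time present
stale       = {"glue/sync-b"}                                # success age > 36h
suspended   = {"glue/sync-c"}                                # spec.suspend == 1

stale_active   = stale - suspended                    # GLUE_STALE_ACTIVE
missing_active = (labeled - has_success) - suspended  # GLUE_MISSING_ACTIVE
print(len(stale_active) + len(missing_active))        # GLUE_STALE_COUNT analogue -> 1
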
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))'
ARIADNE_TASK_WARNINGS_SERIES = (
'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
ARIADNE_TEST_SUCCESS_RATE = (
"100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
"/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
)
ARIADNE_TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
)
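
ARIADNE_TEST_SUCCESS_RATE reads the highest per-result test counts seen over 30 days and guards the denominator with clamp_min so the expression stays defined before any CI run has reported. With illustrative counts:

passed, failed, error = 124, 3, 1   # illustrative max_over_time values per result
rate = 100 * passed / max(passed + failed + error, 1)   # clamp_min(denominator, 1)
print(f"{rate:.1f}%")   # 96.9%
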
POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
)
POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
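
POSTGRES_CONN_USED merges two otherwise-unrelated aggregates into one instant vector and tags each with a synthetic conn label, so the stat panel's {{conn}} legend can tell them apart. The panel effectively receives two series like these (values illustrative):

series = [
    {"conn": "used", "value": 37.0},   # from sum(pg_stat_activity_count)
    {"conn": "max",  "value": 200.0},  # from max(pg_settings_max_connections)
]
for s in series:
    print(f'{s["conn"]}: {s["value"]:.0f}')
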
ONEOFF_JOB_OWNER = (
'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
)
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
ONEOFF_JOB_POD_AGE_HOURS = (
'((time() - kube_pod_start_time{pod!=""}) / 3600) '
f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
'* on(namespace,pod) group_left(phase) '
'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
)
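
ONEOFF_JOB_PODS keeps Job-owned pods whose Job is not itself owned by a CronJob, i.e. pods from manually launched one-off jobs; the age expression then reports hours since pod start for Running or Succeeded pods only. Roughly, with made-up pods:

import time

now = time.time()
pods = {  # (namespace, pod) -> (start time, phase); names and ages are made up
    ("jobs", "backfill-x7k2"): (now - 50 * 3600, "Succeeded"),
    ("jobs", "debug-shell"):   (now - 2 * 3600, "Running"),
    ("jobs", "oom-repro"):     (now - 1 * 3600, "Failed"),
}
ages = {k: (now - start) / 3600 for k, (start, phase) in pods.items()
        if phase in ("Running", "Succeeded")}
print(ages)  # the 50h Succeeded pod would hit the red 48h step in age_thresholds
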
GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600"
GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600"
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@ -496,6 +624,7 @@ def timeseries_panel(
grid,
*,
unit="none",
max_value=None,
legend=None,
legend_display="table",
legend_placement="bottom",
@ -520,6 +649,8 @@ def timeseries_panel(
"tooltip": {"mode": "multi"},
},
}
if max_value is not None:
panel["fieldConfig"]["defaults"]["max"] = max_value
if legend:
panel["targets"][0]["legendFormat"] = legend
if legend_calcs:
@ -671,13 +802,22 @@ def bargauge_panel(
grid,
*,
unit="none",
legend=None,
links=None,
limit=None,
sort_order="desc",
thresholds=None,
decimals=None,
instant=False,
overrides=None,
):
"""Return a bar gauge panel with label-aware reduction."""
cleaned_expr = expr.strip()
if not cleaned_expr.startswith(("sort(", "sort_desc(")):
if sort_order == "desc":
expr = f"sort_desc({expr})"
elif sort_order == "asc":
expr = f"sort({expr})"
panel = {
"id": panel_id,
"type": "bargauge",
@ -685,7 +825,12 @@ def bargauge_panel(
"datasource": PROM_DS,
"gridPos": grid,
"targets": [
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
{
"expr": expr,
"refId": "A",
"legendFormat": legend or "{{node}}",
**({"instant": True} if instant else {}),
}
],
"fieldConfig": {
"defaults": {
@ -715,6 +860,8 @@ def bargauge_panel(
},
},
}
if overrides:
panel["fieldConfig"]["overrides"].extend(overrides)
if decimals is not None:
panel["fieldConfig"]["defaults"]["decimals"] = decimals
if links:
@ -723,7 +870,7 @@ def bargauge_panel(
panel["transformations"] = [
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"},
"options": {"fields": ["Value"], "order": sort_order},
}
]
if limit:
@ -763,6 +910,15 @@ def build_overview():
{"color": "red", "value": 3},
],
}
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
row1_stats = [
{
@ -965,7 +1121,7 @@ def build_overview():
30,
"Mail Sent (1d)",
'max(postmark_outbound_sent{window="1d"})',
{"h": 2, "w": 6, "x": 0, "y": 8},
{"h": 3, "w": 4, "x": 0, "y": 8},
unit="none",
links=link_to("atlas-mail"),
)
@ -976,7 +1132,7 @@ def build_overview():
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": PROM_DS,
"gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
"gridPos": {"h": 3, "w": 4, "x": 8, "y": 8},
"targets": [
{
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@ -1022,7 +1178,7 @@ def build_overview():
32,
"Mail Success Rate (1d)",
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
{"h": 2, "w": 6, "x": 6, "y": 8},
{"h": 3, "w": 4, "x": 4, "y": 8},
unit="percent",
thresholds=mail_success_thresholds,
decimals=1,
@ -1034,13 +1190,38 @@ def build_overview():
33,
"Mail Limit Used (30d)",
"max(postmark_sending_limit_used_percent)",
{"h": 2, "w": 6, "x": 18, "y": 8},
{"h": 3, "w": 4, "x": 12, "y": 8},
unit="percent",
thresholds=mail_limit_thresholds,
decimals=1,
links=link_to("atlas-mail"),
)
)
panels.append(
stat_panel(
34,
"Postgres Connections Used",
POSTGRES_CONN_USED,
{"h": 3, "w": 4, "x": 16, "y": 8},
decimals=0,
text_mode="name_and_value",
legend="{{conn}}",
instant=True,
)
)
panels.append(
stat_panel(
35,
"Postgres Hottest Connections",
POSTGRES_CONN_HOTTEST,
{"h": 3, "w": 4, "x": 20, "y": 8},
unit="none",
decimals=0,
text_mode="name_and_value",
legend="{{datname}}",
instant=True,
)
)
storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
@ -1054,13 +1235,104 @@ def build_overview():
panel_id,
title,
expr,
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
{"h": 3, "w": 6, "x": 6 * idx, "y": 11},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
panels.append(
bargauge_panel(
40,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 6, "w": 6, "x": 0, "y": 14},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=8,
decimals=2,
)
)
panels.append(
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
timeseries_panel(
42,
"Ariadne Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent",
max_value=100,
legend=None,
legend_display="list",
)
)
panels.append(
bargauge_panel(
43,
"Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14},
unit="none",
instant=True,
legend="{{result}}",
overrides=[
{
"matcher": {"id": "byName", "options": "error"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
},
{
"matcher": {"id": "byName", "options": "failed"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
},
],
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 5},
{"color": "red", "value": 10},
],
},
)
)
cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram"
@ -1070,9 +1342,9 @@ def build_overview():
11,
"Namespace CPU Share",
namespace_cpu_share_expr(cpu_scope),
{"h": 9, "w": 8, "x": 0, "y": 16},
{"h": 9, "w": 8, "x": 0, "y": 20},
links=namespace_scope_links("namespace_scope_cpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
)
)
panels.append(
@ -1080,9 +1352,9 @@ def build_overview():
12,
"Namespace GPU Share",
namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 16},
{"h": 9, "w": 8, "x": 8, "y": 20},
links=namespace_scope_links("namespace_scope_gpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
)
)
panels.append(
@ -1090,9 +1362,9 @@ def build_overview():
13,
"Namespace RAM Share",
namespace_ram_share_expr(ram_scope),
{"h": 9, "w": 8, "x": 16, "y": 16},
{"h": 9, "w": 8, "x": 16, "y": 20},
links=namespace_scope_links("namespace_scope_ram"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
)
)
@ -1102,7 +1374,7 @@ def build_overview():
14,
"Worker Node CPU",
node_cpu_expr(worker_filter),
{"h": 12, "w": 12, "x": 0, "y": 32},
{"h": 12, "w": 12, "x": 0, "y": 36},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1116,7 +1388,7 @@ def build_overview():
15,
"Worker Node RAM",
node_mem_expr(worker_filter),
{"h": 12, "w": 12, "x": 12, "y": 32},
{"h": 12, "w": 12, "x": 12, "y": 36},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1131,7 +1403,7 @@ def build_overview():
16,
"Control plane CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 0, "y": 44},
{"h": 10, "w": 12, "x": 0, "y": 48},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1143,7 +1415,7 @@ def build_overview():
17,
"Control plane RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 12, "y": 44},
{"h": 10, "w": 12, "x": 12, "y": 48},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1156,7 +1428,7 @@ def build_overview():
28,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 54},
{"h": 10, "w": 12, "x": 0, "y": 58},
)
)
panels.append(
@ -1164,7 +1436,7 @@ def build_overview():
29,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 54},
{"h": 10, "w": 12, "x": 12, "y": 58},
unit="none",
limit=12,
decimals=0,
@ -1186,7 +1458,7 @@ def build_overview():
18,
"Cluster Ingress Throughput",
NET_INGRESS_EXPR,
{"h": 7, "w": 8, "x": 0, "y": 25},
{"h": 7, "w": 8, "x": 0, "y": 29},
unit="Bps",
legend="Ingress (Traefik)",
legend_display="list",
@ -1199,7 +1471,7 @@ def build_overview():
19,
"Cluster Egress Throughput",
NET_EGRESS_EXPR,
{"h": 7, "w": 8, "x": 8, "y": 25},
{"h": 7, "w": 8, "x": 8, "y": 29},
unit="Bps",
legend="Egress (Traefik)",
legend_display="list",
@ -1212,7 +1484,7 @@ def build_overview():
20,
"Intra-Cluster Throughput",
NET_INTERNAL_EXPR,
{"h": 7, "w": 8, "x": 16, "y": 25},
{"h": 7, "w": 8, "x": 16, "y": 29},
unit="Bps",
legend="Internal traffic",
legend_display="list",
@ -1226,7 +1498,7 @@ def build_overview():
21,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 16, "w": 12, "x": 0, "y": 64},
{"h": 16, "w": 12, "x": 0, "y": 68},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1241,7 +1513,7 @@ def build_overview():
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 64},
{"h": 16, "w": 12, "x": 12, "y": 68},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
links=link_to("atlas-storage"),
@ -1727,7 +1999,7 @@ def build_storage_dashboard():
stat_panel(
31,
"Maintenance Cron Freshness (s)",
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})',
{"h": 4, "w": 12, "x": 12, "y": 44},
unit="s",
thresholds={
@ -2136,6 +2408,285 @@ def build_mail_dashboard():
}
def build_jobs_dashboard():
panels = []
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
recent_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 6},
{"color": "green", "value": 24},
],
}
task_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
}
panels.append(
bargauge_panel(
1,
"Ariadne Task Errors (range)",
ARIADNE_TASK_ERRORS_RANGE,
{"h": 7, "w": 8, "x": 0, "y": 0},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
{
"id": 2,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
bargauge_panel(
3,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 7, "w": 8, "x": 16, "y": 0},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=12,
decimals=2,
)
)
panels.append(
stat_panel(
4,
"Glue Jobs Stale (>36h)",
GLUE_STALE_COUNT,
{"h": 4, "w": 4, "x": 0, "y": 7},
unit="none",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
)
)
panels.append(
stat_panel(
5,
"Glue Jobs Missing Success",
GLUE_MISSING_COUNT,
{"h": 4, "w": 4, "x": 4, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
6,
"Glue Jobs Suspended",
GLUE_SUSPENDED_COUNT,
{"h": 4, "w": 4, "x": 8, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
7,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H_TOTAL,
{"h": 4, "w": 4, "x": 12, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
8,
"Ariadne Task Errors (24h)",
ARIADNE_TASK_ERRORS_24H_TOTAL,
{"h": 4, "w": 4, "x": 16, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
9,
"Ariadne Task Runs (1h)",
ARIADNE_TASK_RUNS_1H_TOTAL,
{"h": 4, "w": 4, "x": 20, "y": 7},
unit="none",
)
)
panels.append(
bargauge_panel(
10,
"Ariadne Schedule Last Error (hours ago)",
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 17},
unit="h",
instant=True,
legend="{{task}}",
thresholds=recent_error_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
11,
"Ariadne Schedule Last Success (hours ago)",
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 17},
unit="h",
instant=True,
legend="{{task}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
12,
"Glue Jobs Last Success (hours ago)",
GLUE_LAST_SUCCESS_RANGE_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 23},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
13,
"Glue Jobs Last Schedule (hours ago)",
GLUE_LAST_SCHEDULE_RANGE_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 23},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
14,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H,
{"h": 6, "w": 12, "x": 0, "y": 29},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
15,
"Ariadne Task Errors (30d)",
ARIADNE_TASK_ERRORS_30D,
{"h": 6, "w": 12, "x": 12, "y": 29},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
16,
"Ariadne Access Requests",
ARIADNE_ACCESS_REQUESTS,
{"h": 6, "w": 8, "x": 0, "y": 11},
unit="none",
instant=True,
legend="{{status}}",
)
)
panels.append(
stat_panel(
17,
"Ariadne CI Coverage (%)",
ARIADNE_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent",
decimals=1,
instant=True,
legend="{{branch}}",
)
)
panels.append(
table_panel(
18,
"Ariadne CI Tests (latest)",
ARIADNE_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11},
unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
instant=True,
)
)
return {
"uid": "atlas-jobs",
"title": "Atlas Jobs",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-7d", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "jobs", "glue"],
}
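
Each builder lands in the DASHBOARDS registry at the bottom of the file, keyed by its uid. A hedged sketch of a driver loop over that registry; render_configmap is a hypothetical stand-in for however the real generator wraps the dashboard JSON into a ConfigMap manifest, which may differ:

import json

def render_configmap(name: str, dashboard: dict) -> str:
    # Hypothetical helper: the real manifest layout may differ.
    body = json.dumps(dashboard, indent=2)
    indented = "\n".join("    " + line for line in body.splitlines())
    return (
        "apiVersion: v1\nkind: ConfigMap\n"
        f"metadata:\n  name: grafana-dashboard-{name}\n"
        f"data:\n  {name}.json: |\n{indented}\n"
    )

for uid, spec in DASHBOARDS.items():
    dashboard = spec["builder"]()
    assert dashboard["uid"] == uid  # keep registry key and dashboard uid in sync
    spec["configmap"].write_text(render_configmap(uid, dashboard))
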
def build_gpu_dashboard():
panels = []
gpu_scope = "$namespace_scope_gpu"
@ -2146,7 +2697,7 @@ def build_gpu_dashboard():
namespace_gpu_share_expr(gpu_scope),
{"h": 8, "w": 12, "x": 0, "y": 0},
links=namespace_scope_links("namespace_scope_gpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
)
)
panels.append(
@ -2165,7 +2716,7 @@ def build_gpu_dashboard():
timeseries_panel(
3,
"GPU Util by Node",
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
gpu_util_by_hostname(),
{"h": 8, "w": 12, "x": 0, "y": 8},
unit="percent",
legend="{{Hostname}}",
@ -2229,6 +2780,10 @@ DASHBOARDS = {
"builder": build_mail_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
},
"atlas-jobs": {
"builder": build_jobs_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
},
"atlas-gpu": {
"builder": build_gpu_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",

Some files were not shown because too many files have changed in this diff.