Compare commits

..

330 Commits

Author SHA1 Message Date
jenkins
4beb08f1cf scheduling: keep longhorn vault sync off storage nodes 2026-05-05 13:46:19 -03:00
jenkins
e2cbbd6963 scheduling: keep singleton apps off storage nodes 2026-05-05 13:37:04 -03:00
jenkins
c46764e80c recovery(atlas): stop post-outage control-plane churn 2026-05-05 10:42:28 -03:00
jenkins
b81053aaec ai(ollama): recover onto live jetson gpu pool 2026-05-05 06:42:15 -03:00
jenkins
9e659b790b recovery(post-outage): restore jellyfin and maintenance sync 2026-05-05 06:31:09 -03:00
jenkins
c07220253e maintenance(metis): run service on longhorn-ready workers 2026-05-05 06:19:15 -03:00
jenkins
39fb0e91e0 maintenance(metis): move runtime state to longhorn 2026-05-05 06:15:22 -03:00
jenkins
6243021ade maintenance(metis): recover on arm64 builders 2026-05-05 06:12:06 -03:00
4a6b54b4c3 logging: trim dated pod log rotations 2026-04-27 16:49:11 -03:00
6c816e9fad logging: trim constrained pod logs earlier 2026-04-27 16:42:02 -03:00
2b5c7ca10b logging: trim oversized rotated pod logs on constrained nodes 2026-04-27 16:31:57 -03:00
45b145667a longhorn: rerun settings ensure job 2026-04-27 16:16:51 -03:00
9fb8dd4839 stability: harden fluent-bit buffering and longhorn node-down recovery 2026-04-27 16:15:13 -03:00
flux-bot
6352e0d976 chore(maintenance): automated image update 2026-04-26 00:59:25 +00:00
flux-bot
d4ff5d482e chore(maintenance): automated image update 2026-04-26 00:59:05 +00:00
flux-bot
b303add71c chore(maintenance): automated image update 2026-04-26 00:57:30 +00:00
flux-bot
a42e61de61 chore(maintenance): automated image update 2026-04-26 00:55:05 +00:00
Codex
6eb0158c6c maintenance(metis): raise remote build timeout 2026-04-25 01:41:36 -03:00
Codex
0171ffad38 keycloak(metis): seed node intranet ips in vault 2026-04-24 22:18:58 -03:00
flux-bot
84934a6d1c chore(maintenance): automated image update 2026-04-24 21:39:36 +00:00
flux-bot
98a2ade86d chore(maintenance): automated image update 2026-04-24 21:39:18 +00:00
flux-bot
738a5184cb chore(maintenance): automated image update 2026-04-24 21:37:35 +00:00
flux-bot
488c2694e3 chore(maintenance): automated image update 2026-04-24 21:36:19 +00:00
flux-bot
015d99dc5f chore(maintenance): automated image update 2026-04-24 21:08:32 +00:00
flux-bot
b80745dc2d chore(maintenance): automated image update 2026-04-24 21:08:15 +00:00
jenkins
0fa1b38f95 recovery(metis): trim node vault password placeholders 2026-04-24 18:07:35 -03:00
flux-bot
49e714c88c chore(maintenance): automated image update 2026-04-24 21:07:32 +00:00
flux-bot
ff0b9762b1 chore(maintenance): automated image update 2026-04-24 21:05:15 +00:00
jenkins
ce36ff099b recovery(metis): rerun node password seeding job 2026-04-24 17:33:40 -03:00
jenkins
6c4a7dea29 recovery(metis): use atlas kv node secrets 2026-04-24 17:29:58 -03:00
jenkins
04a80c1168 recovery(metis): seed per-node vault password slots 2026-04-24 17:24:37 -03:00
flux-bot
8179bd85db chore(maintenance): automated image update 2026-04-24 20:19:26 +00:00
flux-bot
c08499b52d chore(maintenance): automated image update 2026-04-24 20:19:10 +00:00
flux-bot
eca9e494ad chore(maintenance): automated image update 2026-04-24 20:17:26 +00:00
flux-bot
ab0e68f9f3 chore(maintenance): automated image update 2026-04-24 20:15:10 +00:00
flux-bot
0566a47e35 chore(maintenance): automated image update 2026-04-24 17:50:13 +00:00
flux-bot
133597bfd0 chore(maintenance): automated image update 2026-04-24 17:49:55 +00:00
flux-bot
ccf318f977 chore(maintenance): automated image update 2026-04-24 17:48:12 +00:00
flux-bot
8affc052bf chore(maintenance): automated image update 2026-04-24 17:46:54 +00:00
flux-bot
0cf5043977 chore(maintenance): automated image update 2026-04-24 17:20:52 +00:00
flux-bot
f2ffc6c1ef chore(maintenance): automated image update 2026-04-24 17:19:09 +00:00
flux-bot
e7c770b10b chore(maintenance): automated image update 2026-04-24 17:17:52 +00:00
jenkins
0ac3c97f90 maintenance(metis): restore full helper image refs 2026-04-24 13:51:12 -03:00
flux-bot
3e5e37d65a chore(maintenance): automated image update 2026-04-24 16:11:02 +00:00
flux-bot
2acbcbff51 chore(maintenance): automated image update 2026-04-24 16:10:45 +00:00
flux-bot
70b382bc80 chore(maintenance): automated image update 2026-04-24 16:09:02 +00:00
flux-bot
d0191361d4 chore(maintenance): automated image update 2026-04-24 16:06:44 +00:00
flux-bot
59bb0bef78 chore(maintenance): automated image update 2026-04-24 15:56:37 +00:00
jenkins
4b456cf54a maintenance(metis): track arch-specific images 2026-04-24 12:55:47 -03:00
jenkins
91c6023d25 maintenance(metis): move ingress to recovery host 2026-04-24 10:51:09 -03:00
jenkins
85d15cd3e1 maintenance(metis): raise remote pod timeout for recovery builds 2026-04-24 00:01:43 -03:00
jenkins
c0a4cbf03e maintenance(metis): fix remote workspace permissions 2026-04-23 23:45:18 -03:00
jenkins
fad895efbb maintenance(metis): move build scratch to usb storage 2026-04-23 23:37:00 -03:00
jenkins
47b31ebcf4 monitoring(testing): collapse heavy drilldowns 2026-04-22 16:56:52 -03:00
jenkins
88d2225774 test(titan-iac): cover dashboard generator contract 2026-04-22 15:31:36 -03:00
jenkins
a1f6758b95 monitoring(grafana): refresh provisioned dashboards 2026-04-22 15:13:26 -03:00
jenkins
23146aaa8a monitoring(testing): clean canonical suite rows 2026-04-22 14:34:40 -03:00
jenkins
cc757ba082 ci(data-prepper): quote testcase metrics correctly 2026-04-22 13:28:35 -03:00
jenkins
c3c8b60671 ci(data-prepper): retrigger archive fix 2026-04-22 13:23:23 -03:00
jenkins
15792b1cf3 ci(data-prepper): archive junit without plugin dependency 2026-04-22 13:21:52 -03:00
jenkins
e75a5d5675 ci(data-prepper): keep validation labels portable 2026-04-22 13:13:56 -03:00
jenkins
4282810602 ci(data-prepper): retrigger quality publish 2026-04-22 13:07:37 -03:00
jenkins
8a58132dd4 ci(data-prepper): avoid xml parser in metrics publish 2026-04-22 13:04:47 -03:00
jenkins
be0d3e4300 ci(data-prepper): harden quality evidence helpers 2026-04-22 12:58:27 -03:00
jenkins
ba6848a67a ci(data-prepper): publish real testcase metrics 2026-04-22 12:48:36 -03:00
jenkins
23beb08e5e monitoring(testing): split quality trend panels 2026-04-22 12:42:33 -03:00
5d560d962d chore(metis): deploy scratch annotation sync 2026-04-22 04:28:08 -03:00
51ade59a46 fix(metis): keep sentinel rollouts moving on degraded nodes 2026-04-22 03:40:28 -03:00
7f91be27f9 chore(metis): deploy scratch sentinel fix 2026-04-22 03:33:54 -03:00
63cd159151 test(titan-iac): cover mailu sync scripts 2026-04-22 02:53:00 -03:00
443c70d01b monitoring(testing): promote atlas testing layout 2026-04-22 02:26:31 -03:00
flux-bot
9f0ea1683a chore(bstein-dev-home): automated image update 2026-04-22 05:01:25 +00:00
flux-bot
55df293e00 chore(bstein-dev-home): automated image update 2026-04-22 05:00:26 +00:00
3168ffe027 ci(titan-iac): feed coverage into sonar gate 2026-04-22 01:57:19 -03:00
abdefbbd05 ci(quality): enforce sonar and supply-chain gates 2026-04-22 01:29:54 -03:00
flux-bot
ead503d71e chore(bstein-dev-home): automated image update 2026-04-22 04:15:46 +00:00
flux-bot
f54bdf8483 chore(bstein-dev-home): automated image update 2026-04-22 04:14:49 +00:00
flux-bot
80cb4c257f chore(bstein-dev-home): automated image update 2026-04-22 04:06:45 +00:00
flux-bot
228e8a9772 chore(bstein-dev-home): automated image update 2026-04-22 04:05:50 +00:00
15c798b915 gitops(bstein-home): deploy current image tags on main 2026-04-22 00:53:06 -03:00
2ded2eb23d ci(titan-iac): apply supply-chain waiver ledger 2026-04-22 00:42:03 -03:00
flux-bot
e6bb015ef2 chore(maintenance): automated image update 2026-04-22 03:26:48 +00:00
flux-bot
ead7c276b4 chore(maintenance): automated image update 2026-04-22 03:11:42 +00:00
bfad9c19c5 deploy(bstein-home): target non-root frontend port 2026-04-22 00:01:50 -03:00
439a44bc85 ci(data-prepper): scan staged supply-chain inputs 2026-04-21 23:29:53 -03:00
flux-bot
13f179d842 chore(maintenance): automated image update 2026-04-22 02:09:28 +00:00
c0e5df30d5 ci(quality): use preloaded scanner image 2026-04-21 22:50:53 -03:00
flux-bot
79fbf2644b chore(maintenance): automated image update 2026-04-22 01:50:20 +00:00
0eca6adbbb ci(quality): pass sonar token as login 2026-04-21 22:17:55 -03:00
5801633b30 ci(quality): run sonar and supply-chain scans 2026-04-21 22:09:06 -03:00
fac139fd0e monitoring: rotate grafana dedupe job 2026-04-21 21:25:05 -03:00
jenkins
2df830f01b longhorn: bound settings sync curl calls and rerun job 2026-04-21 21:18:41 -03:00
flux-bot
26fab34de5 chore(maintenance): automated image update 2026-04-22 00:16:57 +00:00
jenkins
e29d0fe349 longhorn: rebalance replicas and cap rebuild pressure 2026-04-21 21:12:19 -03:00
jenkins
77f7620eca scheduling: de-prefer spillover nodes for non-longhorn services 2026-04-21 21:00:56 -03:00
fb0dd60954 jenkins: allow slow controller startup 2026-04-21 20:54:42 -03:00
jenkins
4401c26496 jenkins: de-prefer spillover longhorn nodes for controller and agents 2026-04-21 20:48:02 -03:00
9682a17a82 jenkins: avoid recursive volume ownership resets 2026-04-21 20:34:02 -03:00
55d87c0c14 ci(quality): bind sonarqube token credential in pipelines 2026-04-21 20:16:59 -03:00
379f20efc5 jenkins: prefer rpi5 without hard pin 2026-04-21 19:51:09 -03:00
7883593166 ci(jenkins): inject sonarqube token from vault 2026-04-21 19:43:08 -03:00
flux-bot
5509dd86d5 chore(maintenance): automated image update 2026-04-21 22:01:24 +00:00
06b27c9b9a ci(titan-iac): lower agent cpu request 2026-04-21 18:32:45 -03:00
flux-bot
a927affb1f chore(maintenance): automated image update 2026-04-21 21:22:18 +00:00
flux-bot
fab182e91e chore(maintenance): automated image update 2026-04-21 20:59:18 +00:00
d5be9e1ae9 ci(data-prepper): use mirrored base artifact 2026-04-21 16:56:25 -03:00
fb48d473d2 ci(data-prepper): report n/a coverage as complete 2026-04-21 16:32:42 -03:00
5e5cffbdc7 ci(data-prepper): allow arm64 worker scheduling 2026-04-21 15:33:42 -03:00
e1d804dbb0 ci(data-prepper): lower kaniko cpu request 2026-04-21 15:26:13 -03:00
flux-bot
2086427b72 chore(maintenance): automated image update 2026-04-21 17:56:42 +00:00
e811c0cabf ci(jenkins): require rpi5 controller placement 2026-04-21 14:12:14 -03:00
flux-bot
b68c002e2d chore(maintenance): automated image update 2026-04-21 17:05:21 +00:00
cb7e0238dc infra(ci): use harbor python utility images 2026-04-21 13:37:46 -03:00
flux-bot
043a2e75c8 chore(maintenance): automated image update 2026-04-21 16:30:12 +00:00
6ac375f82e ci(titan-iac): use harbor python runner 2026-04-21 13:18:31 -03:00
jenkins
8c1a26ead6 ci(titan-iac): use in-cluster victoria metrics dns 2026-04-21 12:30:06 -03:00
jenkins
d119f838e9 ci(titan-iac): harden quality metric publisher 2026-04-21 12:24:18 -03:00
jenkins
ae2356de6a monitoring(testing): render missing metric zero states 2026-04-21 11:46:15 -03:00
jenkins
c1ac36df17 monitoring(testing): link test metrics to build artifacts 2026-04-21 11:39:13 -03:00
jenkins
cc79f3ebcd ci(titan-iac): include primary branch in quality metrics 2026-04-21 11:08:59 -03:00
jenkins
1f991fc43d harbor: expand registry storage 2026-04-21 10:56:27 -03:00
jenkins
b62980b76d harbor: reduce vault injector bootstrap requests 2026-04-21 10:08:39 -03:00
jenkins
26da4945ea harbor: move registry bootstrap to titan-11 2026-04-21 09:55:29 -03:00
jenkins
d599a162a9 monitoring(testing): add branch evidence panels 2026-04-21 09:35:43 -03:00
jenkins
e53adc17b3 ci(data-prepper): archive full quality evidence 2026-04-21 09:24:09 -03:00
jenkins
7cd40d457d Merge remote-tracking branch 'origin/main' 2026-04-21 09:23:03 -03:00
flux-bot
d559d03bea chore(maintenance): automated image update 2026-04-21 06:32:37 +00:00
flux-bot
691dc3c71b chore(maintenance): automated image update 2026-04-21 06:27:29 +00:00
flux-bot
e81ecdd716 chore(maintenance): automated image update 2026-04-21 06:14:21 +00:00
flux-bot
74e385ad8b chore(maintenance): automated image update 2026-04-21 06:10:27 +00:00
flux-bot
fecd095717 chore(maintenance): automated image update 2026-04-21 06:03:10 +00:00
flux-bot
caa02806c0 chore(maintenance): automated image update 2026-04-21 06:00:02 +00:00
flux-bot
c6c6f90d26 chore(maintenance): automated image update 2026-04-21 05:54:02 +00:00
flux-bot
e4efb89466 chore(maintenance): automated image update 2026-04-21 05:52:01 +00:00
flux-bot
8584885ddd chore(maintenance): automated image update 2026-04-21 05:44:00 +00:00
flux-bot
6aeacaf872 chore(maintenance): automated image update 2026-04-21 05:42:00 +00:00
flux-bot
0146b92cc1 chore(maintenance): automated image update 2026-04-21 05:33:59 +00:00
flux-bot
981fca6cb4 chore(maintenance): automated image update 2026-04-21 05:26:59 +00:00
flux-bot
6dab28081d chore(maintenance): automated image update 2026-04-21 05:12:56 +00:00
flux-bot
6ebc475da2 chore(maintenance): automated image update 2026-04-21 05:05:56 +00:00
flux-bot
fff26ebacb chore(maintenance): automated image update 2026-04-21 04:57:54 +00:00
flux-bot
e3bebaa10b chore(maintenance): automated image update 2026-04-21 04:55:55 +00:00
flux-bot
df16f03e46 chore(maintenance): automated image update 2026-04-21 04:46:53 +00:00
flux-bot
b5243e8566 chore(maintenance): automated image update 2026-04-21 04:36:52 +00:00
flux-bot
4501bbf8f0 chore(maintenance): automated image update 2026-04-21 04:34:52 +00:00
flux-bot
5331d7149a chore(maintenance): automated image update 2026-04-21 04:24:51 +00:00
jenkins
c4b0389892 quality(titan-iac): widen enforced coverage contract 2026-04-20 21:39:53 -03:00
jenkins
387e104359 test(titan-iac): widen tracked quality coverage 2026-04-20 21:34:59 -03:00
jenkins
5ebc320843 ci(titan-iac): support direct script execution for metrics publish 2026-04-20 15:47:20 -03:00
jenkins
006f79658f ci(titan-iac): retrigger after titan-09 cordon 2026-04-20 15:36:51 -03:00
jenkins
9451bb9c61 test(titan-iac): raise quality gate coverage for quality runner 2026-04-20 15:29:46 -03:00
jenkins
655c26c589 quality(titan-iac): split metrics publisher and harden gate lint 2026-04-20 15:21:49 -03:00
jenkins
607d8c21fa monitoring(testing): fix missing-state queries and add test-case drilldowns 2026-04-20 13:45:01 -03:00
jenkins
b7f6cbd87c ci(titan-iac): enforce 30d build and artifact retention 2026-04-20 12:30:57 -03:00
jenkins
a07b49a05f monitoring(testing): fix atlas-jobs coverage and loc query expressions 2026-04-20 12:20:42 -03:00
jenkins
1d4227beec ci(data-prepper): add retention and archive quality artifacts 2026-04-20 10:55:13 -03:00
jenkins
57306201cf monitoring(testing): backfill placeholder test-case metrics across sparse suites 2026-04-20 09:13:34 -03:00
jenkins
7437ec5929 ci(titan-iac): emit placeholder test-case metric when junit has no cases 2026-04-20 09:10:04 -03:00
jenkins
710ec96990 test(titan-iac): update payload unit tests for per-test metric argument 2026-04-20 08:50:39 -03:00
jenkins
cb1c41c6ea ci(titan-iac): infer coverage/loc metrics from quality summary artifacts 2026-04-20 08:43:21 -03:00
jenkins
e8823197f8 monitoring(testing): align test selector with exported job label 2026-04-20 08:38:38 -03:00
jenkins
c5b1302ff6 monitoring(testing): add fallbacks for problematic-test trend queries 2026-04-20 08:37:26 -03:00
jenkins
f02db9801c monitoring(testing): add per-test metrics and flaky-test panels 2026-04-20 08:35:05 -03:00
jenkins
7d113291c9 monitoring(testing): split check trends into per-check success/failure panels 2026-04-20 08:07:30 -03:00
jenkins
47d5416dde ci(titan-iac): harden promote git workspace detection 2026-04-20 00:59:24 -03:00
codex
f2c4204bab monitoring(testing): fix suite all filter aliases and regex templating 2026-04-19 23:22:34 -03:00
codex
71cfdce862 jenkins: source streaming harbor creds from dedicated vault path 2026-04-19 23:02:30 -03:00
codex
d4112e5a74 ci(titan-iac): guard promote stage when workspace lacks .git 2026-04-19 22:58:58 -03:00
codex
6d2c72ff98 jenkins: keep streaming creds optional without vault hard dependency 2026-04-19 22:45:25 -03:00
codex
c8f7cd6ec2 jenkins(logging): split streaming harbor credentials 2026-04-19 22:40:56 -03:00
codex
bd85143aa0 jenkins: stop overriding push creds with harbor-pull secret 2026-04-19 22:36:18 -03:00
codex
cb992d1c53 maintenance(metis): raise remote timeout and improve progress 2026-04-19 22:34:16 -03:00
codex
7be6cfb9cb ci(titan-iac): install git in runner before promote stage 2026-04-19 22:33:22 -03:00
codex
b848e6b6d8 monitoring(dashboards): regenerate atlas-testing from generator 2026-04-19 22:29:20 -03:00
flux-bot
849bba8f5d chore(maintenance): automated image update 2026-04-20 01:19:35 +00:00
codex
86c492d8c1 ci: retrigger titan-iac after titan-18 cordon 2026-04-19 22:07:10 -03:00
codex
1ed8b7233d maintenance(metis): roll duplicate-build fix to 0.1.0-24 2026-04-19 22:03:04 -03:00
codex
ddabda06bf ci: fix data-prepper defaults and restore metrics publisher coverage 2026-04-19 21:57:40 -03:00
codex
881c724725 jenkins: revert sonar vault path injection blocking startup 2026-04-19 21:42:04 -03:00
codex
2db4952c39 jenkins(sonar): wire defaults and observe-mode toggles 2026-04-19 21:30:02 -03:00
codex
57432e01a3 maintenance(metis): export bastion ssh key for replacement readiness 2026-04-19 21:22:57 -03:00
codex
97bc0cea8c maintenance(metis): use inventory path available in remote runner pods 2026-04-19 21:18:30 -03:00
codex
e930aac039 ci(gate): enforce sonar and supply-chain checks across suites 2026-04-19 21:16:42 -03:00
flux-bot
13ec9b2d7d chore(maintenance): automated image update 2026-04-20 00:14:29 +00:00
d8f07c2b70 maintenance(metis): run vault-enabled metis service image 2026-04-19 21:14:19 -03:00
20a255252c maintenance(metis): add titan-16 replacement profile 2026-04-19 21:01:49 -03:00
376e68ec31 maintenance(metis): inject harbor creds into service runtime 2026-04-19 20:52:04 -03:00
flux-bot
7497f8d4e0 chore(maintenance): automated image update 2026-04-19 23:45:10 +00:00
b3270e7231 maintenance(metis): add titan-10 and titan-12 inventory profiles 2026-04-19 20:44:12 -03:00
1dce63fb9b monitoring(testing): render zero-state data for missing/sonar panels 2026-04-19 16:56:22 -03:00
96f3844677 quality(sonarqube): read exporter token from shared oidc vault path 2026-04-19 16:40:39 -03:00
65edbd9ed9 quality(sonarqube): inject exporter token from vault 2026-04-19 16:34:27 -03:00
29138b8a51 ci(metrics): publish canonical titan-iac gate checks 2026-04-19 16:29:07 -03:00
flux-bot
aede5aa899 chore(maintenance): automated image update 2026-04-19 19:19:49 +00:00
12293c9d11 test(ci): align publish_test_metrics unit tests with current API 2026-04-19 16:18:35 -03:00
2d0360be3b ci(metrics): use Pushgateway PUT for suite payload replacement 2026-04-19 16:10:20 -03:00
f9d7694f25 monitoring(testing): harden suite selector and success history query 2026-04-19 15:31:59 -03:00
9e3cc0f760 ci(jenkins): fix glue test VM URL and default SA observer RBAC 2026-04-19 15:06:13 -03:00
32410555cd monitoring: remove combined UPS draw series from history panels 2026-04-19 14:51:25 -03:00
347e7ccc84 monitoring: revert atlas overview dashboard to pre-quality changes 2026-04-19 14:43:41 -03:00
e47a877169 ci: resolve flux branch without Groovy dollar interpolation 2026-04-19 14:41:22 -03:00
592d037522 ci: fix titan-iac and data-prepper pipeline gate publishing 2026-04-19 14:33:26 -03:00
3ccc2a1100 quality: standardize suite checks and add SonarQube stack 2026-04-19 14:18:58 -03:00
9a20f4f854 monitoring(testing): redesign atlas testing dashboard and unify suite aliases 2026-04-18 17:47:06 -03:00
9a8c454123 tests(quality-gate): cover metrics publisher edge paths 2026-04-18 17:29:50 -03:00
flux-bot
e1f430455d chore(maintenance): automated image update 2026-04-18 19:36:24 +00:00
01fe20fe68 monitoring(metrics): normalize platform gate contract and pegasus suite name 2026-04-18 16:34:20 -03:00
2221a2d279 monitoring: alert on soteria backup job creation spikes 2026-04-17 01:09:25 -03:00
flux-bot
20305a7181 chore(maintenance): automated image update 2026-04-17 03:48:15 +00:00
10c813d583 maintenance(soteria): pause backup scheduler during backlog incident 2026-04-16 21:29:14 -03:00
1b041aa813 monitoring(dashboards): fix success-rate fallback expression 2026-04-16 20:02:26 -03:00
8f2b247b5f monitoring(dashboards): fallback idle panels to zero 2026-04-16 19:59:08 -03:00
1f3ce453fb maintenance(soteria): add startup probe and relax liveness 2026-04-16 19:54:07 -03:00
ff11f7ee65 monitoring(vm): raise kube-state-metrics scrape size cap 2026-04-16 19:47:56 -03:00
11d9c5eae3 monitoring(vm): avoid accelerator nodes for vmsingle 2026-04-16 19:39:35 -03:00
95dd0bbd56 monitoring(vm): auto-reload scrape config changes 2026-04-16 19:33:39 -03:00
72e7a39373 monitoring: fix grafana no-data scrape gaps 2026-04-16 19:30:31 -03:00
09d438e8b4 maintenance(titan-24): remove flux temp desktop automation 2026-04-15 22:58:37 -03:00
6752e4c0e5 maintenance(titan-24): keep helper retries armed 2026-04-15 22:50:41 -03:00
e7f3edb4bf maintenance(titan-24): tolerate unreachable helper jobs 2026-04-15 22:30:22 -03:00
c55d5ac3b5 maintenance(titan-24): add desktop helper and rootfs sweep 2026-04-15 22:25:11 -03:00
fb43b02b2a monitoring(soteria): tune PVC backup age thresholds for nightly cadence 2026-04-14 02:17:52 -03:00
55fa72d446 monitoring(overview): align enclosure fonts and shorten fan labels 2026-04-14 01:18:41 -03:00
496f7a12dd monitoring(overview): dedupe typhon series and map fans by port 2026-04-14 00:31:38 -03:00
6b75ae7dcc monitoring(overview): fix jenkins success/failure ranking with single-frame status labels 2026-04-13 23:13:45 -03:00
50a9bda808 typhon: register app and add v2-safe ble/control runtime toggles 2026-04-13 23:07:53 -03:00
c573012a7c monitoring(overview): globally sort jenkins rows across status frames 2026-04-13 23:03:38 -03:00
8ac428f816 monitoring(overview): derive jenkins top-6 in PromQL per panel 2026-04-13 22:38:40 -03:00
99e7dababd monitoring(overview): restore jenkins panel readability with top-6 stat rows 2026-04-13 22:13:08 -03:00
8db72c9475 monitoring(overview): replace jenkins tables with stat lists and fix links/colors 2026-04-13 22:07:24 -03:00
2db8e1423d monitoring(overview): fix jenkins row links, status color, and ordering 2026-04-13 20:58:09 -03:00
flux-bot
3e440ba7cd chore(maintenance): automated image update 2026-04-13 19:52:06 +00:00
e437f55d87 monitoring(overview): make jenkins success/failure panels scrollable lists 2026-04-13 16:24:19 -03:00
3bbd0a6f90 monitoring(jenkins): dedupe weather metrics and cap newest list rows 2026-04-13 14:29:44 -03:00
cf988e361b monitoring(overview): make jenkins success/failure lists readable 2026-04-13 14:25:19 -03:00
flux-bot
7f676fdc70 chore(maintenance): automated image update 2026-04-13 17:21:53 +00:00
flux-bot
f2830ce940 chore(maintenance): automated image update 2026-04-13 16:58:51 +00:00
a05a6a0e88 monitoring(overview): increase jenkins success/failure row legibility 2026-04-13 13:51:03 -03:00
30acfe39c4 maintenance(soteria): grant pod logs and roll out 0.1.0-32 2026-04-13 12:52:38 -03:00
flux-bot
ac62a43815 chore(maintenance): automated image update 2026-04-13 15:49:45 +00:00
4bcb1cc940 monitoring(overview): split jenkins weather into success/failure columns 2026-04-13 12:17:34 -03:00
d0abf9a70d monitoring: slightly reduce fan activity value font 2026-04-13 12:08:01 -03:00
flux-bot
69ab8805a9 chore(maintenance): automated image update 2026-04-13 15:06:41 +00:00
18666d5aec monitoring(jenkins): improve weather panel readability and layout 2026-04-13 11:52:40 -03:00
d847a731fb monitoring: increase ups current stat font size 2026-04-13 11:43:25 -03:00
9f9b00a6fb monitoring(jenkins): switch weather to single stat-list panel 2026-04-13 06:24:58 -03:00
28756ceda8 monitoring: align ups and climate cards to postgres two-stat pattern 2026-04-13 06:22:41 -03:00
56cca6df83 monitoring: rebuild split ups and climate cards from scratch 2026-04-13 06:12:29 -03:00
aa935984a8 monitoring: equalize split ups card heights and row spacing 2026-04-13 05:42:39 -03:00
a2172f56ec monitoring(overview): fix pvc backup health/age panel query 2026-04-13 05:33:28 -03:00
db701b89c2 monitoring(overview): add jenkins success and duration columns 2026-04-13 05:31:43 -03:00
ef352cbdc1 monitoring: prevent compact UPS card value clipping 2026-04-13 05:16:37 -03:00
f6b97ac82e monitoring: fix clipped values in compact split panel 2026-04-13 05:00:01 -03:00
0a28cf07c2 monitoring: force one-row value-only split panels 2026-04-13 04:51:03 -03:00
3dd0bc875d monitoring(jenkins): stop collapsing weather bars into one row 2026-04-13 04:32:13 -03:00
cf30f63fb4 typhon: schedule exporter on arm64 workers 2026-04-13 04:30:03 -03:00
2ae886ec74 monitoring: make split climate and ups panels value-only 2026-04-13 04:27:16 -03:00
4d10919ead monitoring(jenkins): render weather panels with exported job labels 2026-04-13 04:03:56 -03:00
c06ba41d0d monitoring: tighten split panel layout in overview 2026-04-13 03:53:06 -03:00
flux-bot
1ed1d6cf80 chore(maintenance): automated image update 2026-04-13 06:47:57 +00:00
f26d7afbbc monitoring: split climate and ups current panels 2026-04-13 03:35:50 -03:00
e5ffa94c1d maintenance(soteria): roll pvc-node pin fix and pod-read rbac 2026-04-13 03:31:57 -03:00
flux-bot
c2048fa594 chore(maintenance): automated image update 2026-04-13 06:30:55 +00:00
08cec8be77 maintenance(soteria): move restic vault path to shared scope 2026-04-13 03:00:57 -03:00
a6ff6122b0 maintenance(vault): roll sync pod after soteria secret mapping 2026-04-13 02:55:14 -03:00
0ffe1e1905 maintenance(jenkins): stabilize ariadne api token bootstrap 2026-04-13 02:55:10 -03:00
4e9b232a4f maintenance(soteria): source restic credentials from vault 2026-04-13 02:53:38 -03:00
b25422f1b4 maintenance(ariadne): restart to pick jenkins api creds 2026-04-13 02:45:29 -03:00
50c9852cff maintenance(jenkins): provision ariadne api user for weather collector 2026-04-13 02:41:20 -03:00
3d2f5c0778 monitoring(alerts): make soteria backup health rule driver-agnostic 2026-04-13 02:36:39 -03:00
flux-bot
206daf156a chore(maintenance): automated image update 2026-04-13 05:31:46 +00:00
f3e77ea994 Revert "monitoring(overview): recenter climate/ups cards and gate stale offline climate data"
This reverts commit 19d6ffcf2a4268fd414cbe5109aafd043d7bb514.
2026-04-13 02:26:09 -03:00
fbb4736d4a maintenance(soteria): roll pods after restic config switch 2026-04-13 02:24:05 -03:00
f02a782991 maintenance(soteria): enable restic encrypted backup mode 2026-04-13 02:23:01 -03:00
6f96f7b78f maintenance(soteria): fix duplicate b2 config keys 2026-04-13 02:21:25 -03:00
4fb0b371ff maintenance(soteria): switch to encrypted restic backups 2026-04-13 02:15:46 -03:00
flux-bot
4c671a5396 chore(maintenance): automated image update 2026-04-13 05:13:43 +00:00
flux-bot
3c675fd887 chore(maintenance): automated image update 2026-04-13 05:03:42 +00:00
2243072be2 maintenance(flux): update ariadne automation on main 2026-04-13 02:03:20 -03:00
19d6ffcf2a monitoring(overview): recenter climate/ups cards and gate stale offline climate data 2026-04-13 01:43:21 -03:00
53a20a8560 maintenance(soteria): avoid titan-10 scheduling 2026-04-13 01:16:59 -03:00
f1bb65cb73 monitoring(overview): center climate/ups cards and add UPS discharge risk coloring 2026-04-13 01:08:58 -03:00
0576de7a61 maintenance(soteria): roll snapshot-first backup fix image 2026-04-13 00:42:15 -03:00
c409c7ca80 monitoring(jobs): add jenkins build weather job list panels 2026-04-13 00:26:22 -03:00
f2aab54884 monitoring(overview): add fixed labels to canvas 2x2 stat cards 2026-04-13 00:21:56 -03:00
e6785f7db1 monitoring(overview): fix ups/climate 2x2 cards and dynamic climate axes 2026-04-13 00:18:06 -03:00
d514fb35e5 longhorn(core): restore b2 secret objects in vault sync 2026-04-12 23:54:35 -03:00
41a5add906 monitoring(climate): drop zero samples to unlock dynamic history scaling 2026-04-12 23:02:23 -03:00
00fe5e8a0f monitoring(testing): add coverage and code-smell infraction panels 2026-04-12 22:58:33 -03:00
3a148c63e4 monitoring(overview): rebalance climate row widths for current/history panels 2026-04-12 22:57:25 -03:00
f17fa41207 monitoring(overview): restore single-panel cards and dynamic climate axes 2026-04-12 22:53:46 -03:00
d642deb4f4 maintenance(soteria): fix prometheus scrape port to 8080 2026-04-12 22:36:51 -03:00
51e35b8643 monitoring(overview): stack ups current card into draw/runtime rows 2026-04-12 22:25:34 -03:00
e53933ece7 monitoring(overview): stack climate stats into explicit 2x2 rows 2026-04-12 22:19:37 -03:00
4efd28c956 Revert "monitoring(overview): force horizontal stat cards for climate/ups wrap"
This reverts commit 287c339aa0001c1daec161fd9fc73fbd4b267b48.
2026-04-12 22:14:59 -03:00
a1ab78b0c9 monitoring(grafana): mount and provision atlas-testing dashboard 2026-04-12 22:13:58 -03:00
287c339aa0 monitoring(overview): force horizontal stat cards for climate/ups wrap 2026-04-12 22:11:40 -03:00
dc1f1cbb7c monitoring(overview): split climate and ups stats into two-row query groups 2026-04-12 22:07:58 -03:00
4a10163b10 monitoring(overview): tune stat sizing for 2x2 climate/ups cards 2026-04-12 22:03:13 -03:00
f45217f98e monitoring(overview): simplify ups current card to draw/runtime 2026-04-12 21:36:42 -03:00
66da1b3aab monitoring(overview): shorten ups labels for readable stat rows 2026-04-12 21:32:48 -03:00
8d30fddd7d monitoring(overview): wrap ups and climate stats for narrow panels 2026-04-12 21:28:14 -03:00
a0f1149bbb monitoring(overview): restore readable two-row stats for ups and climate 2026-04-12 21:23:28 -03:00
d2672300a3 monitoring(jobs): switch cleanup stats to two-row layout 2026-04-12 20:38:52 -03:00
ed5a59f21d maintenance(soteria): set explicit b2 endpoint and bucket 2026-04-12 20:31:02 -03:00
66bd705971 monitoring: tune stat text sizing for climate and ups 2026-04-12 20:30:17 -03:00
4b78e67036 monitoring: use wide stat layout for ups and climate cards 2026-04-12 20:23:38 -03:00
3a4bdbd42f monitoring: switch ups/climate/fan stats to vertical orientation 2026-04-12 20:12:17 -03:00
e222344cd9 monitoring(jobs): add schedule fallback series for cold starts 2026-04-12 20:09:43 -03:00
a1257b65ff maintenance(ariadne): roll image to 0.1.0-103 for cleanup rollout 2026-04-12 20:06:03 -03:00
299a68ad95 monitoring(jobs): split testing dashboard and clean up job ops view 2026-04-12 20:06:03 -03:00
049a0deb04 maintenance(soteria): roll react ui image and wire b2 monitoring 2026-04-12 20:04:35 -03:00
7d3b12c774 monitoring: restore stat layout for ups/climate/fan rows 2026-04-12 19:56:12 -03:00
ac71b4621c monitoring: render ups/climate/fan panels as row tables 2026-04-12 19:46:39 -03:00
3271369e2d monitoring: set compact stat layout for climate and ups rows 2026-04-12 19:37:08 -03:00
931ee5944d monitoring: pack overview/power stats horizontally 2026-04-12 19:23:10 -03:00
08077f46c6 monitoring(atlas-power): force horizontal layout for stat rows 2026-04-12 19:06:07 -03:00
b9b9308500 maintenance(soteria): roll image to 0.1.0-22 for oauth2 headers 2026-04-12 18:55:09 -03:00
3096e0d7de monitoring(overview): tighten climate labels and drop duplicate temp line 2026-04-12 18:50:25 -03:00
9f5c9bfb86 maintenance(soteria): re-enable flux management for workload resources 2026-04-12 18:41:56 -03:00
6b0d6b017c monitoring(overview): tune climate row and restore ups card density 2026-04-12 18:35:15 -03:00
de3272e160 merge: atlas jobs ariadne schedule observability 2026-04-12 18:33:07 -03:00
8a413c0024 merge: lane2 jenkins cleanup activate 2026-04-12 18:33:00 -03:00
aa24e08744 merge: lane2 jenkins cleanup wiring 2026-04-12 18:32:48 -03:00
cb27592272 monitoring(overview): reflow UPS/climate rows and add jenkins weather 2026-04-12 18:14:54 -03:00
f67ca30f94 monitoring(climate): add C/F history and dedupe typhon series 2026-04-12 17:56:54 -03:00
4864939eef maintenance(ariadne): activate jenkins workspace cleanup deletes 2026-04-12 15:01:35 -03:00
01ecb75c5b scripts: default cleanup verifier to maintenance kustomization 2026-04-12 15:01:11 -03:00
fa30ea0ac2 scripts: add jenkins cleanup rollout verifier 2026-04-12 12:32:20 -03:00
2509d8876a maintenance(ariadne): default jenkins cleanup to safe dry-run 2026-04-12 12:32:20 -03:00
130 changed files with 22873 additions and 3933 deletions

324
Jenkinsfile vendored
View File

@ -7,14 +7,24 @@ pipeline {
apiVersion: v1
kind: Pod
spec:
serviceAccountName: "jenkins"
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: jnlp
image: jenkins/inbound-agent:3355.v388858a_47b_33-2-jdk21
resources:
requests:
cpu: "25m"
memory: "256Mi"
- name: python
image: python:3.12-slim
image: registry.bstein.dev/bstein/python:3.12-slim
command:
- cat
tty: true
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
command:
- cat
tty: true
@ -24,9 +34,21 @@ spec:
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
SUITE_NAME = 'titan-iac'
SUITE_NAME = 'titan_iac'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'titan_iac'
SONARQUBE_TOKEN = credentials('sonarqube-token')
VM_URL = 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428'
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '0'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
stages {
stage('Checkout') {
@ -36,7 +58,175 @@ spec:
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
sh '''
set -eu
if ! command -v git >/dev/null 2>&1; then
apt-get update
apt-get install -y --no-install-recommends git ca-certificates
rm -rf /var/lib/apt/lists/*
fi
pip install --no-cache-dir -r ci/requirements.txt
'''
}
}
stage('Prepare local quality evidence') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile local --build-dir build
local_quality_rc=$?
set -e
printf '%s\n' "${local_quality_rc}" > build/local-quality-gate.rc
'''
}
}
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**,services/monitoring/dashboards/**,services/monitoring/grafana-dashboard-*.yaml"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage-unit.xml ] && args+=("-Dsonar.python.coverage.reportPaths=build/coverage-unit.xml")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
sh '''
set -eu
mkdir -p build
python3 - <<'PY'
import base64
import json
import os
import time
import urllib.parse
import urllib.request
from pathlib import Path
host = os.getenv('SONARQUBE_HOST_URL', '').strip().rstrip('/')
project_key = os.getenv('SONARQUBE_PROJECT_KEY', '').strip()
token = os.getenv('SONARQUBE_TOKEN', '').strip()
report_path = os.getenv('QUALITY_GATE_SONARQUBE_REPORT', 'build/sonarqube-quality-gate.json')
payload = {
"status": "ERROR",
"note": "missing SONARQUBE_HOST_URL and/or SONARQUBE_PROJECT_KEY",
}
if host and project_key:
task_file = Path('.scannerwork/report-task.txt')
task_id = ''
if task_file.exists():
for line in task_file.read_text(encoding='utf-8').splitlines():
key, _, value = line.partition('=')
if key == 'ceTaskId':
task_id = value.strip()
break
if task_id:
ce_query = urllib.parse.urlencode({"id": task_id})
deadline = time.monotonic() + 180
while time.monotonic() < deadline:
ce_request = urllib.request.Request(f"{host}/api/ce/task?{ce_query}", method="GET")
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
ce_request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(ce_request, timeout=12) as response:
ce_payload = json.loads(response.read().decode("utf-8"))
except Exception:
time.sleep(3)
continue
status = str(ce_payload.get("task", {}).get("status", "")).upper()
if status in {"SUCCESS", "FAILED", "CANCELED"}:
break
time.sleep(3)
query = urllib.parse.urlencode({"projectKey": project_key})
request = urllib.request.Request(
f"{host}/api/qualitygates/project_status?{query}",
method="GET",
)
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(request, timeout=12) as response:
payload = json.loads(response.read().decode("utf-8"))
except Exception as exc: # noqa: BLE001
payload = {"status": "ERROR", "error": str(exc)}
with open(report_path, "w", encoding="utf-8") as handle:
json.dump(payload, handle, indent=2, sort_keys=True)
handle.write("\\n")
PY
'''
}
}
stage('Collect IronBank evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --skip-files clusters/atlas/flux-system/gotk-components.yaml --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
'''
}
sh '''
set -eu
mkdir -p build
if [ -s build/trivy-fs.json ]; then
python3 ci/scripts/supply_chain_report.py --trivy-json build/trivy-fs.json --waivers ci/titan-iac-trivy-waivers.json --output build/ironbank-compliance.json
exit 0
fi
python3 - <<'PY'
import json
import os
from pathlib import Path
report_path = Path(os.getenv('QUALITY_GATE_IRONBANK_REPORT', 'build/ironbank-compliance.json'))
if report_path.exists():
raise SystemExit(0)
status = os.getenv('IRONBANK_COMPLIANCE_STATUS', '').strip()
compliant = os.getenv('IRONBANK_COMPLIANT', '').strip().lower()
payload = {
"status": status or "unknown",
"compliant": compliant in {"1", "true", "yes", "on"} if compliant else None,
}
payload = {k: v for k, v in payload.items() if v is not None}
if "status" not in payload:
payload["status"] = "unknown"
payload["note"] = (
"Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT "
"or write build/ironbank-compliance.json in image-building repos."
)
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\\n", encoding="utf-8")
PY
'''
}
}
stage('Run quality gate') {
@ -66,8 +256,96 @@ spec:
stage('Enforce quality gate') {
steps {
sh '''
set -eu
test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
set -euo pipefail
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
echo "quality gate failed with rc=${gate_rc}" >&2
fail=1
fi
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
supply_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/ironbank-compliance.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
print("ok")
elif compliant is False:
print("failed")
else:
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
print(status or "missing")
PY
)"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
'''
}
}
@ -76,7 +354,7 @@ spec:
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: '''awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml'''
script: "grep -m1 '^\\s*branch:' clusters/atlas/flux-system/gotk-sync.yaml | sed 's/^\\s*branch:\\s*//'"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
@ -93,16 +371,28 @@ spec:
}
}
steps {
container('jnlp') {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set -euo pipefail
if ! command -v git >/dev/null 2>&1; then
if command -v apk >/dev/null 2>&1; then
apk add --no-cache git >/dev/null
elif command -v apt-get >/dev/null 2>&1; then
apt-get update >/dev/null
apt-get install -y git >/dev/null
fi
fi
cd "${WORKSPACE:-$PWD}"
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
echo "workspace is not a git checkout; skipping promote"
exit 0
fi
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}

View File

@ -6,14 +6,24 @@ pipeline {
apiVersion: v1
kind: Pod
spec:
serviceAccountName: "jenkins"
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: jnlp
image: jenkins/inbound-agent:3355.v388858a_47b_33-2-jdk21
resources:
requests:
cpu: "25m"
memory: "256Mi"
- name: python
image: python:3.12-slim
image: registry.bstein.dev/bstein/python:3.12-slim
command:
- cat
tty: true
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
command:
- cat
tty: true
@ -23,9 +33,21 @@ spec:
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
SUITE_NAME = 'titan-iac'
SUITE_NAME = 'titan_iac'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'titan_iac'
SONARQUBE_TOKEN = credentials('sonarqube-token')
VM_URL = 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428'
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '0'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
stages {
stage('Checkout') {
@ -35,7 +57,175 @@ spec:
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
sh '''
set -eu
if ! command -v git >/dev/null 2>&1; then
apt-get update
apt-get install -y --no-install-recommends git ca-certificates
rm -rf /var/lib/apt/lists/*
fi
pip install --no-cache-dir -r ci/requirements.txt
'''
}
}
stage('Prepare local quality evidence') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile local --build-dir build
local_quality_rc=$?
set -e
printf '%s\n' "${local_quality_rc}" > build/local-quality-gate.rc
'''
}
}
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**,services/monitoring/dashboards/**,services/monitoring/grafana-dashboard-*.yaml"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage-unit.xml ] && args+=("-Dsonar.python.coverage.reportPaths=build/coverage-unit.xml")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
sh '''
set -eu
mkdir -p build
python3 - <<'PY'
import base64
import json
import os
import time
import urllib.parse
import urllib.request
from pathlib import Path
host = os.getenv('SONARQUBE_HOST_URL', '').strip().rstrip('/')
project_key = os.getenv('SONARQUBE_PROJECT_KEY', '').strip()
token = os.getenv('SONARQUBE_TOKEN', '').strip()
report_path = os.getenv('QUALITY_GATE_SONARQUBE_REPORT', 'build/sonarqube-quality-gate.json')
payload = {
"status": "ERROR",
"note": "missing SONARQUBE_HOST_URL and/or SONARQUBE_PROJECT_KEY",
}
if host and project_key:
task_file = Path('.scannerwork/report-task.txt')
task_id = ''
if task_file.exists():
for line in task_file.read_text(encoding='utf-8').splitlines():
key, _, value = line.partition('=')
if key == 'ceTaskId':
task_id = value.strip()
break
if task_id:
ce_query = urllib.parse.urlencode({"id": task_id})
deadline = time.monotonic() + 180
while time.monotonic() < deadline:
ce_request = urllib.request.Request(f"{host}/api/ce/task?{ce_query}", method="GET")
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
ce_request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(ce_request, timeout=12) as response:
ce_payload = json.loads(response.read().decode("utf-8"))
except Exception:
time.sleep(3)
continue
status = str(ce_payload.get("task", {}).get("status", "")).upper()
if status in {"SUCCESS", "FAILED", "CANCELED"}:
break
time.sleep(3)
query = urllib.parse.urlencode({"projectKey": project_key})
request = urllib.request.Request(
f"{host}/api/qualitygates/project_status?{query}",
method="GET",
)
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(request, timeout=12) as response:
payload = json.loads(response.read().decode("utf-8"))
except Exception as exc: # noqa: BLE001
payload = {"status": "ERROR", "error": str(exc)}
with open(report_path, "w", encoding="utf-8") as handle:
json.dump(payload, handle, indent=2, sort_keys=True)
handle.write("\\n")
PY
'''
}
}
stage('Collect IronBank evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --skip-files clusters/atlas/flux-system/gotk-components.yaml --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
'''
}
sh '''
set -eu
mkdir -p build
if [ -s build/trivy-fs.json ]; then
python3 ci/scripts/supply_chain_report.py --trivy-json build/trivy-fs.json --waivers ci/titan-iac-trivy-waivers.json --output build/ironbank-compliance.json
exit 0
fi
python3 - <<'PY'
import json
import os
from pathlib import Path
report_path = Path(os.getenv('QUALITY_GATE_IRONBANK_REPORT', 'build/ironbank-compliance.json'))
if report_path.exists():
raise SystemExit(0)
status = os.getenv('IRONBANK_COMPLIANCE_STATUS', '').strip()
compliant = os.getenv('IRONBANK_COMPLIANT', '').strip().lower()
payload = {
"status": status or "unknown",
"compliant": compliant in {"1", "true", "yes", "on"} if compliant else None,
}
payload = {k: v for k, v in payload.items() if v is not None}
if "status" not in payload:
payload["status"] = "unknown"
payload["note"] = (
"Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT "
"or write build/ironbank-compliance.json in image-building repos."
)
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\\n", encoding="utf-8")
PY
'''
}
}
stage('Run quality gate') {
@ -65,8 +255,96 @@ spec:
stage('Enforce quality gate') {
steps {
sh '''
set -eu
test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
set -euo pipefail
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
echo "quality gate failed with rc=${gate_rc}" >&2
fail=1
fi
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
supply_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/ironbank-compliance.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
print("ok")
elif compliant is False:
print("failed")
else:
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
print(status or "missing")
PY
)"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
'''
}
}
@ -75,7 +353,7 @@ spec:
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: '''awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml'''
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
@ -92,16 +370,28 @@ spec:
}
}
steps {
container('jnlp') {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set -euo pipefail
if ! command -v git >/dev/null 2>&1; then
if command -v apk >/dev/null 2>&1; then
apk add --no-cache git >/dev/null
elif command -v apt-get >/dev/null 2>&1; then
apt-get update >/dev/null
apt-get install -y git >/dev/null
fi
fi
cd "${WORKSPACE:-$PWD}"
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
echo "workspace is not a git checkout; skipping promote"
exit 0
fi
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}

View File

@ -6,30 +6,50 @@ from __future__ import annotations
import json
import os
from glob import glob
from pathlib import Path
import sys
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from ci.scripts import publish_test_metrics_quality as _quality_helpers
CANONICAL_CHECKS = _quality_helpers.CANONICAL_CHECKS
_build_check_statuses = _quality_helpers._build_check_statuses
_combine_statuses = _quality_helpers._combine_statuses
_infer_sonarqube_status = _quality_helpers._infer_sonarqube_status
_infer_source_lines_over_500 = _quality_helpers._infer_source_lines_over_500
_infer_supply_chain_status = _quality_helpers._infer_supply_chain_status
_infer_workspace_coverage_percent = _quality_helpers._infer_workspace_coverage_percent
_load_optional_json = _quality_helpers._load_optional_json
_normalize_result_status = _quality_helpers._normalize_result_status
def _escape_label(value: str) -> str:
"""Escape a Prometheus label value without changing its content."""
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
def _label_str(labels: dict[str, str]) -> str:
"""Render a stable Prometheus label set from a mapping."""
parts = [f'{key}="{_escape_label(val)}"' for key, val in labels.items() if val]
return "{" + ",".join(parts) + "}" if parts else ""
def _read_text(url: str) -> str:
"""Fetch a plain-text response body from the given URL."""
with urllib.request.urlopen(url, timeout=10) as response:
return response.read().decode("utf-8")
def _post_text(url: str, payload: str) -> None:
"""PUT a plain-text payload and fail on any 4xx/5xx response."""
request = urllib.request.Request(
url,
data=payload.encode("utf-8"),
method="POST",
method="PUT",
headers={"Content-Type": "text/plain"},
)
with urllib.request.urlopen(request, timeout=10) as response:
@ -38,6 +58,7 @@ def _post_text(url: str, payload: str) -> None:
def _parse_junit(path: str) -> dict[str, int]:
"""Parse a JUnit XML file into aggregate test counters."""
if not os.path.exists(path):
return {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
@ -64,6 +85,7 @@ def _parse_junit(path: str) -> dict[str, int]:
def _collect_junit_totals(pattern: str) -> dict[str, int]:
"""Sum JUnit counters across every XML file matching the pattern."""
totals = {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
for path in sorted(glob(pattern)):
parsed = _parse_junit(path)
@ -72,7 +94,38 @@ def _collect_junit_totals(pattern: str) -> dict[str, int]:
return totals
def _collect_junit_cases(pattern: str) -> list[tuple[str, str]]:
"""Collect individual JUnit test-case statuses for flaky-test trend panels."""
cases: list[tuple[str, str]] = []
for path in sorted(glob(pattern)):
if not os.path.exists(path):
continue
root = ET.parse(path).getroot()
suites: list[ET.Element]
if root.tag == "testsuite":
suites = [root]
elif root.tag == "testsuites":
suites = [elem for elem in root if elem.tag == "testsuite"]
else:
suites = []
for suite in suites:
for test_case in suite.findall("testcase"):
case_name = test_case.attrib.get("name", "").strip()
class_name = test_case.attrib.get("classname", "").strip()
if not case_name:
continue
full_name = f"{class_name}.{case_name}" if class_name else case_name
status = "passed"
if test_case.find("failure") is not None or test_case.find("error") is not None:
status = "failed"
elif test_case.find("skipped") is not None:
status = "skipped"
cases.append((full_name, status))
return cases
def _read_exit_code(path: str) -> int:
"""Read the quality-gate exit code, defaulting to failure if missing."""
try:
with open(path, "r", encoding="utf-8") as handle:
return int(handle.read().strip())
@ -81,6 +134,7 @@ def _read_exit_code(path: str) -> int:
def _load_summary(path: str) -> dict:
"""Load the JSON quality-gate summary, returning an empty mapping on error."""
try:
with open(path, "r", encoding="utf-8") as handle:
return json.load(handle)
@ -88,7 +142,26 @@ def _load_summary(path: str) -> dict:
return {}
def _summary_float(summary: dict, key: str) -> float:
"""Extract a float-like value from the summary, defaulting to 0.0."""
value = summary.get(key)
if isinstance(value, (int, float)):
return float(value)
return 0.0
def _summary_int(summary: dict, key: str) -> int:
"""Extract an int-like value from the summary, defaulting to 0."""
value = summary.get(key)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
return 0
def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str, str]) -> float:
"""Return the current counter value for a labeled metric if present."""
text = _read_text(f"{pushgateway_url.rstrip('/')}/metrics")
for line in text.splitlines():
if not line.startswith(metric + "{"):
@ -109,20 +182,33 @@ def _build_payload(
suite: str,
status: str,
tests: dict[str, int],
test_cases: list[tuple[str, str]],
ok_count: int,
failed_count: int,
branch: str,
build_number: str,
jenkins_job: str,
summary: dict | None = None,
workspace_line_coverage_percent: float = 0.0,
source_lines_over_500: int = 0,
check_statuses: dict[str, str] | None = None,
) -> str:
"""Build the Pushgateway payload for the current suite run."""
passed = max(tests["tests"] - tests["failures"] - tests["errors"] - tests["skipped"], 0)
build_labels = _label_str(
{
"suite": suite,
"branch": branch or "unknown",
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job or suite,
}
)
test_case_base_labels = {
"suite": suite,
"branch": branch or "unknown",
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job or suite,
}
lines = [
"# TYPE platform_quality_gate_runs_total counter",
f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
@ -135,37 +221,78 @@ def _build_payload(
"# TYPE titan_iac_quality_gate_run_status gauge",
f'titan_iac_quality_gate_run_status{{suite="{suite}",status="ok"}} {1 if status == "ok" else 0}',
f'titan_iac_quality_gate_run_status{{suite="{suite}",status="failed"}} {1 if status == "failed" else 0}',
"# TYPE platform_quality_gate_build_info gauge",
f"platform_quality_gate_build_info{build_labels} 1",
"# TYPE titan_iac_quality_gate_build_info gauge",
f"titan_iac_quality_gate_build_info{build_labels} 1",
"# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {workspace_line_coverage_percent:.3f}',
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
]
results = summary.get("results", []) if isinstance(summary, dict) else []
if results:
if check_statuses:
lines.append("# TYPE titan_iac_quality_gate_checks_total gauge")
for result in results:
check_name = result.get("name")
check_status = result.get("status")
if not check_name or not check_status:
continue
for check_name in CANONICAL_CHECKS:
check_status = check_statuses.get(check_name, "not_applicable")
lines.append(
f'titan_iac_quality_gate_checks_total{{suite="{suite}",check="{_escape_label(str(check_name))}",result="{_escape_label(str(check_status))}"}} 1'
f'titan_iac_quality_gate_checks_total{{suite="{suite}",check="{_escape_label(check_name)}",result="{_escape_label(check_status)}"}} 1'
)
lines.append("# TYPE platform_quality_gate_test_case_result gauge")
if test_cases:
for test_name, test_status in test_cases:
labels = {
**test_case_base_labels,
"test": test_name,
"status": test_status,
}
lines.append(
f"platform_quality_gate_test_case_result{_label_str(labels)} 1"
)
else:
labels = {**test_case_base_labels, "test": "__no_test_cases__", "status": "skipped"}
lines.append(
f"platform_quality_gate_test_case_result{_label_str(labels)} 1"
)
return "\n".join(lines) + "\n"
def main() -> int:
suite = os.getenv("SUITE_NAME", "titan-iac")
"""Publish the quality-gate metrics and print a compact run summary."""
suite = os.getenv("SUITE_NAME", "titan_iac")
pushgateway_url = os.getenv("PUSHGATEWAY_URL", "http://platform-quality-gateway.monitoring.svc.cluster.local:9091")
job_name = os.getenv("QUALITY_GATE_JOB_NAME", "platform-quality-ci")
junit_glob = os.getenv("JUNIT_GLOB", os.getenv("JUNIT_PATH", "build/junit-*.xml"))
exit_code_path = os.getenv("QUALITY_GATE_EXIT_CODE_PATH", os.getenv("GLUE_EXIT_CODE_PATH", "build/quality-gate.rc"))
summary_path = os.getenv("QUALITY_GATE_SUMMARY_PATH", "build/quality-gate-summary.json")
branch = os.getenv("BRANCH_NAME", os.getenv("GIT_BRANCH", ""))
branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
if branch.startswith("origin/"):
branch = branch[len("origin/") :]
build_number = os.getenv("BUILD_NUMBER", "")
jenkins_job = os.getenv("JOB_NAME", "titan-iac")
tests = _collect_junit_totals(junit_glob)
test_cases = _collect_junit_cases(junit_glob)
exit_code = _read_exit_code(exit_code_path)
status = "ok" if exit_code == 0 else "failed"
summary = _load_summary(summary_path)
workspace_line_coverage_percent = _summary_float(summary, "workspace_line_coverage_percent")
if workspace_line_coverage_percent <= 0:
workspace_line_coverage_percent = _infer_workspace_coverage_percent(summary, "build/coverage-unit.xml")
source_lines_over_500 = _summary_int(summary, "source_lines_over_500")
if source_lines_over_500 <= 0:
source_lines_over_500 = _infer_source_lines_over_500(summary)
sonarqube_report = _load_optional_json(os.getenv("QUALITY_GATE_SONARQUBE_REPORT", "build/sonarqube-quality-gate.json"))
supply_chain_report = _load_optional_json(os.getenv("QUALITY_GATE_IRONBANK_REPORT", "build/ironbank-compliance.json"))
supply_chain_required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
check_statuses = _build_check_statuses(
summary=summary,
tests=tests,
workspace_line_coverage_percent=workspace_line_coverage_percent,
source_lines_over_500=source_lines_over_500,
sonarqube_report=sonarqube_report,
supply_chain_report=supply_chain_report,
supply_chain_required=supply_chain_required,
)
ok_count = int(
_fetch_existing_counter(
@ -190,11 +317,16 @@ def main() -> int:
suite=suite,
status=status,
tests=tests,
test_cases=test_cases,
ok_count=ok_count,
failed_count=failed_count,
branch=branch,
build_number=build_number,
jenkins_job=jenkins_job,
summary=summary,
workspace_line_coverage_percent=workspace_line_coverage_percent,
source_lines_over_500=source_lines_over_500,
check_statuses=check_statuses,
)
push_url = f"{pushgateway_url.rstrip('/')}/metrics/job/{job_name}/suite/{suite}"
_post_text(push_url, payload)
@ -208,11 +340,13 @@ def main() -> int:
"tests_skipped": tests["skipped"],
"ok_count": ok_count,
"failed_count": failed_count,
"checks_recorded": len(summary.get("results", [])) if isinstance(summary, dict) else 0,
"checks_recorded": len(check_statuses),
"workspace_line_coverage_percent": workspace_line_coverage_percent,
"source_lines_over_500": source_lines_over_500,
}
print(json.dumps(summary, sort_keys=True))
return 0
if __name__ == "__main__":
if __name__ == "__main__": # pragma: no cover
raise SystemExit(main())

View File

@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""Quality/status helpers for publish_test_metrics."""
from __future__ import annotations
import json
from pathlib import Path
import xml.etree.ElementTree as ET
# Alias sets used to bucket free-form check status text into the three
# canonical results ('ok' / 'not_applicable' / 'failed').
SUCCESS_STATUSES = {"ok", "pass", "passed", "success", "compliant"}
NOT_APPLICABLE_STATUSES = {"not_applicable", "n/a", "na", "none", "skipped"}
FAILED_STATUSES = {"failed", "fail", "error", "errors", "warn", "warning", "red"}
# Canonical quality-gate check names published for dashboarding.
CANONICAL_CHECKS = [
    "tests",
    "coverage",
    "loc",
    "docs_naming",
    "gate_glue",
    "sonarqube",
    "supply_chain",
]
def _infer_workspace_coverage_percent(summary: dict, default_xml: str) -> float:
"""Infer workspace line coverage from quality summary coverage XML metadata."""
results = summary.get("results", []) if isinstance(summary, dict) else []
coverage_xml = default_xml
for result in results:
if not isinstance(result, dict):
continue
if str(result.get("name") or "").strip().lower() != "coverage":
continue
candidate = str(result.get("coverage_xml") or "").strip()
if candidate:
coverage_xml = candidate
break
xml_path = Path(coverage_xml)
if not xml_path.exists():
return 0.0
try:
root = ET.parse(xml_path).getroot()
line_rate = root.attrib.get("line-rate")
if line_rate is None:
return 0.0
return float(line_rate) * 100.0
except (ET.ParseError, OSError, ValueError):
return 0.0
def _infer_source_lines_over_500(summary: dict) -> int:
"""Infer over-limit source file count from hygiene issue payloads."""
results = summary.get("results", []) if isinstance(summary, dict) else []
for result in results:
if not isinstance(result, dict):
continue
if str(result.get("name") or "").strip().lower() not in {"hygiene", "loc", "smell"}:
continue
issues = result.get("issues")
if not isinstance(issues, list):
continue
return sum(1 for item in issues if isinstance(item, str) and item.startswith("file exceeds"))
return 0
def _normalize_result_status(value: str | None, default: str = "failed") -> str:
"""Map arbitrary check status text into canonical check result buckets."""
if not value:
return default
normalized = value.strip().lower()
if normalized in SUCCESS_STATUSES:
return "ok"
if normalized in NOT_APPLICABLE_STATUSES:
return "not_applicable"
if normalized in FAILED_STATUSES:
return "failed"
return default
def _load_optional_json(path: str | None) -> dict:
"""Load an optional JSON report file, returning an empty object when absent."""
if not path:
return {}
candidate = Path(path)
if not candidate.exists():
return {}
try:
return json.loads(candidate.read_text(encoding="utf-8"))
except json.JSONDecodeError:
return {}
def _combine_statuses(statuses: list[str]) -> str:
"""Roll up many check statuses into one canonical result."""
if not statuses:
return "not_applicable"
if any(status == "failed" for status in statuses):
return "failed"
if all(status == "not_applicable" for status in statuses):
return "not_applicable"
if all(status in {"ok", "not_applicable"} for status in statuses):
return "ok"
return "failed"
def _infer_sonarqube_status(report: dict) -> str:
    """Translate a SonarQube quality-gate report into a canonical check status.

    Probes the usual payload shapes in order (``projectStatus.status``, then
    ``qualityGate.status``, then top-level ``status``); an empty report maps
    to 'not_applicable', anything unrecognized to 'failed'.
    """
    if not report:
        return "not_applicable"
    raw = (
        report.get("projectStatus", {}).get("status")
        or report.get("qualityGate", {}).get("status")
        or report.get("status")
    )
    if raw is None:
        return _normalize_result_status(None, default="failed")
    return _normalize_result_status(str(raw), default="failed")
def _infer_supply_chain_status(report: dict, required: bool) -> str:
    """Translate a supply-chain (IronBank/artifact) report into a canonical status.

    When the check is ``required``, a missing report — or one whose status is
    not applicable — counts as a failure instead of being skipped. A boolean
    ``compliant`` field, when present, short-circuits everything else.
    """
    absent = "failed" if required else "not_applicable"
    if not report:
        return absent
    compliant = report.get("compliant")
    if isinstance(compliant, bool):
        return "ok" if compliant else "failed"
    status = report.get("status")
    if status is None:
        return absent
    normalized = _normalize_result_status(str(status), default="failed")
    if required and normalized == "not_applicable":
        return "failed"
    return normalized
def _build_check_statuses(
    summary: dict | None,
    tests: dict[str, int],
    workspace_line_coverage_percent: float,
    source_lines_over_500: int,
    sonarqube_report: dict,
    supply_chain_report: dict,
    supply_chain_required: bool,
) -> dict[str, str]:
    """Generate the canonical quality-check status map for dashboarding.

    Args:
        summary: quality-gate summary; its ``results`` entries carry per-check
            names and statuses when present.
        tests: JUnit totals with at least ``tests``, ``failures``, ``errors``.
        workspace_line_coverage_percent: measured or inferred line coverage.
        source_lines_over_500: count of over-limit source files.
        sonarqube_report: optional SonarQube quality-gate JSON payload.
        supply_chain_report: optional supply-chain compliance JSON payload.
        supply_chain_required: when true, a missing supply-chain report fails.

    Returns:
        Mapping of each canonical check name to 'ok'/'failed'/'not_applicable'.
    """
    raw_results = summary.get("results", []) if isinstance(summary, dict) else []
    # Index every named result by its lowercased name for the fallbacks below.
    status_by_name: dict[str, str] = {}
    for result in raw_results:
        if not isinstance(result, dict):
            continue
        check_name = str(result.get("name") or "").strip().lower()
        if not check_name:
            continue
        status_by_name[check_name] = _normalize_result_status(result.get("status"), default="failed")
    # Tests: explicit "tests" result wins, then per-suite results, then raw
    # JUnit totals; no evidence at all means not applicable.
    tests_status = status_by_name.get("tests")
    if not tests_status:
        candidate_keys = ["unit", "integration", "e2e", "pytest", "test", "tests"]
        candidates = [status_by_name[key] for key in candidate_keys if key in status_by_name]
        if candidates:
            tests_status = _combine_statuses(candidates)
        elif tests["tests"] > 0:
            tests_status = "ok" if (tests["failures"] + tests["errors"]) == 0 else "failed"
        else:
            tests_status = "not_applicable"
    # Coverage: fall back to a 95% threshold on the inferred percentage.
    coverage_status = status_by_name.get("coverage")
    if not coverage_status:
        if workspace_line_coverage_percent > 0:
            coverage_status = "ok" if workspace_line_coverage_percent >= 95.0 else "failed"
        else:
            coverage_status = "not_applicable"
    # LOC: any file over the 500-line limit fails the check.
    loc_status = status_by_name.get("loc")
    if not loc_status:
        loc_status = "ok" if source_lines_over_500 == 0 else "failed"
    # Docs/naming and gate-glue: roll up whichever related results exist.
    docs_naming_status = status_by_name.get("docs_naming")
    if not docs_naming_status:
        candidates = [status_by_name[key] for key in ["docs", "hygiene", "smell", "lint", "naming"] if key in status_by_name]
        docs_naming_status = _combine_statuses(candidates) if candidates else "not_applicable"
    gate_glue_status = status_by_name.get("gate_glue")
    if not gate_glue_status:
        candidates = [status_by_name[key] for key in ["gate_glue", "glue", "gate"] if key in status_by_name]
        gate_glue_status = _combine_statuses(candidates) if candidates else "not_applicable"
    # External reports only matter when no explicit summary result overrides them.
    sonarqube_status = status_by_name.get("sonarqube") or _infer_sonarqube_status(sonarqube_report)
    supply_chain_status = status_by_name.get("supply_chain") or _infer_supply_chain_status(
        supply_chain_report,
        required=supply_chain_required,
    )
    return {
        "tests": tests_status,
        "coverage": coverage_status,
        "loc": loc_status,
        "docs_naming": docs_naming_status,
        "gate_glue": gate_glue_status,
        "sonarqube": sonarqube_status,
        "supply_chain": supply_chain_status,
    }

View File

@ -0,0 +1,173 @@
"""Build a titan-iac supply-chain compliance report from Trivy evidence."""
from __future__ import annotations
import argparse
import datetime as dt
import json
from pathlib import Path
from typing import Any
# Trivy severities whose failed misconfigurations count against the gate.
FAIL_SEVERITIES = {"HIGH", "CRITICAL"}
def _read_json(path: Path) -> dict[str, Any]:
"""Read a JSON object from disk for use as pipeline evidence."""
payload = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(payload, dict):
raise ValueError(f"{path} must contain a JSON object")
return payload
def _parse_day(raw: str | None) -> dt.date | None:
"""Parse an ISO day while letting optional waiver dates stay optional."""
if not raw:
return None
return dt.date.fromisoformat(raw)
def _today(override: str | None = None) -> dt.date:
"""Return the policy day so tests can pin expiry behavior."""
return _parse_day(override) or dt.date.today()
def _load_waiver_pairs(path: Path | None, policy_day: dt.date) -> tuple[set[tuple[str, str]], int]:
    """Return active ``(misconfiguration id, target)`` waivers and expired count.

    Entries without their own ``expires_at`` inherit the file's
    ``default_expires_at``; a waiver whose expiry precedes ``policy_day``
    counts as expired instead of active. A missing file yields no waivers.
    """
    if path is None or not path.exists():
        return set(), 0
    payload = _read_json(path)
    default_expires_at = payload.get("default_expires_at")
    active: set[tuple[str, str]] = set()
    expired = 0
    for entry in payload.get("misconfigurations", []):
        if not isinstance(entry, dict):
            continue
        misconfiguration_id = str(entry.get("id") or "").strip()
        if not misconfiguration_id:
            continue
        # Per-entry expiry wins over the default; empty values parse to None,
        # which means the waiver never expires.
        expires_at = _parse_day(str(entry.get("expires_at") or default_expires_at or ""))
        targets = entry.get("targets", [])
        if not isinstance(targets, list):
            continue
        if expires_at and expires_at < policy_day:
            expired += len(targets)
            continue
        # Waivers are target-specific so a new unsafe manifest fails until it is
        # either fixed or deliberately accepted with a fresh expiration.
        for target in targets:
            if isinstance(target, str) and target:
                active.add((misconfiguration_id, target))
    return active, expired
def _iter_failed_misconfigurations(payload: dict[str, Any]):
    """Yield ``(target, record)`` for each failed high/critical misconfiguration.

    Walks every result in the Trivy report and filters to records whose
    ``Status`` is ``FAIL`` and whose severity is in ``FAIL_SEVERITIES``.
    """
    for result in payload.get("Results", []):
        if not isinstance(result, dict):
            continue
        target = str(result.get("Target") or "")
        for record in result.get("Misconfigurations") or []:
            if not isinstance(record, dict):
                continue
            if record.get("Status") != "FAIL":
                continue
            severity = str(record.get("Severity") or "").upper()
            if severity in FAIL_SEVERITIES:
                yield target, record
def _count_vulnerabilities(payload: dict[str, Any], severity: str) -> int:
"""Count Trivy vulnerabilities at a specific severity."""
count = 0
for result in payload.get("Results", []):
if not isinstance(result, dict):
continue
for item in result.get("Vulnerabilities") or []:
if isinstance(item, dict) and str(item.get("Severity") or "").upper() == severity:
count += 1
return count
def _count_secrets(payload: dict[str, Any]) -> int:
"""Count detected secrets in the Trivy filesystem report."""
count = 0
for result in payload.get("Results", []):
if isinstance(result, dict):
count += len(result.get("Secrets") or [])
return count
def build_report(
    trivy_payload: dict[str, Any],
    waiver_path: Path | None = None,
    today_override: str | None = None,
) -> dict[str, Any]:
    """Build the compliance summary consumed by the quality gate.

    Args:
        trivy_payload: parsed Trivy filesystem-scan JSON report.
        waiver_path: optional waiver file of accepted misconfigurations.
        today_override: ISO date used as "today" so tests can pin expiry.

    Returns:
        Summary dict; ``status`` is 'ok' only when there are no critical
        vulnerabilities, no secrets, and no unwaived misconfigurations.
    """
    policy_day = _today(today_override)
    active_waivers, expired_waivers = _load_waiver_pairs(waiver_path, policy_day)
    open_misconfigs: list[dict[str, str]] = []
    waived_misconfigs = 0
    for target, item in _iter_failed_misconfigurations(trivy_payload):
        misconfiguration_id = str(item.get("ID") or "")
        # A waiver only suppresses the exact (id, target) pair it names.
        if (misconfiguration_id, target) in active_waivers:
            waived_misconfigs += 1
            continue
        open_misconfigs.append(
            {
                "id": misconfiguration_id,
                "target": target,
                "severity": str(item.get("Severity") or ""),
                "title": str(item.get("Title") or ""),
            }
        )
    critical = _count_vulnerabilities(trivy_payload, "CRITICAL")
    high = _count_vulnerabilities(trivy_payload, "HIGH")
    secrets = _count_secrets(trivy_payload)
    # High-severity vulnerabilities are reported but do not fail the gate
    # (hence "high_vulnerability_policy": "observe" below).
    status = "ok" if critical == 0 and secrets == 0 and not open_misconfigs else "failed"
    return {
        "status": status,
        "compliant": status == "ok",
        "category": "artifact_security",
        "scan_type": "filesystem",
        "scanner": "trivy",
        "critical_vulnerabilities": critical,
        "high_vulnerabilities": high,
        "high_vulnerability_policy": "observe",
        "secrets": secrets,
        "high_or_critical_misconfigurations": len(open_misconfigs),
        "waived_misconfigurations": waived_misconfigs,
        "expired_waivers": expired_waivers,
        # Examples are capped so the report stays a readable size.
        "waiver_file": str(waiver_path) if waiver_path else "",
        "open_misconfiguration_examples": open_misconfigs[:20],
    }
def main(argv: list[str] | None = None) -> int:
    """CLI entrypoint used by Jenkins after the Trivy scan completes.

    Reads the Trivy JSON (``--trivy-json``), applies optional waivers
    (``--waivers``) as of ``--today``, and writes the compliance report to
    ``--output``, creating parent directories as needed. Always returns 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--trivy-json", required=True)
    parser.add_argument("--waivers")
    parser.add_argument("--output", required=True)
    parser.add_argument("--today")
    args = parser.parse_args(argv)
    trivy_payload = _read_json(Path(args.trivy_json))
    waiver_path = Path(args.waivers) if args.waivers else None
    report = build_report(trivy_payload, waiver_path=waiver_path, today_override=args.today)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(report, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return 0


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())

View File

@ -0,0 +1,108 @@
"""Glue checks for Ariadne schedules exported to VictoriaMetrics."""
from __future__ import annotations
import os
from datetime import datetime, timezone
from pathlib import Path
import requests
import yaml
# The glue-check expectations live in a config.yaml next to this module.
CONFIG_PATH = Path(__file__).with_name("config.yaml")


def _load_config() -> dict:
    """Load the sibling config.yaml; an empty file yields an empty mapping."""
    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
        return yaml.safe_load(handle) or {}
def _query(promql: str) -> list[dict]:
vm_url = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
response = requests.get(f"{vm_url}/api/v1/query", params={"query": promql}, timeout=10)
response.raise_for_status()
payload = response.json()
return payload.get("data", {}).get("result", [])
def _expected_tasks() -> list[dict]:
    """Return every configured Ariadne schedule task as a normalized mapping.

    Fails the test run outright when the config lists no tasks, so a missing
    or empty config cannot silently pass the glue checks.
    """
    cfg = _load_config()
    tasks = [
        _normalize_task(item, cfg)
        for item in cfg.get("ariadne_schedule_tasks", [])
    ]
    assert tasks, "No Ariadne schedule tasks configured"
    return tasks
def _normalize_task(item: object, cfg: dict) -> dict:
if isinstance(item, str):
return {
"task": item,
"check_last_success": True,
"max_success_age_hours": cfg.get("max_success_age_hours", 48),
}
if isinstance(item, dict):
normalized = dict(item)
normalized.setdefault("check_last_success", True)
normalized.setdefault("max_success_age_hours", cfg.get("max_success_age_hours", 48))
return normalized
raise TypeError(f"Unsupported Ariadne schedule task config entry: {item!r}")
def _tracked_tasks(tasks: list[dict]) -> list[dict]:
tracked = [item for item in tasks if item.get("check_last_success")]
assert tracked, "No Ariadne schedule tasks are marked for success tracking"
return tracked
def _task_regex(tasks: list[dict]) -> str:
return "|".join(item["task"] for item in tasks)
def test_ariadne_schedule_series_exist():
    """Every configured task must export a next-run timestamp metric."""
    tasks = _expected_tasks()
    selector = _task_regex(tasks)
    series = _query(f'ariadne_schedule_next_run_timestamp_seconds{{task=~"{selector}"}}')
    seen = {item.get("metric", {}).get("task") for item in series}
    missing = [item["task"] for item in tasks if item["task"] not in seen]
    assert not missing, f"Missing next-run metrics for: {', '.join(missing)}"
def test_ariadne_schedule_recent_success():
    """Tracked tasks must report a last-success timestamp that is not stale."""
    tasks = _tracked_tasks(_expected_tasks())
    selector = _task_regex(tasks)
    series = _query(f'ariadne_schedule_last_success_timestamp_seconds{{task=~"{selector}"}}')
    seen = {item.get("metric", {}).get("task") for item in series}
    missing = [item["task"] for item in tasks if item["task"] not in seen]
    assert not missing, f"Missing last-success metrics for: {', '.join(missing)}"
    now = datetime.now(timezone.utc)
    # Hours elapsed since each task's last reported success.
    age_by_task = {
        item.get("metric", {}).get("task"): (now - datetime.fromtimestamp(float(item["value"][1]), tz=timezone.utc)).total_seconds() / 3600
        for item in series
    }
    # Flag tasks whose last success is older than their configured maximum age.
    too_old = [
        f"{task} ({age_by_task[task]:.1f}h > {item['max_success_age_hours']}h)"
        for item in tasks
        if (task := item["task"]) in age_by_task and age_by_task[task] > float(item["max_success_age_hours"])
    ]
    assert not too_old, "Ariadne schedules are stale: " + ", ".join(too_old)
def test_ariadne_schedule_last_status_present_and_boolean():
    """Tracked tasks must report a last-status metric whose value is 0 or 1."""
    tasks = _tracked_tasks(_expected_tasks())
    selector = _task_regex(tasks)
    series = _query(f'ariadne_schedule_last_status{{task=~"{selector}"}}')
    seen = {item.get("metric", {}).get("task") for item in series}
    missing = [item["task"] for item in tasks if item["task"] not in seen]
    assert not missing, f"Missing last-status metrics for: {', '.join(missing)}"
    # Any value other than exactly 0 or 1 means the exporter is misbehaving.
    invalid = []
    for item in series:
        task = item.get("metric", {}).get("task")
        value = float(item["value"][1])
        if value not in (0.0, 1.0):
            invalid.append(f"{task}={value}")
    assert not invalid, f"Unexpected Ariadne last-status values: {', '.join(invalid)}"

View File

@ -1,3 +1,5 @@
"""Glue checks for the metrics the quality-gate publishes."""
from __future__ import annotations
import os
@ -23,26 +25,63 @@ def _query(promql: str) -> list[dict]:
return payload.get("data", {}).get("result", [])
def test_glue_metrics_present():
series = _query('kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}')
assert series, "No glue cronjob label series found"
def _expected_tasks() -> list[dict]:
cfg = _load_config()
tasks = [
_normalize_task(item, cfg)
for item in cfg.get("ariadne_schedule_tasks", [])
]
assert tasks, "No Ariadne schedule tasks configured"
return tasks
def test_glue_metrics_success_join():
query = (
"kube_cronjob_status_last_successful_time "
'and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
)
series = _query(query)
assert series, "No glue cronjob last success series found"
def _normalize_task(item: object, cfg: dict) -> dict:
if isinstance(item, str):
return {
"task": item,
"check_last_success": True,
"max_success_age_hours": cfg.get("max_success_age_hours", 48),
}
if isinstance(item, dict):
normalized = dict(item)
normalized.setdefault("check_last_success", True)
normalized.setdefault("max_success_age_hours", cfg.get("max_success_age_hours", 48))
return normalized
raise TypeError(f"Unsupported Ariadne schedule task config entry: {item!r}")
def _tracked_tasks(tasks: list[dict]) -> list[dict]:
tracked = [item for item in tasks if item.get("check_last_success")]
assert tracked, "No Ariadne schedule tasks are marked for success tracking"
return tracked
def _task_regex(tasks: list[dict]) -> str:
return "|".join(item["task"] for item in tasks)
def test_ariadne_schedule_metrics_present():
cfg = _load_config()
expected = cfg.get("ariadne_schedule_tasks", [])
if not expected:
return
series = _query("ariadne_schedule_next_run_timestamp_seconds")
tasks = {item.get("metric", {}).get("task") for item in series}
missing = [task for task in expected if task not in tasks]
tasks = _expected_tasks()
selector = _task_regex(tasks)
series = _query(f'ariadne_schedule_next_run_timestamp_seconds{{task=~"{selector}"}}')
seen = {item.get("metric", {}).get("task") for item in series}
missing = [item["task"] for item in tasks if item["task"] not in seen]
assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"
def test_ariadne_schedule_success_and_status_metrics_present():
tasks = _tracked_tasks(_expected_tasks())
selector = _task_regex(tasks)
success = _query(f'ariadne_schedule_last_success_timestamp_seconds{{task=~"{selector}"}}')
status = _query(f'ariadne_schedule_last_status{{task=~"{selector}"}}')
success_tasks = {item.get("metric", {}).get("task") for item in success}
status_tasks = {item.get("metric", {}).get("task") for item in status}
expected = {item["task"] for item in tasks}
missing_success = sorted(expected - success_tasks)
missing_status = sorted(expected - status_tasks)
assert not missing_success, f"Missing Ariadne success metrics for: {', '.join(missing_success)}"
assert not missing_status, f"Missing Ariadne status metrics for: {', '.join(missing_status)}"

View File

@ -0,0 +1,401 @@
{
"version": 1,
"generated_from": "Jenkins titan-iac build 225 Trivy filesystem scan",
"default_expires_at": "2026-05-22",
"ticket": "atlas-quality-wave-k8s-hardening",
"default_reason": "Existing Kubernetes manifest hardening baseline accepted only for the first quality-gate rollout; fix or renew explicitly before expiry.",
"misconfigurations": [
{
"id": "DS-0002",
"targets": [
"dockerfiles/Dockerfile.ananke-node-helper"
]
},
{
"id": "KSV-0009",
"targets": [
"services/mailu/vip-controller.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml"
]
},
{
"id": "KSV-0010",
"targets": [
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml"
]
},
{
"id": "KSV-0014",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml",
"infrastructure/core/ntp-sync-daemonset.yaml",
"infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml",
"infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml",
"infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml",
"infrastructure/longhorn/core/vault-sync-deployment.yaml",
"infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml",
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"infrastructure/postgres/statefulset.yaml",
"infrastructure/vault-csi/vault-csi-provider.yaml",
"services/ai-llm/deployment.yaml",
"services/bstein-dev-home/backend-deployment.yaml",
"services/bstein-dev-home/chat-ai-gateway-deployment.yaml",
"services/bstein-dev-home/frontend-deployment.yaml",
"services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml",
"services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml",
"services/bstein-dev-home/vault-sync-deployment.yaml",
"services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml",
"services/comms/atlasbot-deployment.yaml",
"services/comms/coturn.yaml",
"services/comms/element-call-deployment.yaml",
"services/comms/guest-name-job.yaml",
"services/comms/guest-register-deployment.yaml",
"services/comms/livekit-token-deployment.yaml",
"services/comms/livekit.yaml",
"services/comms/mas-deployment.yaml",
"services/comms/oneoffs/bstein-force-leave-job.yaml",
"services/comms/oneoffs/comms-secrets-ensure-job.yaml",
"services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml",
"services/comms/oneoffs/mas-db-ensure-job.yaml",
"services/comms/oneoffs/mas-local-users-ensure-job.yaml",
"services/comms/oneoffs/othrys-kick-numeric-job.yaml",
"services/comms/oneoffs/synapse-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-signingkey-ensure-job.yaml",
"services/comms/oneoffs/synapse-user-seed-job.yaml",
"services/comms/pin-othrys-job.yaml",
"services/comms/reset-othrys-room-job.yaml",
"services/comms/seed-othrys-room.yaml",
"services/comms/vault-sync-deployment.yaml",
"services/comms/wellknown.yaml",
"services/crypto/monerod/deployment.yaml",
"services/crypto/wallet-monero-temp/deployment.yaml",
"services/crypto/xmr-miner/deployment.yaml",
"services/crypto/xmr-miner/vault-sync-deployment.yaml",
"services/crypto/xmr-miner/xmrig-daemonset.yaml",
"services/finance/actual-budget-deployment.yaml",
"services/finance/firefly-cronjob.yaml",
"services/finance/firefly-deployment.yaml",
"services/finance/firefly-user-sync-cronjob.yaml",
"services/finance/oneoffs/finance-secrets-ensure-job.yaml",
"services/gitea/deployment.yaml",
"services/harbor/vault-sync-deployment.yaml",
"services/health/wger-admin-ensure-cronjob.yaml",
"services/health/wger-deployment.yaml",
"services/health/wger-user-sync-cronjob.yaml",
"services/jellyfin/deployment.yaml",
"services/jellyfin/loader.yaml",
"services/jenkins/deployment.yaml",
"services/jenkins/vault-sync-deployment.yaml",
"services/keycloak/deployment.yaml",
"services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/ldap-federation-job.yaml",
"services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/mas-secrets-ensure-job.yaml",
"services/keycloak/oneoffs/metis-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/metis-ssh-keys-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-e2e-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml",
"services/keycloak/oneoffs/portal-e2e-target-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml",
"services/keycloak/oneoffs/quality-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/realm-settings-job.yaml",
"services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/user-overrides-job.yaml",
"services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml",
"services/keycloak/vault-sync-deployment.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/logging/oauth2-proxy.yaml",
"services/logging/oneoffs/opensearch-dashboards-setup-job.yaml",
"services/logging/oneoffs/opensearch-ism-job.yaml",
"services/logging/oneoffs/opensearch-observability-setup-job.yaml",
"services/logging/opensearch-prune-cronjob.yaml",
"services/logging/vault-sync-deployment.yaml",
"services/mailu/mailu-sync-cronjob.yaml",
"services/mailu/mailu-sync-listener.yaml",
"services/mailu/oneoffs/mailu-sync-job.yaml",
"services/mailu/vault-sync-deployment.yaml",
"services/mailu/vip-controller.yaml",
"services/maintenance/ariadne-deployment.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-k3s-token-sync-cronjob.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oauth2-proxy-metis.yaml",
"services/maintenance/oauth2-proxy-soteria.yaml",
"services/maintenance/oneoffs/ariadne-migrate-job.yaml",
"services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/maintenance/pod-cleaner-cronjob.yaml",
"services/maintenance/soteria-deployment.yaml",
"services/maintenance/vault-sync-deployment.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml",
"services/monitoring/oneoffs/grafana-org-bootstrap.yaml",
"services/monitoring/oneoffs/grafana-user-dedupe-job.yaml",
"services/monitoring/platform-quality-gateway-deployment.yaml",
"services/monitoring/platform-quality-suite-probe-cronjob.yaml",
"services/monitoring/postmark-exporter-deployment.yaml",
"services/monitoring/vault-sync-deployment.yaml",
"services/nextcloud-mail-sync/cronjob.yaml",
"services/nextcloud/collabora.yaml",
"services/nextcloud/cronjob.yaml",
"services/nextcloud/deployment.yaml",
"services/nextcloud/maintenance-cronjob.yaml",
"services/oauth2-proxy/deployment.yaml",
"services/openldap/statefulset.yaml",
"services/outline/deployment.yaml",
"services/outline/redis-deployment.yaml",
"services/pegasus/deployment.yaml",
"services/pegasus/vault-sync-deployment.yaml",
"services/planka/deployment.yaml",
"services/quality/oauth2-proxy-sonarqube.yaml",
"services/quality/sonarqube-deployment.yaml",
"services/quality/sonarqube-exporter-deployment.yaml",
"services/sui-metrics/base/deployment.yaml",
"services/typhon/vault-sync-deployment.yaml",
"services/vault/k8s-auth-config-cronjob.yaml",
"services/vault/oidc-config-cronjob.yaml",
"services/vault/statefulset.yaml",
"services/vaultwarden/deployment.yaml"
]
},
{
"id": "KSV-0017",
"targets": [
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml"
]
},
{
"id": "KSV-0041",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml",
"infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml",
"infrastructure/traefik/clusterrole.yaml",
"services/bstein-dev-home/rbac.yaml",
"services/comms/comms-secrets-ensure-rbac.yaml",
"services/comms/mas-db-ensure-rbac.yaml",
"services/comms/mas-secrets-ensure-rbac.yaml",
"services/maintenance/soteria-rbac.yaml"
]
},
{
"id": "KSV-0047",
"targets": [
"services/monitoring/rbac.yaml"
]
},
{
"id": "KSV-0053",
"targets": [
"services/comms/comms-secrets-ensure-rbac.yaml",
"services/comms/mas-db-ensure-rbac.yaml",
"services/jenkins/serviceaccount.yaml",
"services/maintenance/ariadne-rbac.yaml"
]
},
{
"id": "KSV-0056",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml",
"infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml",
"services/jenkins/serviceaccount.yaml",
"services/maintenance/disable-k3s-traefik-rbac.yaml",
"services/maintenance/k3s-traefik-cleanup-rbac.yaml"
]
},
{
"id": "KSV-0114",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml"
]
},
{
"id": "KSV-0118",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml",
"infrastructure/core/coredns-deployment.yaml",
"infrastructure/core/ntp-sync-daemonset.yaml",
"infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml",
"infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml",
"infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml",
"infrastructure/longhorn/core/vault-sync-deployment.yaml",
"infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml",
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"infrastructure/postgres/statefulset.yaml",
"infrastructure/vault-csi/vault-csi-provider.yaml",
"services/ai-llm/deployment.yaml",
"services/bstein-dev-home/backend-deployment.yaml",
"services/bstein-dev-home/chat-ai-gateway-deployment.yaml",
"services/bstein-dev-home/frontend-deployment.yaml",
"services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml",
"services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml",
"services/bstein-dev-home/vault-sync-deployment.yaml",
"services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml",
"services/comms/atlasbot-deployment.yaml",
"services/comms/coturn.yaml",
"services/comms/element-call-deployment.yaml",
"services/comms/guest-name-job.yaml",
"services/comms/livekit-token-deployment.yaml",
"services/comms/livekit.yaml",
"services/comms/mas-deployment.yaml",
"services/comms/oneoffs/bstein-force-leave-job.yaml",
"services/comms/oneoffs/comms-secrets-ensure-job.yaml",
"services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml",
"services/comms/oneoffs/mas-db-ensure-job.yaml",
"services/comms/oneoffs/mas-local-users-ensure-job.yaml",
"services/comms/oneoffs/othrys-kick-numeric-job.yaml",
"services/comms/oneoffs/synapse-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-signingkey-ensure-job.yaml",
"services/comms/oneoffs/synapse-user-seed-job.yaml",
"services/comms/pin-othrys-job.yaml",
"services/comms/reset-othrys-room-job.yaml",
"services/comms/seed-othrys-room.yaml",
"services/comms/vault-sync-deployment.yaml",
"services/comms/wellknown.yaml",
"services/crypto/monerod/deployment.yaml",
"services/crypto/wallet-monero-temp/deployment.yaml",
"services/crypto/xmr-miner/deployment.yaml",
"services/crypto/xmr-miner/vault-sync-deployment.yaml",
"services/crypto/xmr-miner/xmrig-daemonset.yaml",
"services/finance/firefly-cronjob.yaml",
"services/finance/firefly-deployment.yaml",
"services/finance/firefly-user-sync-cronjob.yaml",
"services/finance/oneoffs/finance-secrets-ensure-job.yaml",
"services/gitea/deployment.yaml",
"services/harbor/vault-sync-deployment.yaml",
"services/health/wger-admin-ensure-cronjob.yaml",
"services/health/wger-deployment.yaml",
"services/health/wger-user-sync-cronjob.yaml",
"services/jellyfin/loader.yaml",
"services/jenkins/deployment.yaml",
"services/jenkins/vault-sync-deployment.yaml",
"services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/ldap-federation-job.yaml",
"services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/mas-secrets-ensure-job.yaml",
"services/keycloak/oneoffs/metis-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/metis-ssh-keys-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-e2e-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml",
"services/keycloak/oneoffs/portal-e2e-target-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml",
"services/keycloak/oneoffs/quality-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/realm-settings-job.yaml",
"services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/user-overrides-job.yaml",
"services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml",
"services/keycloak/vault-sync-deployment.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/logging/oauth2-proxy.yaml",
"services/logging/oneoffs/opensearch-dashboards-setup-job.yaml",
"services/logging/oneoffs/opensearch-ism-job.yaml",
"services/logging/oneoffs/opensearch-observability-setup-job.yaml",
"services/logging/opensearch-prune-cronjob.yaml",
"services/logging/vault-sync-deployment.yaml",
"services/mailu/mailu-sync-cronjob.yaml",
"services/mailu/mailu-sync-listener.yaml",
"services/mailu/oneoffs/mailu-sync-job.yaml",
"services/mailu/vault-sync-deployment.yaml",
"services/mailu/vip-controller.yaml",
"services/maintenance/ariadne-deployment.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-k3s-token-sync-cronjob.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oauth2-proxy-metis.yaml",
"services/maintenance/oauth2-proxy-soteria.yaml",
"services/maintenance/oneoffs/ariadne-migrate-job.yaml",
"services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/maintenance/pod-cleaner-cronjob.yaml",
"services/maintenance/soteria-deployment.yaml",
"services/maintenance/vault-sync-deployment.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml",
"services/monitoring/oneoffs/grafana-org-bootstrap.yaml",
"services/monitoring/oneoffs/grafana-user-dedupe-job.yaml",
"services/monitoring/platform-quality-gateway-deployment.yaml",
"services/monitoring/platform-quality-suite-probe-cronjob.yaml",
"services/monitoring/postmark-exporter-deployment.yaml",
"services/monitoring/vault-sync-deployment.yaml",
"services/nextcloud/collabora.yaml",
"services/oauth2-proxy/deployment.yaml",
"services/openldap/statefulset.yaml",
"services/outline/deployment.yaml",
"services/outline/redis-deployment.yaml",
"services/pegasus/vault-sync-deployment.yaml",
"services/quality/oauth2-proxy-sonarqube.yaml",
"services/quality/sonarqube-deployment.yaml",
"services/quality/sonarqube-exporter-deployment.yaml",
"services/sui-metrics/base/deployment.yaml",
"services/sui-metrics/overlays/atlas/patch-node-selector.yaml",
"services/typhon/deployment.yaml",
"services/typhon/vault-sync-deployment.yaml",
"services/vault/k8s-auth-config-cronjob.yaml",
"services/vault/oidc-config-cronjob.yaml",
"services/vaultwarden/deployment.yaml"
]
},
{
"id": "KSV-0121",
"targets": [
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml"
]
}
]
}

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/ariadne
branch: main
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update"
push:
branch: feature/ariadne
branch: main
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -21,6 +21,7 @@ resources:
- sui-metrics/kustomization.yaml
- openldap/kustomization.yaml
- keycloak/kustomization.yaml
- quality/kustomization.yaml
- oauth2-proxy/kustomization.yaml
- mailu/kustomization.yaml
- jenkins/kustomization.yaml

View File

@ -0,0 +1,35 @@
# clusters/atlas/flux-system/applications/quality/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: quality
namespace: flux-system
spec:
interval: 10m
path: ./services/quality
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: quality
dependsOn:
- name: traefik
- name: cert-manager
- name: keycloak
- name: vault
- name: postgres
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: sonarqube
namespace: quality
- apiVersion: apps/v1
kind: Deployment
name: sonarqube-exporter
namespace: quality
- apiVersion: apps/v1
kind: Deployment
name: oauth2-proxy-sonarqube
namespace: quality
wait: false
timeout: 20m

View File

@ -13,6 +13,7 @@ spec:
name: flux-system
targetNamespace: climate
dependsOn:
- name: vault
- name: vault-csi
- name: monitoring
healthChecks:

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/ariadne
branch: main
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: feature/ariadne
branch: main
update:
strategy: Setters
path: services/maintenance

View File

@ -2,4 +2,8 @@ FROM python:3.11-slim
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
RUN pip install --no-cache-dir requests psycopg2-binary
RUN pip install --no-cache-dir requests psycopg2-binary \
&& groupadd --system guest-tools \
&& useradd --system --uid 65532 --gid guest-tools --home-dir /nonexistent --shell /usr/sbin/nologin guest-tools
USER guest-tools

View File

@ -1,16 +1,8 @@
FROM --platform=$BUILDPLATFORM opensearchproject/data-prepper:2.8.0 AS source
FROM --platform=$TARGETPLATFORM eclipse-temurin:17-jre
# Use the mirrored Harbor artifact so CI does not depend on Docker Hub egress.
FROM registry.bstein.dev/streaming/data-prepper@sha256:32ac6ad42e0f12da08bebee307e290b17d127b30def9b06eeaffbcbbc5033e83
ENV DATA_PREPPER_PATH=/usr/share/data-prepper
RUN useradd -u 10001 -M -U -d / -s /usr/sbin/nologin data_prepper \
&& mkdir -p /var/log/data-prepper
COPY --from=source /usr/share/data-prepper /usr/share/data-prepper
RUN chown -R 10001:10001 /usr/share/data-prepper /var/log/data-prepper
USER 10001
WORKDIR /usr/share/data-prepper
CMD ["bin/data-prepper"]

View File

@ -1,10 +1,13 @@
FROM ghcr.io/element-hq/lk-jwt-service:0.3.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
RUN apk add --no-cache ca-certificates \
&& addgroup -S livekit-token \
&& adduser -S -D -H -u 65532 -G livekit-token livekit-token
COPY --from=base /lk-jwt-service /lk-jwt-service
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER livekit-token
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/lk-jwt-service"]

View File

@ -29,10 +29,12 @@ FROM ${DEBIAN_IMAGE}
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends ca-certificates; \
update-ca-certificates; rm -rf /var/lib/apt/lists/*
update-ca-certificates; rm -rf /var/lib/apt/lists/*; \
groupadd --system p2pool; \
useradd --system --uid 65532 --gid p2pool --home-dir /nonexistent --shell /usr/sbin/nologin p2pool
COPY --from=fetch /out/p2pool /usr/local/bin/p2pool
RUN /usr/local/bin/p2pool --version || true
EXPOSE 3333
USER p2pool
ENTRYPOINT ["/usr/local/bin/p2pool"]

View File

@ -26,9 +26,12 @@ RUN set -eux; \
curl -fsSL "$URL" -o /opt/monero/monero.tar.bz2; \
tar -xjf /opt/monero/monero.tar.bz2 -C /opt/monero --strip-components=1; \
install -m 0755 /opt/monero/monero-wallet-rpc /usr/local/bin/monero-wallet-rpc; \
rm -f /opt/monero/monero.tar.bz2
rm -f /opt/monero/monero.tar.bz2; \
groupadd --system monero; \
useradd --system --uid 1000 --gid monero --home-dir /nonexistent --shell /usr/sbin/nologin monero
ENV PATH="/usr/local/bin:/usr/bin:/bin"
RUN /usr/local/bin/monero-wallet-rpc --version || true
EXPOSE 18083
USER monero

View File

@ -23,10 +23,14 @@ RUN set -eux; \
mkdir -p /opt/monero; \
tar -xjf /tmp/monero.tar.bz2 -C /opt/monero --strip-components=1; \
rm -f /tmp/monero.tar.bz2; \
groupadd --system monero; \
useradd --system --uid 1000 --gid monero --home-dir /nonexistent --shell /usr/sbin/nologin monero; \
mkdir -p /data; \
chown monero:monero /data; \
chmod 0770 /data
ENV LD_LIBRARY_PATH=/opt/monero:/opt/monero/lib \
PATH="/opt/monero:${PATH}"
USER monero
CMD ["/opt/monero/monerod", "--version"]

View File

@ -1,10 +1,13 @@
FROM quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
RUN apk add --no-cache ca-certificates \
&& addgroup -S oauth2-proxy \
&& adduser -S -D -H -u 65532 -G oauth2-proxy oauth2-proxy
COPY --from=base /bin/oauth2-proxy /bin/oauth2-proxy
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER oauth2-proxy
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/bin/oauth2-proxy"]

View File

@ -1,10 +1,13 @@
FROM registry.bstein.dev/streaming/pegasus:1.2.32 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
RUN apk add --no-cache ca-certificates \
&& addgroup -S pegasus \
&& adduser -S -D -H -u 65532 -G pegasus pegasus
COPY --from=base /pegasus /pegasus
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER pegasus
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/pegasus"]

View File

@ -0,0 +1,48 @@
# dockerfiles/Dockerfile.quality-tools
FROM debian:bookworm-slim
ARG SONAR_SCANNER_VERSION=8.0.1.6346
ARG TRIVY_VERSION=0.70.0
ENV TRIVY_CACHE_DIR=/opt/trivy-cache
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bash \
ca-certificates \
curl \
git \
jq \
unzip \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd --system quality-tools \
&& useradd --system --uid 65532 --gid quality-tools --home-dir /nonexistent --shell /usr/sbin/nologin quality-tools
RUN set -eux; \
scanner_zip="sonar-scanner-cli-${SONAR_SCANNER_VERSION}-linux-aarch64.zip"; \
base_url="https://binaries.sonarsource.com/Distribution/sonar-scanner-cli"; \
curl -fsSL "${base_url}/${scanner_zip}" -o "/tmp/${scanner_zip}"; \
curl -fsSL "${base_url}/${scanner_zip}.sha256" -o "/tmp/${scanner_zip}.sha256"; \
printf '%s %s\n' "$(cat "/tmp/${scanner_zip}.sha256")" "/tmp/${scanner_zip}" | sha256sum -c -; \
unzip -q "/tmp/${scanner_zip}" -d /opt; \
ln -s "/opt/sonar-scanner-${SONAR_SCANNER_VERSION}-linux-aarch64/bin/sonar-scanner" /usr/local/bin/sonar-scanner; \
rm -f "/tmp/${scanner_zip}" "/tmp/${scanner_zip}.sha256"
RUN set -eux; \
trivy_tgz="trivy_${TRIVY_VERSION}_Linux-ARM64.tar.gz"; \
curl -fsSL "https://github.com/aquasecurity/trivy/releases/download/v${TRIVY_VERSION}/${trivy_tgz}" -o "/tmp/${trivy_tgz}"; \
tar -C /usr/local/bin -xzf "/tmp/${trivy_tgz}" trivy; \
rm -f "/tmp/${trivy_tgz}"; \
trivy --version; \
sonar-scanner -v
RUN set -eux; \
mkdir -p "${TRIVY_CACHE_DIR}"; \
trivy image --download-db-only --cache-dir "${TRIVY_CACHE_DIR}"; \
chmod -R a+rX "${TRIVY_CACHE_DIR}"; \
mkdir -p /workspace; \
chown quality-tools:quality-tools /workspace
WORKDIR /workspace
USER quality-tools

View File

@ -33,6 +33,36 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
@ -46,6 +76,36 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
@ -59,6 +119,36 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:

View File

@ -26,6 +26,9 @@ spec:
cleanupOnFail: true
timeout: 15m
values:
global:
nodeSelector:
longhorn-host: "true"
service:
ui:
type: NodePort
@ -78,3 +81,12 @@ spec:
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: Always
longhornManager:
nodeSelector:
longhorn-host: "true"
longhornDriver:
nodeSelector:
longhorn-host: "true"
longhornUI:
nodeSelector:
longhorn-host: "true"

View File

@ -2,10 +2,11 @@
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-settings-ensure-4
name: longhorn-settings-ensure-7
namespace: longhorn-system
spec:
backoffLimit: 0
activeDeadlineSeconds: 240
ttlSecondsAfterFinished: 3600
template:
spec:

View File

@ -4,11 +4,12 @@ set -eu
# Longhorn blocks direct CR patches for some settings; use the internal API instead.
api_base="http://longhorn-backend.longhorn-system.svc:9500/v1/settings"
curl_opts="-fsS --connect-timeout 3 --max-time 15"
wait_for_api() {
attempts=30
while [ "${attempts}" -gt 0 ]; do
if curl -fsS "${api_base}" >/dev/null 2>&1; then
if curl ${curl_opts} "${api_base}" >/dev/null 2>&1; then
return 0
fi
attempts=$((attempts - 1))
@ -22,14 +23,14 @@ update_setting() {
name="$1"
value="$2"
current="$(curl -fsS "${api_base}/${name}" || true)"
current="$(curl ${curl_opts} "${api_base}/${name}" || true)"
if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
echo "Setting ${name} already set."
return 0
fi
echo "Setting ${name} -> ${value}"
curl -fsS -X PUT \
curl ${curl_opts} -X PUT \
-H "Content-Type: application/json" \
-d "{\"value\":\"${value}\"}" \
"${api_base}/${name}" >/dev/null
@ -40,3 +41,7 @@ update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v
update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"
# Keep storage-heavy nodes from getting hammered by rebuild storms and skew.
update_setting replica-auto-balance "best-effort"
update_setting concurrent-replica-rebuild-per-node-limit "2"
update_setting node-down-pod-deletion-policy "delete-both-statefulset-and-deployment-pod"

View File

@ -13,9 +13,27 @@ spec:
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
- objectName: "longhorn-backup-b2__AWS_ACCESS_KEY_ID"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ACCESS_KEY_ID"
- objectName: "longhorn-backup-b2__AWS_SECRET_ACCESS_KEY"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_SECRET_ACCESS_KEY"
- objectName: "longhorn-backup-b2__AWS_ENDPOINTS"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ENDPOINTS"
secretObjects:
- secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson
- secretName: longhorn-backup-b2
type: Opaque
data:
- objectName: longhorn-backup-b2__AWS_ACCESS_KEY_ID
key: AWS_ACCESS_KEY_ID
- objectName: longhorn-backup-b2__AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
- objectName: longhorn-backup-b2__AWS_ENDPOINTS
key: AWS_ENDPOINTS

View File

@ -26,6 +26,16 @@ spec:
- key: hardware
operator: In
values: ["rpi5", "rpi4"]
- weight: 90
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
containers:
- name: sync
image: alpine:3.20

View File

@ -70,6 +70,38 @@ items:
dnsPolicy: ClusterFirst
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: atlas-traefik-ingress-controller

View File

@ -41,3 +41,12 @@ spec:
failurePolicy: Ignore
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13", "titan-15", "titan-17", "titan-19"]

File diff suppressed because it is too large Load Diff

View File

@ -4,13 +4,21 @@ import pathlib
def load_module():
path = pathlib.Path(__file__).resolve().parents[1] / "dashboards_render_atlas.py"
spec = importlib.util.spec_from_file_location("dashboards_render_atlas", path)
spec = importlib.util.spec_from_file_location("scripts.dashboards_render_atlas", path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def flatten_panels(panels):
flat = []
for panel in panels:
flat.append(panel)
flat.extend(panel.get("panels", []))
return flat
def test_table_panel_options_and_filterable():
mod = load_module()
panel = mod.table_panel(
@ -56,3 +64,71 @@ def test_render_configmap_writes(tmp_path):
content = (tmp_path / "cm.yaml").read_text()
assert "kind: ConfigMap" in content
assert f"{uid}.json" in content
def test_testing_suite_variable_uses_canonical_values_only():
mod = load_module()
variable = mod.testing_suite_variable()
canonical_matcher = "|".join(mod.PLATFORM_TEST_SUITE_NAMES)
legacy_names = {"bstein-home", "data-prepper", "titan-iac", "pegasus-health"}
assert variable["allValue"] == canonical_matcher
assert not any(alias in variable["query"] for alias in legacy_names)
assert not any(alias in variable["allValue"] for alias in legacy_names)
assert [option["value"] for option in variable["options"]] == mod.PLATFORM_TEST_SUITE_NAMES
def test_jobs_dashboard_separates_current_gate_health_from_reliability():
mod = load_module()
dashboard = mod.build_jobs_dashboard()
panels_by_title = {panel["title"]: panel for panel in flatten_panels(dashboard["panels"])}
assert "Current Gate Health by Suite" in panels_by_title
assert "Run Reliability by Suite (24h)" in panels_by_title
assert "Run Reliability History by Suite" in panels_by_title
assert "Failures by Suite (24h)" not in panels_by_title
assert "Success Rate by Suite (24h)" not in panels_by_title
current_gate_expr = panels_by_title["Current Gate Health by Suite"]["targets"][0]["expr"]
assert 'check)' in current_gate_expr
assert 'result=~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
reliability_panel = panels_by_title["Run Reliability by Suite (24h)"]
reliability_expr = reliability_panel["targets"][0]["expr"]
assert "platform_quality_gate_runs_total" in reliability_expr
assert "> 0" in reliability_expr
assert "- 1" in reliability_expr
assert reliability_panel["fieldConfig"]["defaults"]["mappings"] == [
{"type": "value", "options": {"-1": {"text": "no runs"}}}
]
def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint():
mod = load_module()
dashboard = mod.build_jobs_dashboard()
panels = dashboard["panels"]
rows = [panel for panel in panels if panel["type"] == "row"]
visible_query_panels = [panel for panel in panels if panel["type"] != "row"]
nested_panels_by_title = {
child["title"]: child
for row in rows
for child in row.get("panels", [])
}
assert len(panels) == 16
assert len(visible_query_panels) == 11
assert sum(len(panel.get("targets", [])) for panel in visible_query_panels) == 11
assert [row["title"] for row in rows] == [
"Reliability And Run History",
"Failure Trends By Check",
"Success Trends By Check",
"Test Drilldowns And Problem Tests",
"Telemetry Completeness, SonarQube, And Branches",
]
assert all(row["collapsed"] for row in rows)
assert "Failure Trend: Coverage" in nested_panels_by_title
assert "Success Trend: Supply Chain" in nested_panels_by_title
assert "Selected Test Pass Rate History" in nested_panels_by_title
assert "Missing Coverage Metrics by Suite" in nested_panels_by_title
assert "SonarQube API Up" in nested_panels_by_title

View File

@ -138,6 +138,100 @@ def test_kc_get_users_paginates(monkeypatch):
assert sync.SESSION.calls == 1
def test_kc_get_users_fetches_second_page_after_full_batch(monkeypatch):
sync = load_sync_module(monkeypatch)
class _PagedSession:
def __init__(self):
self.calls = 0
self.first_params = []
def get(self, *_, **kwargs):
self.calls += 1
self.first_params.append(kwargs["params"]["first"])
if self.calls == 1:
return _FakeResponse([{"id": f"u{i}"} for i in range(200)])
return _FakeResponse([{"id": "last"}])
sync.SESSION = _PagedSession()
users = sync.kc_get_users("tok")
assert len(users) == 201
assert sync.SESSION.first_params == [0, 200]
def test_get_kc_token_posts_client_credentials(monkeypatch):
sync = load_sync_module(monkeypatch)
calls = []
class _TokenSession:
def post(self, url, data, timeout):
calls.append((url, data, timeout))
return _FakeResponse({"access_token": "tok"})
sync.SESSION = _TokenSession()
assert sync.get_kc_token() == "tok"
assert calls[0][1]["grant_type"] == "client_credentials"
def test_retry_request_retries_then_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
attempts = []
sleeps = []
def _flaky():
attempts.append(1)
if len(attempts) == 1:
raise sync.requests.RequestException("temporary")
return "ok"
monkeypatch.setattr(sync.time, "sleep", lambda seconds: sleeps.append(seconds))
assert sync.retry_request("request", _flaky, attempts=2) == "ok"
assert sleeps == [2]
def test_retry_request_reraises_final_error(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.time, "sleep", lambda seconds: None)
with pytest.raises(sync.requests.RequestException):
sync.retry_request(
"request",
lambda: (_ for _ in ()).throw(sync.requests.RequestException("nope")),
attempts=1,
)
def test_retry_db_connect_retries_then_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
attempts = []
sleeps = []
def _connect(**kwargs):
attempts.append(kwargs)
if len(attempts) == 1:
raise sync.psycopg2.Error("not yet")
return "conn"
monkeypatch.setattr(sync.psycopg2, "connect", _connect)
monkeypatch.setattr(sync.time, "sleep", lambda seconds: sleeps.append(seconds))
assert sync.retry_db_connect(attempts=2) == "conn"
assert sleeps == [2]
def test_retry_db_connect_reraises_final_error(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.psycopg2, "connect", lambda **kwargs: (_ for _ in ()).throw(sync.psycopg2.Error("down")))
monkeypatch.setattr(sync.time, "sleep", lambda seconds: None)
with pytest.raises(sync.psycopg2.Error):
sync.retry_db_connect(attempts=1)
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
sync = load_sync_module(monkeypatch)
executed = []
@ -166,6 +260,87 @@ def test_ensure_mailu_user_upserts(monkeypatch):
assert captured["password"] != "pw"
def test_attribute_and_email_helpers(monkeypatch):
sync = load_sync_module(monkeypatch)
assert sync.get_attribute_value({"x": ["first", "second"]}, "x") == "first"
assert sync.get_attribute_value({"x": []}, "x") is None
assert sync.get_attribute_value({"x": "value"}, "x") == "value"
assert sync.mailu_enabled({"mailu_email": ["legacy@example.com"]}) is True
assert sync.mailu_enabled({"mailu_enabled": ["off"]}) is False
assert sync.resolve_mailu_email({"username": "fallback", "email": "user@example.com"}, {}) == "user@example.com"
assert sync.resolve_mailu_email({"username": "fallback", "email": "user@other.com"}, {}) == "fallback@example.com"
def test_safe_update_payload_filters_fields(monkeypatch):
sync = load_sync_module(monkeypatch)
payload = sync._safe_update_payload(
{
"username": "user",
"enabled": True,
"email": "user@example.com",
"emailVerified": False,
"firstName": "User",
"lastName": "Example",
"requiredActions": ["UPDATE_PASSWORD", 7],
"attributes": "not-a-dict",
"ignored": "value",
}
)
assert payload == {
"username": "user",
"enabled": True,
"email": "user@example.com",
"emailVerified": False,
"firstName": "User",
"lastName": "Example",
"requiredActions": ["UPDATE_PASSWORD"],
"attributes": {},
}
def test_ensure_system_mailboxes_handles_configurations(monkeypatch, capsys):
sync = load_sync_module(monkeypatch)
ensured = []
monkeypatch.setattr(sync, "MAILU_SYSTEM_USERS", ["postmaster@example.com", "abuse"])
monkeypatch.setattr(sync, "MAILU_SYSTEM_PASSWORD", "")
sync.ensure_system_mailboxes(object())
assert "MAILU_SYSTEM_PASSWORD is missing" in capsys.readouterr().out
def _ensure(cursor, email, password, display_name):
ensured.append((email, password, display_name))
if email == "abuse":
raise RuntimeError("boom")
monkeypatch.setattr(sync, "MAILU_SYSTEM_PASSWORD", "pw")
monkeypatch.setattr(sync, "ensure_mailu_user", _ensure)
sync.ensure_system_mailboxes(object())
out = capsys.readouterr().out
assert ensured == [
("postmaster@example.com", "pw", "postmaster"),
("abuse", "pw", "abuse"),
]
assert "Ensured system mailbox for postmaster@example.com" in out
assert "Failed to ensure system mailbox abuse" in out
def test_main_exits_without_users_or_system_mailboxes(monkeypatch, capsys):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync, "MAILU_SYSTEM_USERS", [])
monkeypatch.setattr(sync, "get_kc_token", lambda: "tok")
monkeypatch.setattr(sync, "kc_get_users", lambda token: [])
sync.main()
assert "No users found; exiting." in capsys.readouterr().out
def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")

View File

@ -0,0 +1,134 @@
import importlib.util
import io
import pathlib
import types
def load_listener_module(monkeypatch):
monkeypatch.setenv("MAILU_SYNC_WAIT_TIMEOUT_SEC", "0")
module_path = (
pathlib.Path(__file__).resolve().parents[2]
/ "services"
/ "mailu"
/ "scripts"
/ "mailu_sync_listener.py"
)
spec = importlib.util.spec_from_file_location("mailu_sync_listener_testmod", module_path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def _handler_for(listener, body):
handler = listener.Handler.__new__(listener.Handler)
raw = body if isinstance(body, bytes) else body.encode()
handler.headers = {"Content-Length": str(len(raw))}
handler.rfile = io.BytesIO(raw)
handler.responses = []
handler.headers_ended = 0
handler.send_response = lambda code: handler.responses.append(code)
handler.end_headers = lambda: setattr(handler, "headers_ended", handler.headers_ended + 1)
return handler
def test_listener_run_sync_blocking_updates_state(monkeypatch):
listener = load_listener_module(monkeypatch)
monkeypatch.setattr(listener, "time", lambda: 42.0)
monkeypatch.setattr(
listener.subprocess,
"run",
lambda command, check: types.SimpleNamespace(returncode=3),
)
assert listener._run_sync_blocking() == 3
assert listener.last_rc == 3
assert listener.last_run == 42.0
assert listener.sync_done.is_set()
listener.sync_running = True
assert listener._run_sync_blocking() == 0
def test_listener_trigger_sync_async_honors_running_and_debounce(monkeypatch):
listener = load_listener_module(monkeypatch)
starts = []
class _Thread:
def __init__(self, target, daemon):
self.target = target
self.daemon = daemon
def start(self):
starts.append((self.target, self.daemon))
monkeypatch.setattr(listener.threading, "Thread", _Thread)
monkeypatch.setattr(listener, "time", lambda: 100.0)
listener.sync_running = True
assert listener._trigger_sync_async() is False
listener.sync_running = False
listener.last_run = 95.0
assert listener._trigger_sync_async() is False
assert listener._trigger_sync_async(force=True) is True
assert starts and starts[0][1] is True
def test_listener_post_rejects_invalid_json(monkeypatch):
listener = load_listener_module(monkeypatch)
handler = _handler_for(listener, b"{not-json")
handler.do_POST()
assert handler.responses == [400]
assert handler.headers_ended == 1
def test_listener_post_triggers_async_without_wait(monkeypatch):
listener = load_listener_module(monkeypatch)
called = []
monkeypatch.setattr(listener, "_trigger_sync_async", lambda force=False: called.append(force) or True)
handler = _handler_for(listener, '{"force": true}')
handler.do_POST()
assert called == [True]
assert handler.responses == [202]
def test_listener_post_wait_returns_success_or_failure(monkeypatch):
listener = load_listener_module(monkeypatch)
called = []
monkeypatch.setattr(listener, "_trigger_sync_async", lambda force=False: called.append(force) or True)
listener.sync_running = False
listener.last_rc = 0
handler = _handler_for(listener, '{"wait": true, "force": true}')
handler.do_POST()
assert called == [True]
assert handler.responses == [200]
listener.last_rc = 2
handler = _handler_for(listener, '{"wait": true}')
handler.do_POST()
assert handler.responses == [500]
def test_listener_post_wait_keeps_running_request_successful(monkeypatch):
    """A wait=true POST issued while a sync is already running returns 200."""
    listener = load_listener_module(monkeypatch)
    listener.sync_running = True
    busy_request = _handler_for(listener, '{"wait": true}')
    busy_request.do_POST()
    assert busy_request.responses == [200]
def test_listener_log_message_is_quiet(monkeypatch):
    """Handler.log_message is a silent no-op (returns None)."""
    listener = load_listener_module(monkeypatch)
    # Bypass __init__ so no socket/request machinery is needed.
    quiet_handler = listener.Handler.__new__(listener.Handler)
    assert quiet_handler.log_message("ignored %s", "value") is None

View File

@ -0,0 +1,73 @@
#!/usr/bin/env bash
# Verify the ariadne Jenkins-workspace-cleanup rollout end to end:
#   1. reconcile the Flux kustomization
#   2. wait for the deployment rollout
#   3. check the deployment's env wiring
#   4. scrape /metrics over a port-forward and confirm cleanup metrics
#   5. surface recent cleanup signal (metric samples and logs)
# Usage: verify.sh [dry-run|active]   (default: dry-run)
set -euo pipefail

MODE="${1:-dry-run}"
if [[ "$MODE" != "dry-run" && "$MODE" != "active" ]]; then
  echo "usage: $0 [dry-run|active]" >&2
  exit 2
fi

# Map the CLI mode onto the expected env value and the Prometheus mode label.
EXPECTED_DRY_RUN="true"
PROM_MODE="dry_run"
if [[ "$MODE" == "active" ]]; then
  EXPECTED_DRY_RUN="false"
  PROM_MODE="delete"
fi

KUSTOMIZATION="${KUSTOMIZATION:-maintenance}"
NAMESPACE="${NAMESPACE:-maintenance}"
DEPLOYMENT="${DEPLOYMENT:-ariadne}"
LOCAL_METRICS_PORT="${LOCAL_METRICS_PORT:-18080}"

# Fail fast if any required CLI tool is missing.
for cmd in flux kubectl curl grep awk; do
  if ! command -v "$cmd" >/dev/null 2>&1; then
    echo "missing required command: $cmd" >&2
    exit 2
  fi
done

echo "[1/5] reconcile Flux kustomization: ${KUSTOMIZATION}"
flux reconcile kustomization "$KUSTOMIZATION" --namespace flux-system --with-source

echo "[2/5] wait for deployment rollout"
kubectl -n "$NAMESPACE" rollout status "deployment/$DEPLOYMENT" --timeout=5m

echo "[3/5] verify ariadne env wiring"
ENV_DUMP="$(kubectl -n "$NAMESPACE" get deployment "$DEPLOYMENT" -o jsonpath='{range .spec.template.spec.containers[0].env[*]}{.name}={.value}{"\n"}{end}')"
echo "$ENV_DUMP" | grep -F "ARIADNE_SCHEDULE_JENKINS_WORKSPACE_CLEANUP=45 */6 * * *"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_NAMESPACE=jenkins"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_PVC_PREFIX=pvc-workspace-"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_MIN_AGE_HOURS=24"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_DRY_RUN=${EXPECTED_DRY_RUN}"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_MAX_DELETIONS_PER_RUN=20"

echo "[4/5] scrape /metrics and confirm cleanup metrics are exported"
PF_LOG="$(mktemp)"
METRICS_FILE="$(mktemp)"
cleanup() {
  # Kill the background port-forward (if started) and drop temp files.
  if [[ -n "${PF_PID:-}" ]]; then
    kill "$PF_PID" >/dev/null 2>&1 || true
    wait "$PF_PID" 2>/dev/null || true
  fi
  rm -f "$PF_LOG" "$METRICS_FILE"
}
trap cleanup EXIT
kubectl -n "$NAMESPACE" port-forward "deployment/$DEPLOYMENT" "${LOCAL_METRICS_PORT}:8080" >"$PF_LOG" 2>&1 &
PF_PID=$!

# Retry the scrape: a fixed sleep races the port-forward on slow clusters,
# and under `set -e` a single failed curl would abort with a cryptic error.
scraped=0
for _attempt in 1 2 3 4 5 6 7 8 9 10; do
  if curl -fsS "http://127.0.0.1:${LOCAL_METRICS_PORT}/metrics" >"$METRICS_FILE" 2>/dev/null; then
    scraped=1
    break
  fi
  sleep 1
done
if [[ "$scraped" != "1" ]]; then
  echo "failed to scrape metrics via port-forward; port-forward log follows" >&2
  cat "$PF_LOG" >&2 || true
  exit 1
fi
grep -F "# HELP ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE"
grep -F "# HELP ariadne_jenkins_workspace_cleanup_objects_total" "$METRICS_FILE"

echo "[5/5] show recent cleanup signal"
if grep -q "ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE"; then
  # Best-effort: the mode-labelled sample may not exist until the first run.
  grep "ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE" | grep "mode=\"${PROM_MODE}\"" || true
else
  echo "No run counter sample yet for mode=${PROM_MODE}; wait for schedule window and re-run." >&2
fi
echo "Recent cleanup logs (if any):"
kubectl -n "$NAMESPACE" logs "deployment/$DEPLOYMENT" --tail=500 | grep -i "jenkins workspace cleanup" | tail -n 20 || true
echo "verification complete for mode=${MODE}"

View File

@ -5,7 +5,7 @@ metadata:
name: ollama
namespace: ai
spec:
replicas: 1
replicas: 0
revisionHistoryLimit: 2
strategy:
type: RollingUpdate
@ -21,7 +21,7 @@ spec:
app: ollama
annotations:
ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-22/24)
ai.bstein.dev/gpu: GPU pool (titan-20/21)
ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
spec:
affinity:
@ -32,13 +32,13 @@ spec:
- key: kubernetes.io/hostname
operator: In
values:
- titan-22
- titan-24
- titan-20
- titan-21
runtimeClassName: nvidia
volumes:
- name: models
persistentVolumeClaim:
claimName: ollama-models
claimName: ollama-models-asteria
initContainers:
- name: warm-model
image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d

View File

@ -2,12 +2,12 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ollama-models
name: ollama-models-asteria
namespace: ai
spec:
accessModes:
- ReadWriteOnce
- ReadWriteMany
resources:
requests:
storage: 30Gi
storageClassName: astreae
storageClassName: asteria

View File

@ -49,6 +49,15 @@ spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13", "titan-15", "titan-17", "titan-19"]
imagePullSecrets:
- name: harbor-regcred
containers:

View File

@ -38,6 +38,36 @@ spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
containers:
- name: gateway
image: python:3.11-slim

View File

@ -26,7 +26,7 @@ spec:
imagePullPolicy: Always
ports:
- name: http
containerPort: 80
containerPort: 8080
readinessProbe:
httpGet:
path: /

View File

@ -10,4 +10,4 @@ spec:
ports:
- name: http
port: 80
targetPort: 80
targetPort: 8080

View File

@ -20,9 +20,9 @@ resources:
- ingress.yaml
images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
newTag: 0.1.1-267 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
newTag: 0.1.1-267 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator:
- name: chat-ai-gateway
namespace: bstein-dev-home

View File

@ -53,7 +53,7 @@ spec:
registry:
existingClaim: harbor-registry
accessMode: ReadWriteOnce
size: 50Gi
size: 100Gi
jobservice:
jobLog:
existingClaim: harbor-jobservice-logs
@ -77,6 +77,7 @@ spec:
internal:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-redis
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
@ -113,6 +114,7 @@ spec:
core:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-core
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
@ -125,6 +127,10 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "harbor"
vault.hashicorp.com/agent-requests-cpu: "25m"
vault.hashicorp.com/agent-limits-cpu: "100m"
vault.hashicorp.com/agent-requests-mem: "32Mi"
vault.hashicorp.com/agent-limits-mem: "128Mi"
vault.hashicorp.com/agent-inject-secret-harbor-core-env.sh: "kv/data/atlas/harbor/harbor-core"
vault.hashicorp.com/agent-inject-template-harbor-core-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -174,6 +180,7 @@ spec:
jobservice:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-jobservice
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
@ -183,6 +190,10 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "harbor"
vault.hashicorp.com/agent-requests-cpu: "25m"
vault.hashicorp.com/agent-limits-cpu: "100m"
vault.hashicorp.com/agent-requests-mem: "32Mi"
vault.hashicorp.com/agent-limits-mem: "128Mi"
vault.hashicorp.com/agent-inject-secret-harbor-jobservice-env.sh: "kv/data/atlas/harbor/harbor-jobservice"
vault.hashicorp.com/agent-inject-template-harbor-jobservice-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -216,6 +227,7 @@ spec:
portal:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-portal
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
@ -243,6 +255,7 @@ spec:
registry:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
registry:
image:
repository: registry.bstein.dev/infra/harbor-registry
@ -270,6 +283,10 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "harbor"
vault.hashicorp.com/agent-requests-cpu: "25m"
vault.hashicorp.com/agent-limits-cpu: "100m"
vault.hashicorp.com/agent-requests-mem: "32Mi"
vault.hashicorp.com/agent-limits-mem: "128Mi"
vault.hashicorp.com/agent-inject-secret-harbor-registry-env.sh: "kv/data/atlas/harbor/harbor-registry"
vault.hashicorp.com/agent-inject-template-harbor-registry-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-registry" }}
@ -321,6 +338,7 @@ spec:
nginx:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-nginx
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}

View File

@ -8,7 +8,7 @@ spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 50Gi
storage: 100Gi
storageClassName: astreae
---
apiVersion: v1

View File

@ -77,23 +77,26 @@ spec:
mountPath: /config
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: longhorn-host
operator: In
values:
- "true"
- key: node-role.kubernetes.io/worker
operator: In
values:
- "true"
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
- key: hardware
operator: In
values:
- titan-22
- rpi5
- weight: 80
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- weight: 60
preference:
matchExpressions:
- key: kubernetes.io/hostname
@ -105,7 +108,6 @@ spec:
fsGroup: 65532
fsGroupChangePolicy: OnRootMismatch
runAsGroup: 65532
runtimeClassName: nvidia
containers:
- name: jellyfin
image: docker.io/jellyfin/jellyfin:10.11.5
@ -118,8 +120,6 @@ spec:
- name: http
containerPort: 8096
env:
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,video,utility"
- name: JELLYFIN_PublishedServerUrl
value: "https://stream.bstein.dev"
- name: PUID
@ -131,12 +131,7 @@ spec:
- name: VAULT_COPY_FILES
value: /vault/secrets/ldap-config.xml:/config/plugins/configurations/LDAP-Auth.xml
resources:
limits:
nvidia.com/gpu.shared: 1
# cpu: "4"
# memory: 8Gi
requests:
nvidia.com/gpu.shared: 1
cpu: "500m"
memory: 1Gi
volumeMounts:

View File

@ -45,6 +45,17 @@ data:
username: "${HARBOR_ROBOT_USERNAME}"
password: "${HARBOR_ROBOT_PASSWORD}"
description: "Harbor robot for pipelines"
- usernamePassword:
scope: GLOBAL
id: harbor-robot-streaming
username: "${HARBOR_STREAMING_ROBOT_USERNAME}"
password: "${HARBOR_STREAMING_ROBOT_PASSWORD}"
description: "Harbor robot for streaming pushes"
- string:
scope: GLOBAL
id: sonarqube-token
secret: "${SONARQUBE_TOKEN}"
description: "SonarQube token for quality-gate evidence collection"
jobs.yaml: |
jobs:
- script: |
@ -203,6 +214,32 @@ data:
}
}
}
pipelineJob('arcanagon') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/arcanagon.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('pegasus') {
properties {
pipelineTriggers {
@ -333,6 +370,32 @@ data:
}
}
}
pipelineJob('typhon') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/typhon.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
multibranchPipelineJob('titan-iac-quality-gate') {
branchSources {
branchSource {
@ -399,8 +462,10 @@ data:
- name: "default"
namespace: "jenkins"
workspaceVolume:
emptyDirWorkspaceVolume:
memory: false
dynamicPVC:
accessModes: "ReadWriteOnce"
requestsSize: "20Gi"
storageClassName: "astreae"
containers:
- name: "jnlp"
args: "^${computer.jnlpmac} ^${computer.name}"
@ -418,11 +483,45 @@ data:
workingDir: /home/jenkins/agent
idleMinutes: 0
instanceCap: 2147483647
label: "jenkins-jenkins-agent"
label: "jenkins-jenkins-agent "
nodeUsageMode: "NORMAL"
podRetention: Never
serviceAccount: "jenkins"
slaveConnectTimeoutStr: "100"
yaml: |
spec:
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 85
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
jenkins/jenkins-jenkins-agent: "true"
yamlMergeStrategy: override
inheritYamlMergeStrategy: false
slaveAgentPort: 50000

View File

@ -33,22 +33,35 @@ spec:
{{ with secret "kv/data/atlas/jenkins/harbor-robot-creds" }}
HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
HARBOR_STREAMING_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_STREAMING_ROBOT_PASSWORD={{ .Data.data.password }}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/harbor-streaming-robot-creds" }}
HARBOR_STREAMING_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_STREAMING_ROBOT_PASSWORD={{ .Data.data.password }}
{{ end }}
{{ with secret "kv/data/atlas/shared/harbor-pull" }}
{{- if and .Data.data.username .Data.data.password }}
HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
HARBOR_PULL_USERNAME={{ .Data.data.username }}
HARBOR_PULL_PASSWORD={{ .Data.data.password }}
{{- end }}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/gitea-pat" }}
GITEA_PAT_USERNAME={{ .Data.data.username }}
GITEA_PAT_TOKEN={{ .Data.data.token }}
{{ end }}
{{ with secret "kv/data/atlas/quality/sonarqube-oidc" }}
SONARQUBE_TOKEN={{ .Data.data.sonarqube_exporter_token }}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
{{ end }}
bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
{{ with secret "kv/data/atlas/jenkins/ariadne-api" }}
ARIADNE_JENKINS_API_USER={{ .Data.data.username }}
ARIADNE_JENKINS_API_TOKEN={{ .Data.data.token }}
{{ end }}
bstein.dev/restarted-at: "2026-04-13T06:35:00Z"
spec:
serviceAccountName: jenkins
nodeSelector:
@ -57,6 +70,21 @@ spec:
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
@ -75,6 +103,7 @@ spec:
- sso.bstein.dev
securityContext:
fsGroup: 1000
fsGroupChangePolicy: OnRootMismatch
initContainers:
- name: install-plugins
image: jenkins/jenkins:2.528.3-jdk21
@ -151,7 +180,8 @@ spec:
port: http
initialDelaySeconds: 30
periodSeconds: 10
failureThreshold: 20
timeoutSeconds: 5
failureThreshold: 60
volumeMounts:
- name: jenkins-home
mountPath: /var/jenkins_home

View File

@ -22,6 +22,7 @@ configMapGenerator:
- name: jenkins-init-scripts
namespace: jenkins
files:
- ariadne-api-user.groovy=scripts/ariadne-api-user.groovy
- git-notify-token.groovy=scripts/git-notify-token.groovy
- theme.groovy=scripts/theme.groovy
options:

View File

@ -0,0 +1,96 @@
// Jenkins init script: bootstrap a dedicated API user + token for Ariadne.
// Sources the user id and token from Vault-injected env vars, reuses a
// previously persisted token when it still matches, and persists the final
// user/token pair under /var/jenkins_home/secrets with owner-only access.
import hudson.model.User
import jenkins.security.ApiTokenProperty

def userId = (System.getenv("ARIADNE_JENKINS_API_USER") ?: "").trim()
def envTokenValue = (System.getenv("ARIADNE_JENKINS_API_TOKEN") ?: "").trim()
def tokenName = "ariadne-weather"
def tokenFile = new File("/var/jenkins_home/secrets/ariadne-api-token")
def userFile = new File("/var/jenkins_home/secrets/ariadne-api-user")
// Previously persisted token (if any) acts as a fallback token source.
def persistedTokenValue = tokenFile.exists() ? (tokenFile.text ?: "").trim() : ""
def tokenValue = envTokenValue ?: persistedTokenValue

// Without both a user id and at least one token source there is nothing to do.
if (!userId || !tokenValue) {
    println("Ariadne API user bootstrap skipped: missing ARIADNE_JENKINS_API_USER and no token source available")
    return
}

// getById(..., true) creates the user record when it does not exist yet.
def user = User.getById(userId, true)
if (user == null) {
    println("Ariadne API user bootstrap failed: unable to resolve user ${userId}")
    return
}
// Give the account a friendly display name only if one was never set.
if (!user.getFullName() || user.getFullName().trim() == userId) {
    user.setFullName("Ariadne Metrics")
}

def prop = user.getProperty(ApiTokenProperty.class)
if (prop == null) {
    prop = new ApiTokenProperty()
    user.addProperty(prop)
}

// Prefer the persisted token when it is still valid for this user, so a
// restart does not needlessly rotate a working token.
if (persistedTokenValue && prop.matchesPassword(persistedTokenValue)) {
    tokenValue = persistedTokenValue
}

// Only touch the token store when the chosen token is not already active.
if (!prop.matchesPassword(tokenValue)) {
    def store = prop.getTokenStore()
    boolean configured = false
    try {
        // Revoke any prior token with our name before installing the new one.
        def existing = store.getTokenListSortedByName().find { token ->
            try {
                token.getName() == tokenName
            } catch (Throwable ignored) {
                false
            }
        }
        if (existing != null) {
            try {
                store.revokeToken(existing.getUuid())
            } catch (Throwable ignored) {
                // Some store versions expose the uuid as a field, not a getter.
                try {
                    store.revokeToken(existing.uuid)
                } catch (Throwable ignoredAgain) {
                    println("Ariadne API user bootstrap warning: failed to revoke existing token ${tokenName}")
                }
            }
        }
        // addFixedNewToken installs a caller-supplied token value.
        store.addFixedNewToken(tokenName, tokenValue)
        configured = true
    } catch (Throwable ignored) {
        // Fallback for older token-store variants.
    }
    if (!configured) {
        if (persistedTokenValue && prop.matchesPassword(persistedTokenValue)) {
            // Keep the still-valid persisted token rather than rotating.
            tokenValue = persistedTokenValue
        } else {
            // Last resort: let Jenkins generate a token and persist its value.
            def generated = store.generateNewToken(tokenName)
            if (generated?.plainValue) {
                tokenValue = generated.plainValue
            }
            println("Ariadne API user bootstrap warning: addFixedNewToken unavailable, generated replacement token")
        }
    }
}

// Persist token and user id with owner-only read/write permissions.
tokenFile.parentFile?.mkdirs()
tokenFile.text = tokenValue + "\n"
tokenFile.setReadable(false, false)
tokenFile.setReadable(true, true)
tokenFile.setWritable(false, false)
tokenFile.setWritable(true, true)
userFile.parentFile?.mkdirs()
userFile.text = userId + "\n"
userFile.setReadable(false, false)
userFile.setReadable(true, true)
userFile.setWritable(false, false)
userFile.setWritable(true, true)
user.save()
println("Ariadne API user bootstrap complete for ${userId}")

View File

@ -35,6 +35,9 @@ subjects:
- kind: ServiceAccount
name: jenkins
namespace: jenkins
- kind: ServiceAccount
name: default
namespace: jenkins
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
@ -60,6 +63,9 @@ subjects:
- kind: ServiceAccount
name: jenkins
namespace: jenkins
- kind: ServiceAccount
name: default
namespace: jenkins
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole

View File

@ -18,6 +18,15 @@ spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13", "titan-15", "titan-17", "titan-19"]
containers:
- name: sync
image: alpine:3.20

View File

@ -24,7 +24,9 @@ resources:
- oneoffs/logs-oidc-secret-ensure-job.yaml
- oneoffs/metis-oidc-secret-ensure-job.yaml
- oneoffs/soteria-oidc-secret-ensure-job.yaml
- oneoffs/quality-oidc-secret-ensure-job.yaml
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
- oneoffs/metis-node-passwords-secret-ensure-job.yaml
- oneoffs/harbor-oidc-secret-ensure-job.yaml
- oneoffs/vault-oidc-secret-ensure-job.yaml
- oneoffs/actual-oidc-secret-ensure-job.yaml

View File

@ -0,0 +1,110 @@
# services/keycloak/oneoffs/metis-node-passwords-secret-ensure-job.yaml
# One-off job for sso/metis-node-passwords-secret-ensure-4.
# Purpose: ensure per-node Metis recovery placeholders exist in Vault.
# Atlas/root values are preserved while intranet IPs are standardized per node.
apiVersion: batch/v1
kind: Job
metadata:
  name: metis-node-passwords-secret-ensure-4
  namespace: sso
spec:
  # Single attempt; bump the name suffix to rerun after a failure.
  backoffLimit: 0
  # Garbage-collect the completed Job an hour after it finishes.
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      serviceAccountName: mas-secrets-ensure
      restartPolicy: Never
      affinity:
        nodeAffinity:
          # Must land on a worker node; prefer arm64 when available.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["arm64"]
      containers:
        - name: apply
          # NOTE(review): the script relies on curl and jq being present in
          # this image — confirm the custom kubectl image ships both.
          image: registry.bstein.dev/bstein/kubectl:1.35.0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -eu
              vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
              vault_role="${VAULT_ROLE:-sso-secrets}"
              jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
              login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
              vault_token="$(curl -sS --request POST --data "${login_payload}" "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
              if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
                echo "vault login failed" >&2
                exit 1
              fi
              ensured=0
              while read -r node intranet_ip; do
                if [ -z "${node}" ] || [ -z "${intranet_ip}" ]; then
                  continue
                fi
                secret_path="kv/data/atlas/nodes/${node}"
                read_status="$(curl -sS -o /tmp/node-read.json -w "%{http_code}" -H "X-Vault-Token: ${vault_token}" "${vault_addr}/v1/${secret_path}" || true)"
                if [ "${read_status}" = "200" ]; then
                  atlas_password="$(jq -r '.data.data.atlas_password // empty' /tmp/node-read.json)"
                  root_password="$(jq -r '.data.data.root_password // empty' /tmp/node-read.json)"
                elif [ "${read_status}" = "404" ]; then
                  atlas_password=""
                  root_password=""
                else
                  echo "Vault read failed for ${node} (status ${read_status})" >&2
                  cat /tmp/node-read.json >&2 || true
                  exit 1
                fi
                payload="$(jq -nc --arg atlas_password "${atlas_password}" --arg root_password "${root_password}" --arg intranet_ip "${intranet_ip}" '{data:{atlas_password:$atlas_password,root_password:$root_password,intranet_ip:$intranet_ip}}')"
                write_status="$(curl -sS -o /tmp/node-write.json -w "%{http_code}" -X POST -H "X-Vault-Token: ${vault_token}" -H 'Content-Type: application/json' -d "${payload}" "${vault_addr}/v1/${secret_path}")"
                if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
                  echo "Vault write failed for ${node} (status ${write_status})" >&2
                  cat /tmp/node-write.json >&2 || true
                  exit 1
                fi
                ensured=$((ensured + 1))
                echo "Ensured node secret placeholder for ${node} (${intranet_ip})"
              done <<'EOF_NODES'
              titan-jh 192.168.22.8
              titan-db 192.168.22.10
              titan-0a 192.168.22.11
              titan-0b 192.168.22.12
              titan-0c 192.168.22.13
              titan-20 192.168.22.20
              titan-21 192.168.22.21
              titan-22 192.168.22.22
              titan-23 192.168.22.23
              titan-24 192.168.22.26
              titan-04 192.168.22.30
              titan-05 192.168.22.31
              titan-06 192.168.22.32
              titan-07 192.168.22.33
              titan-08 192.168.22.34
              titan-09 192.168.22.35
              titan-10 192.168.22.36
              titan-11 192.168.22.37
              titan-12 192.168.22.40
              titan-13 192.168.22.41
              titan-14 192.168.22.42
              titan-15 192.168.22.43
              titan-16 192.168.22.44
              titan-17 192.168.22.45
              titan-18 192.168.22.46
              titan-19 192.168.22.47
              EOF_NODES
              echo "Ensured ${ensured} Metis node placeholders in Vault"

View File

@ -73,7 +73,7 @@ spec:
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
create_payload='{"clientId":"metis","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://sentinel.bstein.dev/oauth2/callback"],"webOrigins":["https://sentinel.bstein.dev"],"rootUrl":"https://sentinel.bstein.dev","baseUrl":"/"}'
create_payload='{"clientId":"metis","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://recovery.bstein.dev/oauth2/callback"],"webOrigins":["https://recovery.bstein.dev"],"rootUrl":"https://recovery.bstein.dev","baseUrl":"/"}'
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
@ -121,7 +121,7 @@ spec:
fi
fi
update_payload='{"enabled":true,"clientId":"metis","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://sentinel.bstein.dev/oauth2/callback"],"webOrigins":["https://sentinel.bstein.dev"],"rootUrl":"https://sentinel.bstein.dev","baseUrl":"/"}'
update_payload='{"enabled":true,"clientId":"metis","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://recovery.bstein.dev/oauth2/callback"],"webOrigins":["https://recovery.bstein.dev"],"rootUrl":"https://recovery.bstein.dev","baseUrl":"/"}'
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \

View File

@ -0,0 +1,198 @@
# services/keycloak/oneoffs/quality-oidc-secret-ensure-job.yaml
# One-off job for sso/quality-oidc-secret-ensure-1.
# Purpose: ensure the SonarQube oauth2-proxy OIDC client and Vault secret exist.
# Keep this completed Job around; bump the suffix if it ever needs to be rerun.
apiVersion: batch/v1
kind: Job
metadata:
  name: quality-oidc-secret-ensure-1
  namespace: sso
spec:
  # Single attempt; rerun by bumping the Job name suffix.
  backoffLimit: 0
  template:
    metadata:
      annotations:
        # Vault agent injects the Keycloak admin credentials as an env file
        # (pre-populate only: no long-running sidecar for this one-off Job).
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/agent-pre-populate-only: "true"
        vault.hashicorp.com/role: "sso-secrets"
        vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
        vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
          {{ with secret "kv/data/atlas/shared/keycloak-admin" }}
          export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
          export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
          export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
          {{ end }}
    spec:
      serviceAccountName: mas-secrets-ensure
      restartPolicy: Never
      affinity:
        nodeAffinity:
          # Must land on a worker node; prefer arm64 when available.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["arm64"]
      containers:
        - name: apply
          # NOTE(review): the script uses curl, jq, and openssl, and runs with
          # `set -euo pipefail` under /bin/sh — confirm this image's shell and
          # tooling support all of that.
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -euo pipefail
              # Load Vault-injected Keycloak admin credentials.
              . /vault/secrets/keycloak-admin-env.sh
              KC_URL="http://keycloak.sso.svc.cluster.local"
              # Fetch an admin token, retrying with linear backoff.
              ACCESS_TOKEN=""
              for attempt in 1 2 3 4 5; do
                TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
                  -H 'Content-Type: application/x-www-form-urlencoded' \
                  -d "grant_type=password" \
                  -d "client_id=admin-cli" \
                  -d "username=${KEYCLOAK_ADMIN}" \
                  -d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
                ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
                if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
                  break
                fi
                echo "Keycloak token request failed (attempt ${attempt})" >&2
                sleep $((attempt * 2))
              done
              if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
                echo "Failed to fetch Keycloak admin token" >&2
                exit 1
              fi
              # Create the sonarqube client if it does not exist yet (409 = already there).
              CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients?clientId=sonarqube" || true)"
              CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
              if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
                create_payload='{"clientId":"sonarqube","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://quality.bstein.dev/oauth2/callback"],"webOrigins":["https://quality.bstein.dev"],"rootUrl":"https://quality.bstein.dev","baseUrl":"/"}'
                status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
                  -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                  -H 'Content-Type: application/json' \
                  -d "${create_payload}" \
                  "$KC_URL/admin/realms/atlas/clients")"
                if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
                  echo "Keycloak client create failed (status ${status})" >&2
                  exit 1
                fi
                CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                  "$KC_URL/admin/realms/atlas/clients?clientId=sonarqube" || true)"
                CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
              fi
              if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
                echo "Keycloak client sonarqube not found" >&2
                exit 1
              fi
              # Attach the "groups" client scope (as optional) when missing.
              SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)"
              if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then
                echo "Keycloak client scope groups not found" >&2
                exit 1
              fi
              DEFAULT_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/default-client-scopes" || true)"
              OPTIONAL_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes" || true)"
              if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1 \
                && ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1; then
                # Try PUT first, then POST (Keycloak versions differ here).
                status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
                  -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                  "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
                if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
                  status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
                    -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                    "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
                  if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
                    echo "Failed to attach groups client scope to sonarqube (status ${status})" >&2
                    exit 1
                  fi
                fi
              fi
              # Converge the client configuration to the desired state.
              update_payload='{"enabled":true,"clientId":"sonarqube","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://quality.bstein.dev/oauth2/callback"],"webOrigins":["https://quality.bstein.dev"],"rootUrl":"https://quality.bstein.dev","baseUrl":"/"}'
              status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
                -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                -H 'Content-Type: application/json' \
                -d "${update_payload}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")"
              if [ "$status" != "204" ]; then
                echo "Keycloak client update failed (status ${status})" >&2
                exit 1
              fi
              CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
              if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
                echo "Keycloak client secret not found" >&2
                exit 1
              fi
              # Log into Vault with the pod service-account JWT.
              vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
              vault_role="${VAULT_ROLE:-sso-secrets}"
              jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
              login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
              vault_token="$(curl -sS --request POST --data "${login_payload}" \
                "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
              if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
                echo "vault login failed" >&2
                exit 1
              fi
              # Reuse an existing cookie secret when present and of a valid
              # oauth2-proxy length (16/24/32 bytes); otherwise generate one.
              read_status="$(curl -sS -o /tmp/sonarqube-oidc-read.json -w "%{http_code}" \
                -H "X-Vault-Token: ${vault_token}" \
                "${vault_addr}/v1/kv/data/atlas/quality/sonarqube-oidc" || true)"
              COOKIE_SECRET=""
              if [ "${read_status}" = "200" ]; then
                COOKIE_SECRET="$(jq -r '.data.data.cookie_secret // empty' /tmp/sonarqube-oidc-read.json)"
              elif [ "${read_status}" != "404" ]; then
                echo "Vault read failed (status ${read_status})" >&2
                cat /tmp/sonarqube-oidc-read.json >&2 || true
                exit 1
              fi
              if [ -n "${COOKIE_SECRET}" ]; then
                length="$(printf '%s' "${COOKIE_SECRET}" | wc -c | tr -d ' ')"
                if [ "${length}" != "16" ] && [ "${length}" != "24" ] && [ "${length}" != "32" ]; then
                  COOKIE_SECRET=""
                fi
              fi
              if [ -z "${COOKIE_SECRET}" ]; then
                COOKIE_SECRET="$(openssl rand -hex 16 | tr -d '\n')"
              fi
              # Write the OIDC material to Vault and verify it reads back.
              payload="$(jq -nc \
                --arg client_id "sonarqube" \
                --arg client_secret "${CLIENT_SECRET}" \
                --arg cookie_secret "${COOKIE_SECRET}" \
                '{data:{client_id:$client_id,client_secret:$client_secret,cookie_secret:$cookie_secret}}')"
              write_status="$(curl -sS -o /tmp/sonarqube-oidc-write.json -w "%{http_code}" -X POST \
                -H "X-Vault-Token: ${vault_token}" \
                -H 'Content-Type: application/json' \
                -d "${payload}" "${vault_addr}/v1/kv/data/atlas/quality/sonarqube-oidc")"
              if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
                echo "Vault write failed (status ${write_status})" >&2
                cat /tmp/sonarqube-oidc-write.json >&2 || true
                exit 1
              fi
              verify_status="$(curl -sS -o /tmp/sonarqube-oidc-verify.json -w "%{http_code}" \
                -H "X-Vault-Token: ${vault_token}" \
                "${vault_addr}/v1/kv/data/atlas/quality/sonarqube-oidc" || true)"
              if [ "${verify_status}" != "200" ]; then
                echo "Vault verify failed (status ${verify_status})" >&2
                cat /tmp/sonarqube-oidc-verify.json >&2 || true
                exit 1
              fi
              echo "SonarQube OIDC secret ready in Vault"

View File

@ -8,7 +8,6 @@ spec:
restartPolicy: Never
serviceAccountName: jenkins
nodeSelector:
hardware: rpi5
node-role.kubernetes.io/worker: "true"
containers:
- name: git
@ -16,6 +15,11 @@ spec:
command:
- cat
tty: true
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
command:
- cat
tty: true
- name: kaniko
image: gcr.io/kaniko-project/executor:v1.23.2-debug
command:
@ -23,7 +27,7 @@ spec:
tty: true
resources:
requests:
cpu: "500m"
cpu: "100m"
memory: "1Gi"
limits:
cpu: "1500m"
@ -32,15 +36,26 @@ spec:
}
}
environment {
SUITE_NAME = 'data-prepper'
SUITE_NAME = 'data_prepper'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'data_prepper'
SONARQUBE_TOKEN = credentials('sonarqube-token')
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '1'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
parameters {
string(name: 'HARBOR_REPO', defaultValue: 'registry.bstein.dev/monitoring/data-prepper', description: 'Docker repository for Data Prepper')
string(name: 'HARBOR_REPO', defaultValue: 'registry.bstein.dev/streaming/data-prepper', description: 'Docker repository for Data Prepper')
string(name: 'IMAGE_TAG', defaultValue: '2.8.0', description: 'Image tag to publish')
booleanParam(name: 'PUSH_IMAGE', defaultValue: false, description: 'Publish image artifacts (manual release only)')
booleanParam(name: 'PUSH_LATEST', defaultValue: true, description: 'Also push the latest tag')
}
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
stages {
stage('Checkout') {
steps {
@ -49,19 +64,293 @@ spec:
}
}
}
stage('Build & Push (optional)') {
when {
expression { return params.PUSH_IMAGE }
stage('Collect quality evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=services/logging,dockerfiles"
"-Dsonar.inclusions=services/logging/Jenkinsfile.data-prepper,dockerfiles/Dockerfile.data-prepper"
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**"
)
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
sonar_rc=${PIPESTATUS[0]}
sonar_report="${QUALITY_GATE_SONARQUBE_REPORT:-build/sonarqube-quality-gate.json}"
host="${SONARQUBE_HOST_URL%/}"
query="$(printf '%s' "${SONARQUBE_PROJECT_KEY}" | sed 's/ /%20/g')"
sonar_ok=0
if [ -n "${SONARQUBE_TOKEN:-}" ]; then
auth="$(printf '%s:' "${SONARQUBE_TOKEN}" | base64 | tr -d '\\n')"
if command -v curl >/dev/null 2>&1; then
curl -fsS -H "Authorization: Basic ${auth}" "${host}/api/qualitygates/project_status?projectKey=${query}" > "${sonar_report}" && sonar_ok=1
elif command -v wget >/dev/null 2>&1; then
wget -qO "${sonar_report}" --header="Authorization: Basic ${auth}" "${host}/api/qualitygates/project_status?projectKey=${query}" && sonar_ok=1
fi
elif command -v curl >/dev/null 2>&1; then
curl -fsS "${host}/api/qualitygates/project_status?projectKey=${query}" > "${sonar_report}" && sonar_ok=1
elif command -v wget >/dev/null 2>&1; then
wget -qO "${sonar_report}" "${host}/api/qualitygates/project_status?projectKey=${query}" && sonar_ok=1
fi
if [ "${sonar_ok}" -ne 1 ]; then
cat > "${sonar_report}" <<EOF
{
"status": "ERROR",
"error": "sonarqube query failed"
}
EOF
fi
scan_root=build/data-prepper-supply-chain-scan
rm -rf "${scan_root}"
mkdir -p "${scan_root}/dockerfiles" "${scan_root}/services/logging"
cp dockerfiles/Dockerfile.data-prepper "${scan_root}/dockerfiles/Dockerfile.data-prepper"
cp services/logging/Jenkinsfile.data-prepper "${scan_root}/services/logging/Jenkinsfile.data-prepper"
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL "${scan_root}"
trivy_rc=$?
set -e
printf '%s\n' "${sonar_rc}" > build/sonarqube-analysis.rc
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
critical="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' build/trivy-fs.json)"
high="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="HIGH")] | length' build/trivy-fs.json)"
secrets="$(jq '[.Results[]? | .Secrets[]?] | length' build/trivy-fs.json)"
misconfigs="$(jq '[.Results[]? | .Misconfigurations[]? | select(.Status=="FAIL" and (.Severity=="CRITICAL" or .Severity=="HIGH"))] | length' build/trivy-fs.json)"
status=ok
compliant=true
if [ "${critical}" -gt 0 ] || [ "${secrets}" -gt 0 ] || [ "${misconfigs}" -gt 0 ]; then
status=failed
compliant=false
fi
jq -n --arg status "${status}" --argjson compliant "${compliant}" --argjson critical "${critical}" --argjson high "${high}" --argjson secrets "${secrets}" --argjson misconfigs "${misconfigs}" --argjson trivy_rc "${trivy_rc}" \
'{status:$status, compliant:$compliant, category:"image_compliance", scan_type:"filesystem", scanner:"trivy", critical_vulnerabilities:$critical, high_vulnerabilities:$high, secrets:$secrets, high_or_critical_misconfigurations:$misconfigs, trivy_rc:$trivy_rc, high_vulnerability_policy:"observe"}' > build/ironbank-compliance.json
'''
}
container('git') {
sh '''
set -euo pipefail
apk add --no-cache curl jq >/dev/null 2>&1 || true
mkdir -p build
sonar_report="${QUALITY_GATE_SONARQUBE_REPORT:-build/sonarqube-quality-gate.json}"
if [ ! -f "${sonar_report}" ]; then
if [ -n "${SONARQUBE_HOST_URL:-}" ] && [ -n "${SONARQUBE_PROJECT_KEY:-}" ]; then
host="${SONARQUBE_HOST_URL%/}"
query="$(printf '%s' "${SONARQUBE_PROJECT_KEY}" | sed 's/ /%20/g')"
sonar_ok=0
if [ -n "${SONARQUBE_TOKEN:-}" ]; then
auth="$(printf '%s:' "${SONARQUBE_TOKEN}" | base64 | tr -d '\\n')"
if curl -fsS -H "Authorization: Basic ${auth}" "${host}/api/qualitygates/project_status?projectKey=${query}" > "${sonar_report}"; then
sonar_ok=1
fi
else
if curl -fsS "${host}/api/qualitygates/project_status?projectKey=${query}" > "${sonar_report}"; then
sonar_ok=1
fi
fi
if [ "${sonar_ok}" -ne 1 ]; then
cat > "${sonar_report}" <<EOF
{
"status": "ERROR",
"error": "sonarqube query failed"
}
EOF
fi
else
cat > "${sonar_report}" <<EOF
{
"status": "ERROR",
"note": "missing SONARQUBE_HOST_URL and/or SONARQUBE_PROJECT_KEY"
}
EOF
fi
fi
ironbank_report="${QUALITY_GATE_IRONBANK_REPORT:-build/ironbank-compliance.json}"
if [ ! -f "${ironbank_report}" ]; then
status="${IRONBANK_COMPLIANCE_STATUS:-unknown}"
compliant="${IRONBANK_COMPLIANT:-}"
if [ -n "${compliant}" ]; then
compliant_lc="$(printf '%s' "${compliant}" | tr '[:upper:]' '[:lower:]')"
compliant_json="null"
case "${compliant_lc}" in
1|true|yes|on) compliant_json="true" ;;
0|false|no|off) compliant_json="false" ;;
esac
cat > "${ironbank_report}" <<EOF
{
"status": "${status}",
"compliant": ${compliant_json},
"note": "Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT or write build/ironbank-compliance.json in image-building repos."
}
EOF
else
cat > "${ironbank_report}" <<EOF
{
"status": "${status}",
"note": "Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT or write build/ironbank-compliance.json in image-building repos."
}
EOF
fi
fi
'''
}
}
}
stage('Validation tests') {
steps {
container('git') {
sh '''#!/usr/bin/env sh
set -eu
mkdir -p build
failures=0
cases=""
dockerfile_present_status="skipped"
pipeline_config_present_status="skipped"
logging_kustomization_includes_data_prepper_status="skipped"
add_case() {
name="$1"
message="$2"
status="passed"
if [ -n "${message}" ]; then
status="failed"
failures=$((failures + 1))
cases="${cases}"'<testcase classname="data_prepper.packaging" name="'"${name}"'"><failure message="'"${message}"'" /></testcase>'
else
cases="${cases}"'<testcase classname="data_prepper.packaging" name="'"${name}"'" />'
fi
case "${name}" in
dockerfile_present) dockerfile_present_status="${status}" ;;
pipeline_config_present) pipeline_config_present_status="${status}" ;;
logging_kustomization_includes_data_prepper) logging_kustomization_includes_data_prepper_status="${status}" ;;
esac
}
if [ -s dockerfiles/Dockerfile.data-prepper ]; then
add_case "dockerfile_present" ""
else
add_case "dockerfile_present" "dockerfiles/Dockerfile.data-prepper is missing or empty"
fi
if [ -s services/logging/scripts/data_prepper_pipelines.yaml ]; then
add_case "pipeline_config_present" ""
else
add_case "pipeline_config_present" "data_prepper_pipelines.yaml is missing or empty"
fi
kustomization_contents="$(cat services/logging/kustomization.yaml 2>/dev/null || true)"
case "${kustomization_contents}" in
*data-prepper-helmrelease.yaml*) add_case "logging_kustomization_includes_data_prepper" "" ;;
*) add_case "logging_kustomization_includes_data_prepper" "services/logging/kustomization.yaml does not include data-prepper HelmRelease" ;;
esac
cat > build/junit-data-prepper.xml <<EOF
<testsuite name="data_prepper.packaging" tests="3" failures="${failures}" errors="0" skipped="0">
${cases}
</testsuite>
EOF
passed=$((3 - failures))
cat > build/test-counts.env <<EOF
test_passed_count=${passed}
test_failed_count=${failures}
test_error_count=0
test_skipped_count=0
EOF
cat > build/testcase-status.env <<EOF
dockerfile_present_status=${dockerfile_present_status}
pipeline_config_present_status=${pipeline_config_present_status}
logging_kustomization_includes_data_prepper_status=${logging_kustomization_includes_data_prepper_status}
EOF
if [ "${failures}" -ne 0 ]; then
exit 1
fi
'''
}
}
}
stage('Enforce quality gate') {
steps {
container('git') {
sh '''
set -euo pipefail
apk add --no-cache jq >/dev/null 2>&1 || true
fail=0
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(jq -r '.status // .projectStatus.status // .qualityGate.status // empty' build/sonarqube-quality-gate.json 2>/dev/null | tr '[:upper:]' '[:lower:]')"
[ -n "${sonar_status}" ] || sonar_status="missing"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-1}"
compliant="$(jq -r '.compliant // empty' build/ironbank-compliance.json 2>/dev/null || true)"
supply_status=""
if [ "${compliant}" = "true" ]; then
supply_status="ok"
elif [ "${compliant}" = "false" ]; then
supply_status="failed"
else
supply_status="$(jq -r '.status // .result // .compliance // empty' build/ironbank-compliance.json 2>/dev/null | tr '[:upper:]' '[:lower:]')"
fi
[ -n "${supply_status}" ] || supply_status="missing"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
'''
}
}
}
stage('Build & Push') {
steps {
container('kaniko') {
withCredentials([usernamePassword(credentialsId: 'harbor-robot', usernameVariable: 'HARBOR_USERNAME', passwordVariable: 'HARBOR_PASSWORD')]) {
withCredentials([usernamePassword(credentialsId: 'harbor-robot-streaming', usernameVariable: 'HARBOR_USERNAME', passwordVariable: 'HARBOR_PASSWORD')]) {
sh '''
set -euo pipefail
if [ -z "${HARBOR_REPO:-}" ]; then
HARBOR_REPO="registry.bstein.dev/monitoring/data-prepper"
IMAGE_TAG="${IMAGE_TAG:-2.8.0}"
PUSH_LATEST="${PUSH_LATEST:-true}"
if [ -z "${HARBOR_REPO:-}" ] || [ "${HARBOR_REPO}" = "registry.bstein.dev/monitoring/data-prepper" ]; then
HARBOR_REPO="registry.bstein.dev/streaming/data-prepper"
fi
IMAGE_TAG_SAFE="${IMAGE_TAG:-2.8.0}"
mkdir -p /kaniko/.docker
ref_host="$(echo "${HARBOR_REPO}" | cut -d/ -f1)"
auth="$(printf "%s:%s" "${HARBOR_USERNAME}" "${HARBOR_PASSWORD}" | base64 | tr -d '\\n')"
@ -74,8 +363,8 @@ spec:
}
}
EOF
dest_args="--destination ${HARBOR_REPO}:${IMAGE_TAG_SAFE}"
if [ "${PUSH_LATEST:-true}" = "true" ]; then
dest_args="--destination ${HARBOR_REPO}:${IMAGE_TAG}"
if [ "${PUSH_LATEST}" = "true" ]; then
dest_args="${dest_args} --destination ${HARBOR_REPO}:latest"
fi
/kaniko/executor \
@ -88,32 +377,22 @@ EOF
}
}
}
stage('Smoke test suite') {
steps {
container('kaniko') {
sh '''
set -euo pipefail
/kaniko/executor \
--context "${WORKSPACE}" \
--dockerfile "${WORKSPACE}/dockerfiles/Dockerfile.data-prepper" \
--verbosity info \
--no-push
'''
}
}
}
}
post {
success {
always {
script {
env.QUALITY_OUTCOME = currentBuild.currentResult == 'SUCCESS' ? 'ok' : 'failed'
}
container('git') {
sh '''
set -euo pipefail
apk add --no-cache curl >/dev/null 2>&1 || true
apk add --no-cache curl jq >/dev/null 2>&1 || true
suite="${SUITE_NAME}"
gateway="${PUSHGATEWAY_URL}"
status="${QUALITY_OUTCOME:-failed}"
fetch_counter() {
status="$1"
line="$(curl -fsS "${gateway}/metrics" 2>/dev/null | awk -v suite="${suite}" -v status="${status}" '
status_name="$1"
line="$(curl -fsS "${gateway}/metrics" 2>/dev/null | awk -v suite="${suite}" -v status="${status_name}" '
/platform_quality_gate_runs_total/ {
if (index($0, "job=\\"platform-quality-ci\\"") && index($0, "suite=\\"" suite "\\"") && index($0, "status=\\"" status "\\"")) {
print $2
@ -125,54 +404,130 @@ EOF
}
ok_count="$(fetch_counter ok)"
failed_count="$(fetch_counter failed)"
ok_count=$((ok_count + 1))
tests_passed=1
tests_failed=0
cat <<METRICS | curl -fsS --data-binary @- "${gateway}/metrics/job/platform-quality-ci/suite/${suite}" >/dev/null
if [ "${status}" = "ok" ]; then
ok_count=$((ok_count + 1))
else
failed_count=$((failed_count + 1))
fi
sonarqube_check="not_applicable"
if [ -f build/sonarqube-quality-gate.json ]; then
sonar_status="$(jq -r '.status // .projectStatus.status // .qualityGate.status // empty' build/sonarqube-quality-gate.json 2>/dev/null | tr '[:upper:]' '[:lower:]')"
if [ -n "${sonar_status}" ]; then
case "${sonar_status}" in
ok|pass|passed|success) sonarqube_check="ok" ;;
*) sonarqube_check="failed" ;;
esac
else
sonarqube_check="failed"
fi
fi
supply_chain_check="not_applicable"
if [ -f build/ironbank-compliance.json ]; then
compliant="$(jq -r '.compliant // empty' build/ironbank-compliance.json 2>/dev/null)"
if [ "${compliant}" = "true" ]; then
supply_chain_check="ok"
elif [ "${compliant}" = "false" ]; then
supply_chain_check="failed"
else
ironbank_status="$(jq -r '.status // .result // .compliance // empty' build/ironbank-compliance.json 2>/dev/null | tr '[:upper:]' '[:lower:]')"
case "${ironbank_status}" in
ok|pass|passed|success|compliant) supply_chain_check="ok" ;;
"") supply_chain_check="failed" ;;
*) supply_chain_check="failed" ;;
esac
fi
fi
gate_glue_check="ok"
if [ "${status}" != "ok" ]; then
gate_glue_check="failed"
fi
metric_branch_raw="${BRANCH_NAME:-${GIT_BRANCH:-unknown}}"
metric_branch_raw="${metric_branch_raw#origin/}"
metric_branch="$(printf '%s' "${metric_branch_raw}" | jq -Rsa . | sed -e 's/^"//' -e 's/"$//')"
metric_build_number="$(printf '%s' "${BUILD_NUMBER:-unknown}" | jq -Rsa . | sed -e 's/^"//' -e 's/"$//')"
metric_jenkins_job="$(printf '%s' "${JOB_NAME:-data-prepper}" | jq -Rsa . | sed -e 's/^"//' -e 's/"$//')"
export METRIC_SUITE="${suite}"
export METRIC_BRANCH_RAW="${metric_branch_raw}"
export METRIC_BUILD_NUMBER_RAW="${BUILD_NUMBER:-unknown}"
export METRIC_JENKINS_JOB_RAW="${JOB_NAME:-data-prepper}"
if [ ! -s build/test-counts.env ] || [ ! -s build/testcase-status.env ]; then
cat > build/test-counts.env <<EOF
test_passed_count=0
test_failed_count=0
test_error_count=0
test_skipped_count=1
EOF
cat > build/testcase-status.env <<EOF
dockerfile_present_status=skipped
pipeline_config_present_status=skipped
logging_kustomization_includes_data_prepper_status=skipped
EOF
fi
. build/testcase-status.env
if [ "${dockerfile_present_status}" = "skipped" ] && [ "${pipeline_config_present_status}" = "skipped" ] && [ "${logging_kustomization_includes_data_prepper_status}" = "skipped" ]; then
cat > build/testcase-metrics.prom <<METRICS
platform_quality_gate_test_case_result{suite="${suite}",branch="${metric_branch}",build_number="${metric_build_number}",jenkins_job="${metric_jenkins_job}",test="__no_test_cases__",status="skipped"} 1
METRICS
else
cat > build/testcase-metrics.prom <<METRICS
platform_quality_gate_test_case_result{suite="${suite}",branch="${metric_branch}",build_number="${metric_build_number}",jenkins_job="${metric_jenkins_job}",test="data_prepper.packaging::dockerfile_present",status="${dockerfile_present_status}"} 1
platform_quality_gate_test_case_result{suite="${suite}",branch="${metric_branch}",build_number="${metric_build_number}",jenkins_job="${metric_jenkins_job}",test="data_prepper.packaging::pipeline_config_present",status="${pipeline_config_present_status}"} 1
platform_quality_gate_test_case_result{suite="${suite}",branch="${metric_branch}",build_number="${metric_build_number}",jenkins_job="${metric_jenkins_job}",test="data_prepper.packaging::logging_kustomization_includes_data_prepper",status="${logging_kustomization_includes_data_prepper_status}"} 1
METRICS
fi
. build/test-counts.env
tests_check="ok"
if [ "$((test_failed_count + test_error_count))" -gt 0 ]; then
tests_check="failed"
fi
cat > build/platform-quality-metrics.prom <<METRICS
# TYPE platform_quality_gate_runs_total counter
platform_quality_gate_runs_total{suite="${suite}",status="ok"} ${ok_count}
platform_quality_gate_runs_total{suite="${suite}",status="failed"} ${failed_count}
# TYPE data_prepper_quality_gate_tests_total gauge
data_prepper_quality_gate_tests_total{suite="${suite}",result="passed"} ${tests_passed}
data_prepper_quality_gate_tests_total{suite="${suite}",result="failed"} ${tests_failed}
data_prepper_quality_gate_tests_total{suite="${suite}",result="passed"} ${test_passed_count}
data_prepper_quality_gate_tests_total{suite="${suite}",result="failed"} ${test_failed_count}
data_prepper_quality_gate_tests_total{suite="${suite}",result="error"} ${test_error_count}
data_prepper_quality_gate_tests_total{suite="${suite}",result="skipped"} ${test_skipped_count}
# TYPE platform_quality_gate_workspace_line_coverage_percent gauge
# No coverable project source is present in this packaging suite; report full
# non-applicable coverage so rollups do not confuse N/A with uncovered code.
platform_quality_gate_workspace_line_coverage_percent{suite="${suite}"} 100
# TYPE platform_quality_gate_source_lines_over_500_total gauge
platform_quality_gate_source_lines_over_500_total{suite="${suite}"} 0
# TYPE platform_quality_gate_build_info gauge
platform_quality_gate_build_info{suite="${suite}",branch="${metric_branch}",build_number="${metric_build_number}",jenkins_job="${metric_jenkins_job}"} 1
# TYPE data_prepper_quality_gate_checks_total gauge
data_prepper_quality_gate_checks_total{suite="${suite}",check="tests",result="${tests_check}"} 1
data_prepper_quality_gate_checks_total{suite="${suite}",check="coverage",result="not_applicable"} 1
data_prepper_quality_gate_checks_total{suite="${suite}",check="loc",result="not_applicable"} 1
data_prepper_quality_gate_checks_total{suite="${suite}",check="docs_naming",result="not_applicable"} 1
data_prepper_quality_gate_checks_total{suite="${suite}",check="gate_glue",result="${gate_glue_check}"} 1
data_prepper_quality_gate_checks_total{suite="${suite}",check="sonarqube",result="${sonarqube_check}"} 1
data_prepper_quality_gate_checks_total{suite="${suite}",check="supply_chain",result="${supply_chain_check}"} 1
# TYPE platform_quality_gate_test_case_result gauge
METRICS
cat build/testcase-metrics.prom >> build/platform-quality-metrics.prom
push_status="$(
curl -sS -o build/pushgateway-response.txt -w '%{http_code}' -X PUT \
--data-binary @build/platform-quality-metrics.prom \
"${gateway}/metrics/job/platform-quality-ci/suite/${suite}" || true
)"
case "${push_status}" in
200|202) ;;
*)
echo "warning: metrics push failed for suite=${suite} status=${push_status}" >&2
cat build/pushgateway-response.txt >&2 || true
;;
esac
'''
}
}
failure {
container('git') {
sh '''
set -euo pipefail
apk add --no-cache curl >/dev/null 2>&1 || true
suite="${SUITE_NAME}"
gateway="${PUSHGATEWAY_URL}"
fetch_counter() {
status="$1"
line="$(curl -fsS "${gateway}/metrics" 2>/dev/null | awk -v suite="${suite}" -v status="${status}" '
/platform_quality_gate_runs_total/ {
if (index($0, "job=\\"platform-quality-ci\\"") && index($0, "suite=\\"" suite "\\"") && index($0, "status=\\"" status "\\"")) {
print $2
exit
}
}
' || true)"
[ -n "${line}" ] && printf '%s\n' "${line}" || printf '0\n'
}
ok_count="$(fetch_counter ok)"
failed_count="$(fetch_counter failed)"
failed_count=$((failed_count + 1))
tests_passed=0
tests_failed=1
cat <<METRICS | curl -fsS --data-binary @- "${gateway}/metrics/job/platform-quality-ci/suite/${suite}" >/dev/null
# TYPE platform_quality_gate_runs_total counter
platform_quality_gate_runs_total{suite="${suite}",status="ok"} ${ok_count}
platform_quality_gate_runs_total{suite="${suite}",status="failed"} ${failed_count}
# TYPE data_prepper_quality_gate_tests_total gauge
data_prepper_quality_gate_tests_total{suite="${suite}",result="passed"} ${tests_passed}
data_prepper_quality_gate_tests_total{suite="${suite}",result="failed"} ${tests_failed}
METRICS
'''
script {
if (fileExists('build/junit-data-prepper.xml')) {
echo 'JUnit XML generated and archived under build/; Jenkins junit step is not installed on this controller.'
}
}
archiveArtifacts artifacts: 'build/**', allowEmptyArchive: true, fingerprint: true
}
}
}

View File

@ -44,8 +44,7 @@ spec:
path: /var/log/journal
- name: fluentbit-state
emptyDir:
medium: Memory
sizeLimit: 64Mi
sizeLimit: 1Gi
extraVolumeMounts:
- name: runlogjournal
mountPath: /run/log/journal

View File

@ -12,6 +12,8 @@ spec:
type: RollingUpdate
template:
metadata:
annotations:
logging.bstein.dev/node-log-rotation-rev: "2026-04-27-3"
labels:
app: node-log-rotation
spec:

View File

@ -99,4 +99,24 @@ if [ "${changed}" -eq 1 ]; then
fi
fi
sleep infinity
trim_constrained_pod_logs() {
local base usage
for base in /host/mnt/astraios/var/log /host/var/log.hdd; do
if [ ! -d "${base}/pods" ]; then
continue
fi
usage="$(df -P "${base}" | awk 'NR==2 {gsub(/%/, "", $5); print $5}')"
if [ -z "${usage}" ] || [ "${usage}" -lt 75 ]; then
continue
fi
find "${base}/pods" -type f \( -name '[1-9]*.log' -o -name '*.log.20*' \) -size +1M -print -exec truncate -s 0 {} \; 2>/dev/null || true
if [ -d "${base}/containers" ]; then
find "${base}/containers" -xtype l -print -delete 2>/dev/null || true
fi
done
}
while true; do
trim_constrained_pod_logs
sleep 600
done

View File

@ -764,6 +764,15 @@ spec:
spec:
template:
spec:
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13", "titan-15", "titan-17", "titan-19"]
containers:
- name: tika
env:

View File

@ -1,3 +1,5 @@
"""HTTP debounce wrapper for triggering the Mailu Keycloak sync job."""
import http.server
import json
import os

View File

@ -18,13 +18,15 @@ spec:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
maintenance.bstein.dev/restart-rev: "20260207-2"
maintenance.bstein.dev/restart-rev: "20260413-jenkins-api-2"
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "maintenance"
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
vault.hashicorp.com/agent-inject-template-ariadne-env.sh: |
{{ with secret "kv/data/atlas/maintenance/ariadne-db" }}
export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}"
export JENKINS_API_USER="{{ .Data.data.jenkins_api_user }}"
export JENKINS_API_TOKEN="{{ .Data.data.jenkins_api_token }}"
{{ end }}
{{ with secret "kv/data/atlas/portal/atlas-portal-db" }}
export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}"
@ -104,6 +106,36 @@ spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
containers:
- name: ariadne
image: registry.bstein.dev/bstein/ariadne:latest
@ -345,6 +377,12 @@ spec:
value: "15"
- name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH
value: "*/30 * * * *"
- name: ARIADNE_SCHEDULE_JENKINS_BUILD_WEATHER
value: "*/10 * * * *"
- name: JENKINS_BASE_URL
value: https://ci.bstein.dev
- name: JENKINS_API_TIMEOUT_SEC
value: "10"
- name: ARIADNE_SCHEDULE_JENKINS_WORKSPACE_CLEANUP
value: "45 */6 * * *"
- name: JENKINS_WORKSPACE_NAMESPACE
@ -352,7 +390,11 @@ spec:
- name: JENKINS_WORKSPACE_PVC_PREFIX
value: pvc-workspace-
- name: JENKINS_WORKSPACE_CLEANUP_MIN_AGE_HOURS
value: "12"
value: "24"
- name: JENKINS_WORKSPACE_CLEANUP_DRY_RUN
value: "false"
- name: JENKINS_WORKSPACE_CLEANUP_MAX_DELETIONS_PER_RUN
value: "20"
- name: METRICS_PATH
value: "/metrics"
resources:

View File

@ -36,11 +36,29 @@ spec:
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: metis
name: metis-amd64
namespace: maintenance
spec:
imageRepositoryRef:
name: metis
filterTags:
pattern: '^(?P<version>0\.1\.0-\d+)-amd64$'
extract: '$version'
policy:
semver:
range: ">=0.1.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: metis-arm64
namespace: maintenance
spec:
imageRepositoryRef:
name: metis
filterTags:
pattern: '^(?P<version>0\.1\.0-\d+)-arm64$'
extract: '$version'
policy:
semver:
range: ">=0.1.0-0"
@ -59,11 +77,29 @@ spec:
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: metis-sentinel
name: metis-sentinel-amd64
namespace: maintenance
spec:
imageRepositoryRef:
name: metis-sentinel
filterTags:
pattern: '^(?P<version>0\.1\.0-\d+)-amd64$'
extract: '$version'
policy:
semver:
range: ">=0.1.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: metis-sentinel-arm64
namespace: maintenance
spec:
imageRepositoryRef:
name: metis-sentinel
filterTags:
pattern: '^(?P<version>0\.1\.0-\d+)-arm64$'
extract: '$version'
policy:
semver:
range: ">=0.1.0-0"

View File

@ -7,10 +7,13 @@ resources:
- secretproviderclass.yaml
- metis-configmap.yaml
- metis-data-pvc.yaml
- soteria-configmap.yaml
- vault-serviceaccount.yaml
- vault-sync-deployment.yaml
- ariadne-serviceaccount.yaml
- soteria-serviceaccount.yaml
- ariadne-rbac.yaml
- soteria-rbac.yaml
- disable-k3s-traefik-serviceaccount.yaml
- disable-k3s-traefik-rbac.yaml
- k3s-traefik-cleanup-rbac.yaml
@ -21,19 +24,19 @@ resources:
- pod-cleaner-rbac.yaml
- ariadne-deployment.yaml
- metis-deployment.yaml
- soteria-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml
- oneoffs/titan-24-rootfs-sweep-job.yaml
- ariadne-service.yaml
- soteria-service.yaml
- disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml
- metis-sentinel-amd64-daemonset.yaml
- metis-sentinel-arm64-daemonset.yaml
- metis-k3s-token-sync-cronjob.yaml
- k3s-agent-restart-daemonset.yaml
- pod-cleaner-cronjob.yaml
- node-image-sweeper-serviceaccount.yaml
- node-image-sweeper-daemonset.yaml
- image-sweeper-cronjob.yaml
- metis-service.yaml
- soteria-networkpolicy.yaml
- oauth2-proxy-soteria-networkpolicy.yaml
@ -45,12 +48,18 @@ resources:
- metis-ingress.yaml
images:
- name: registry.bstein.dev/bstein/ariadne
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
newTag: 0.1.0-188 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/metis
newTag: 0.1.0-9-amd64
newTag: 0.1.0-103-arm64 # {"$imagepolicy": "maintenance:metis-arm64:tag"}
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:soteria:tag"}
newTag: 0.1.0-36 # {"$imagepolicy": "maintenance:soteria:tag"}
configMapGenerator:
- name: metis-inventory
namespace: maintenance
files:
- inventory.yaml=metis-inventory.yaml
options:
disableNameSuffixHash: true
- name: disable-k3s-traefik-script
namespace: maintenance
files:
@ -69,12 +78,6 @@ configMapGenerator:
- node_nofile.sh=scripts/node_nofile.sh
options:
disableNameSuffixHash: true
- name: pod-cleaner-script
namespace: maintenance
files:
- pod_cleaner.sh=scripts/pod_cleaner.sh
options:
disableNameSuffixHash: true
- name: node-image-sweeper-script
namespace: maintenance
files:

View File

@ -2,12 +2,12 @@
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: sentinel-tls
name: recovery-tls
namespace: maintenance
spec:
secretName: sentinel-tls
secretName: recovery-tls
issuerRef:
kind: ClusterIssuer
name: letsencrypt
dnsNames:
- sentinel.bstein.dev
- recovery.bstein.dev

View File

@ -8,19 +8,21 @@ data:
METIS_BIND_ADDR: :8080
METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml
METIS_DATA_DIR: /var/lib/metis
METIS_DEFAULT_FLASH_HOST: titan-22
METIS_FLASH_HOSTS: titan-22,titan-24,titan-20,titan-21,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a
METIS_LOCAL_HOST: titan-22
METIS_DEFAULT_FLASH_HOST: titan-20
METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a
METIS_LOCAL_HOST: titan-20
METIS_ALLOWED_GROUPS: admin,maintenance
METIS_MAX_DEVICE_BYTES: "1000000000000"
METIS_NAMESPACE: maintenance
METIS_RUNNER_IMAGE_AMD64: registry.bstein.dev/bstein/metis:0.1.0-23-amd64
METIS_RUNNER_IMAGE_ARM64: registry.bstein.dev/bstein/metis:0.1.0-23-arm64
METIS_REMOTE_POD_TIMEOUT_SEC: "14400"
METIS_RUNNER_IMAGE_AMD64: registry.bstein.dev/bstein/metis:0.1.0-103-amd64 # {"$imagepolicy": "maintenance:metis-amd64"}
METIS_RUNNER_IMAGE_ARM64: registry.bstein.dev/bstein/metis:0.1.0-103-arm64 # {"$imagepolicy": "maintenance:metis-arm64"}
METIS_HARBOR_REGISTRY: registry.bstein.dev
METIS_HARBOR_PROJECT: metis
METIS_HARBOR_API_BASE: https://registry.bstein.dev/api/v2.0
METIS_HARBOR_USERNAME: admin
METIS_HOST_TMP_DIR: /tmp/metis-flash-test
METIS_HOST_TMP_DIR: /var/tmp/metis-flash-test
METIS_REMOTE_WORKSPACE_DIR: /var/tmp/metis-workspace
METIS_SENTINEL_PUSH_URL: http://metis.maintenance.svc.cluster.local/internal/sentinel/snapshot
METIS_SENTINEL_INTERVAL_SEC: "1800"
METIS_SENTINEL_NSENTER: "1"

View File

@ -2,7 +2,7 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: metis-data
name: metis-data-longhorn
namespace: maintenance
spec:
accessModes:
@ -10,4 +10,4 @@ spec:
resources:
requests:
storage: 40Gi
storageClassName: local-path
storageClassName: longhorn

View File

@ -18,7 +18,7 @@ spec:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
metis.bstein.dev/config-rev: "2026-04-06-02"
metis.bstein.dev/config-rev: "2026-04-24-01"
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-pre-populate-only: "true"
vault.hashicorp.com/role: "maintenance"
@ -27,9 +27,15 @@ spec:
{{ with secret "kv/data/atlas/maintenance/metis-runtime" }}
export METIS_K3S_TOKEN="{{ .Data.data.k3s_token }}"
{{ end }}
vault.hashicorp.com/agent-inject-secret-metis-harbor-env.sh: "kv/data/atlas/harbor/harbor-core"
vault.hashicorp.com/agent-inject-template-metis-harbor-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
export METIS_HARBOR_PASSWORD="{{ .Data.data.harbor_admin_password }}"
{{ end }}
vault.hashicorp.com/agent-inject-secret-metis-ssh-env.sh: "kv/data/atlas/maintenance/metis-ssh-keys"
vault.hashicorp.com/agent-inject-template-metis-ssh-env.sh: |
{{ with secret "kv/data/atlas/maintenance/metis-ssh-keys" }}
export METIS_SSH_KEY_BASTION="{{ or .Data.data.bastion_pub .Data.data.brad_pub "" }}"
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
export METIS_SSH_KEY_ANANKE_TETHYS="{{ or .Data.data.ananke_tethys_pub .Data.data.hecate_tethys_pub "" }}"
export METIS_SSH_KEY_ANANKE_DB="{{ or .Data.data.ananke_db_pub .Data.data.hecate_db_pub "" }}"
@ -37,10 +43,31 @@ spec:
spec:
serviceAccountName: metis
terminationGracePeriodSeconds: 30
nodeSelector:
kubernetes.io/hostname: titan-22
kubernetes.io/arch: amd64
node-role.kubernetes.io/accelerator: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values:
- arm64
- key: longhorn-host
operator: In
values:
- "true"
- key: node-role.kubernetes.io/worker
operator: In
values:
- "true"
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
containers:
- name: metis
image: registry.bstein.dev/bstein/metis:latest
@ -49,6 +76,7 @@ spec:
args:
- >-
. /vault/secrets/metis-runtime-env.sh
&& . /vault/secrets/metis-harbor-env.sh
&& . /vault/secrets/metis-ssh-env.sh
&& exec metis serve
envFrom:
@ -72,6 +100,9 @@ spec:
periodSeconds: 5
timeoutSeconds: 2
volumeMounts:
- name: metis-inventory
mountPath: /etc/metis
readOnly: true
- name: metis-data
mountPath: /var/lib/metis
- name: host-dev
@ -93,9 +124,13 @@ spec:
privileged: true
runAsUser: 0
volumes:
- name: metis-inventory
configMap:
name: metis-inventory
defaultMode: 0444
- name: metis-data
persistentVolumeClaim:
claimName: metis-data
claimName: metis-data-longhorn
- name: host-dev
hostPath:
path: /dev

View File

@ -12,10 +12,10 @@ metadata:
spec:
ingressClassName: traefik
tls:
- hosts: ["sentinel.bstein.dev"]
secretName: sentinel-tls
- hosts: ["recovery.bstein.dev"]
secretName: recovery-tls
rules:
- host: sentinel.bstein.dev
- host: recovery.bstein.dev
http:
paths:
- path: /

View File

@ -0,0 +1,150 @@
# services/maintenance/metis-inventory.yaml
#
# Metis provisioning inventory. `classes` define hardware profiles (flash
# image, k3s version, default node labels); `nodes` are the physical hosts
# that reference a class. All ${...} placeholders are substituted at runtime
# from the Metis environment (configmap values and vault-injected secrets).
classes:
  # Raspberry Pi 5 worker on Ubuntu 24.04.
  - name: rpi5-ubuntu-worker
    arch: arm64
    os: ubuntu-24.04
    image: ${METIS_IMAGE_RPI5_UBUNTU_WORKER}
    checksum: ${METIS_IMAGE_RPI5_UBUNTU_WORKER_SHA256}
    k3s_version: v1.33.3+k3s1
    default_labels:
      hardware: rpi5
      node-role.kubernetes.io/worker: "true"
  # Raspberry Pi 4 worker on Armbian noble.
  # NOTE(review): this class reuses the *_LONGHORN image/checksum variables —
  # presumably worker and longhorn nodes flash the same base image; confirm
  # this is intentional and not a copy-paste of the class below.
  - name: rpi4-armbian-worker
    arch: arm64
    os: armbian-noble
    image: ${METIS_IMAGE_RPI4_ARMBIAN_LONGHORN}
    checksum: ${METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256}
    k3s_version: v1.31.5+k3s1
    default_labels:
      hardware: rpi4
      node-role.kubernetes.io/worker: "true"
  # Raspberry Pi 4 Longhorn storage node on Armbian noble.
  - name: rpi4-armbian-longhorn
    arch: arm64
    os: armbian-noble
    image: ${METIS_IMAGE_RPI4_ARMBIAN_LONGHORN}
    checksum: ${METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256}
    k3s_version: v1.31.5+k3s1
    default_labels:
      hardware: rpi4
      node-role.kubernetes.io/worker: "true"
nodes:
  - name: titan-10
    class: rpi5-ubuntu-worker
    hostname: titan-10
    ip: 192.168.22.36
    k3s_role: agent
    k3s_url: https://192.168.22.7:6443
    k3s_token: ${METIS_K3S_TOKEN}
    ssh_user: ubuntu
    ssh_authorized_keys:
      - ${METIS_SSH_KEY_BRAD}
      - ${METIS_SSH_KEY_ANANKE_TETHYS}
      - ${METIS_SSH_KEY_ANANKE_DB}
  - name: titan-12
    class: rpi4-armbian-worker
    hostname: titan-12
    ip: 192.168.22.40
    k3s_role: agent
    k3s_url: https://192.168.22.7:6443
    k3s_token: ${METIS_K3S_TOKEN}
    ssh_user: atlas
    ssh_authorized_keys:
      - ${METIS_SSH_KEY_BRAD}
      - ${METIS_SSH_KEY_ANANKE_TETHYS}
      - ${METIS_SSH_KEY_ANANKE_DB}
  - name: titan-16
    class: rpi4-armbian-worker
    hostname: titan-16
    ip: 192.168.22.44
    k3s_role: agent
    k3s_url: https://192.168.22.7:6443
    k3s_token: ${METIS_K3S_TOKEN}
    ssh_user: atlas
    ssh_authorized_keys:
      - ${METIS_SSH_KEY_BRAD}
      - ${METIS_SSH_KEY_ANANKE_TETHYS}
      - ${METIS_SSH_KEY_ANANKE_DB}
    # USB scratch disk: heavy-write paths are bind-mounted off the SD card.
    usb_scratch:
      mountpoint: /mnt/scratch
      label: titan-16-scratch
      fs: ext4
      bind_targets:
        - /var/lib/rancher
        - /var/log
  - name: titan-13
    class: rpi4-armbian-longhorn
    hostname: titan-13
    ip: 192.168.22.41
    k3s_role: agent
    k3s_url: https://192.168.22.7:6443
    k3s_token: ${METIS_K3S_TOKEN}
    ssh_user: atlas
    ssh_authorized_keys:
      - ${METIS_SSH_KEY_BRAD}
      - ${METIS_SSH_KEY_ANANKE_TETHYS}
      - ${METIS_SSH_KEY_ANANKE_DB}
    # Longhorn data disks, identified by filesystem UUID so device-name
    # reordering across reboots cannot swap the mounts.
    longhorn_disks:
      - mountpoint: /mnt/astreae
        uuid: 6031fa8b-f28c-45c3-b7bc-6133300e07c6
        fs: ext4
      - mountpoint: /mnt/asteria
        uuid: cbd4989d-62b5-4741-8b2a-28fdae259cae
        fs: ext4
  - name: titan-15
    class: rpi4-armbian-longhorn
    hostname: titan-15
    ip: 192.168.22.43
    k3s_role: agent
    k3s_url: https://192.168.22.7:6443
    k3s_token: ${METIS_K3S_TOKEN}
    ssh_user: atlas
    ssh_authorized_keys:
      - ${METIS_SSH_KEY_BRAD}
      - ${METIS_SSH_KEY_ANANKE_TETHYS}
      - ${METIS_SSH_KEY_ANANKE_DB}
    longhorn_disks:
      - mountpoint: /mnt/astreae
        uuid: f3362f14-5822-449f-944b-ac570b5cd615
        fs: ext4
      - mountpoint: /mnt/asteria
        uuid: 9c5316e6-f847-4884-b502-11f2d0d15d6f
        fs: ext4
  - name: titan-17
    class: rpi4-armbian-longhorn
    hostname: titan-17
    ip: 192.168.22.45
    k3s_role: agent
    k3s_url: https://192.168.22.7:6443
    k3s_token: ${METIS_K3S_TOKEN}
    ssh_user: atlas
    ssh_authorized_keys:
      - ${METIS_SSH_KEY_BRAD}
      - ${METIS_SSH_KEY_ANANKE_TETHYS}
      - ${METIS_SSH_KEY_ANANKE_DB}
    longhorn_disks:
      - mountpoint: /mnt/astreae
        uuid: 1fecdade-08b0-49cb-9ae3-be6c188b0a96
        fs: ext4
      - mountpoint: /mnt/asteria
        uuid: 2fe9f613-d372-47ca-b84f-82084e4edda0
        fs: ext4
  - name: titan-19
    class: rpi4-armbian-longhorn
    hostname: titan-19
    ip: 192.168.22.47
    k3s_role: agent
    k3s_url: https://192.168.22.7:6443
    k3s_token: ${METIS_K3S_TOKEN}
    ssh_user: atlas
    ssh_authorized_keys:
      - ${METIS_SSH_KEY_BRAD}
      - ${METIS_SSH_KEY_ANANKE_TETHYS}
      - ${METIS_SSH_KEY_ANANKE_DB}
    longhorn_disks:
      - mountpoint: /mnt/astreae
        uuid: 4890abb9-dda2-4f4f-9c0f-081ee82849cf
        fs: ext4
      - mountpoint: /mnt/asteria
        uuid: 2b4ea28d-b0e6-4fa3-841b-cd7067ae9153
        fs: ext4

View File

@ -12,6 +12,7 @@ rules:
- list
- watch
- delete
- patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role

View File

@ -10,6 +10,8 @@ spec:
app: metis-sentinel-amd64
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 25%
template:
metadata:
labels:
@ -29,7 +31,7 @@ spec:
kubernetes.io/arch: amd64
containers:
- name: metis-sentinel
image: registry.bstein.dev/bstein/metis-sentinel:0.1.0-0-amd64
image: registry.bstein.dev/bstein/metis-sentinel:0.1.0-103-amd64 # {"$imagepolicy": "maintenance:metis-sentinel-amd64"}
imagePullPolicy: Always
envFrom:
- configMapRef:

View File

@ -10,6 +10,8 @@ spec:
app: metis-sentinel-arm64
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 25%
template:
metadata:
labels:
@ -29,7 +31,7 @@ spec:
kubernetes.io/arch: arm64
containers:
- name: metis-sentinel
image: registry.bstein.dev/bstein/metis-sentinel:0.1.0-0-arm64
image: registry.bstein.dev/bstein/metis-sentinel:0.1.0-103-arm64 # {"$imagepolicy": "maintenance:metis-sentinel-arm64"}
imagePullPolicy: Always
envFrom:
- configMapRef:

View File

@ -6,7 +6,7 @@ metadata:
namespace: maintenance
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "80"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP

View File

@ -74,7 +74,7 @@ spec:
args:
- --provider=oidc
- --config=/vault/secrets/oidc-config
- --redirect-url=https://sentinel.bstein.dev/oauth2/callback
- --redirect-url=https://recovery.bstein.dev/oauth2/callback
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email groups
- --email-domain=*
@ -96,7 +96,7 @@ spec:
- --approval-prompt=auto
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=sentinel.bstein.dev
- --cookie-domain=recovery.bstein.dev
ports:
- containerPort: 4180
name: http

View File

@ -0,0 +1,70 @@
# services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml
# One-off emergency cleanup for titan-24 rootfs pressure.
# Safe to delete the finished Job/pod after it succeeds.
apiVersion: batch/v1
kind: Job
metadata:
  name: titan-24-rootfs-sweep
  namespace: maintenance
  annotations:
    # Force Flux to recreate the Job even though Job specs are immutable.
    kustomize.toolkit.fluxcd.io/force: "true"
spec:
  backoffLimit: 6
  # Garbage-collect the finished Job an hour after completion.
  ttlSecondsAfterFinished: 3600
  template:
    metadata:
      labels:
        app: titan-24-rootfs-sweep
    spec:
      restartPolicy: OnFailure
      # Pin to the pressured node; the sweeper operates on its host rootfs.
      nodeSelector:
        kubernetes.io/hostname: titan-24
      # Let the pod schedule (and briefly survive) on a node that is flapping
      # between Ready/NotReady because its disk is full.
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoSchedule
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoSchedule
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      containers:
        - name: sweep
          image: python:3.12.9-alpine3.20
          command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
          env:
            - name: ONE_SHOT
              value: "true"
            # Zero thresholds force an unconditional sweep regardless of
            # current disk usage — this is the emergency path.
            - name: HIGH_USAGE_PERCENT
              value: "0"
            - name: EMERGENCY_USAGE_PERCENT
              value: "0"
            - name: LOG_RETENTION_DAYS
              value: "1"
            - name: ORPHAN_POD_RETENTION_DAYS
              value: "0"
            - name: JOURNAL_MAX_SIZE
              value: "100M"
          securityContext:
            # Privileged root is required to modify the host filesystem.
            privileged: true
            runAsUser: 0
          volumeMounts:
            - name: host-root
              mountPath: /host
            - name: script
              mountPath: /scripts
              readOnly: true
      volumes:
        - name: host-root
          hostPath:
            path: /
        - name: script
          configMap:
            name: node-image-sweeper-script
            # Octal 0555 (r-xr-xr-x) so the mounted script is executable.
            defaultMode: 0555

View File

@ -51,6 +51,48 @@ for name in hdd_names:
PY
}
# Remove pod log directories under the host rootfs (/var/log/pods) whose pod
# no longer exists in the kubelet state dir and whose mtime is older than
# ORPHAN_POD_RETENTION_DAYS (default 3). Best-effort: listing or stat errors
# on either side are swallowed and the entry is skipped.
cleanup_orphaned_root_pod_logs() {
  # Nothing to do if either tree is absent on this node.
  if [ ! -d /host/var/log/pods ] || [ ! -d /host/var/lib/kubelet/pods ]; then
    return 0
  fi
  # Re-state the var on the command line so python3 sees it even if the
  # caller never exported it.
  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
import os
import shutil
import time

root_pods = "/host/var/log/pods"
active_pods = "/host/var/lib/kubelet/pods"
retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
cutoff = time.time() - (retention_days * 86400)

# NOTE(review): entries under /var/log/pods are typically named
# "<namespace>_<podname>_<uid>" while kubelet state dirs are bare UIDs, so
# this membership check may never match and the mtime cutoff alone would
# gate deletion — confirm the naming on these nodes.
try:
    active_names = set(os.listdir(active_pods))
except Exception:
    active_names = set()
try:
    root_names = os.listdir(root_pods)
except Exception:
    root_names = []

for name in root_names:
    path = os.path.join(root_pods, name)
    if not os.path.isdir(path):
        continue
    if name in active_names:
        continue
    try:
        mtime = os.path.getmtime(path)
    except Exception:
        continue
    if mtime > cutoff:
        continue
    print(path)
    shutil.rmtree(path, ignore_errors=True)
PY
}
sweep_once() {
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
@ -61,6 +103,7 @@ sweep_once() {
fi
cleanup_orphaned_hdd_pod_logs
cleanup_orphaned_root_pod_logs
if [ -d /host/var/log.hdd/pods ]; then
find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true

View File

@ -13,9 +13,32 @@ spec:
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
- objectName: "soteria-restic__AWS_ACCESS_KEY_ID"
secretPath: "kv/data/atlas/shared/soteria-restic"
secretKey: "AWS_ACCESS_KEY_ID"
- objectName: "soteria-restic__AWS_SECRET_ACCESS_KEY"
secretPath: "kv/data/atlas/shared/soteria-restic"
secretKey: "AWS_SECRET_ACCESS_KEY"
- objectName: "soteria-restic__RESTIC_PASSWORD"
secretPath: "kv/data/atlas/shared/soteria-restic"
secretKey: "RESTIC_PASSWORD"
- objectName: "soteria-restic__AWS_ENDPOINTS"
secretPath: "kv/data/atlas/shared/soteria-restic"
secretKey: "AWS_ENDPOINTS"
secretObjects:
- secretName: harbor-regcred
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson
- secretName: soteria-restic
type: Opaque
data:
- objectName: soteria-restic__AWS_ACCESS_KEY_ID
key: AWS_ACCESS_KEY_ID
- objectName: soteria-restic__AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
- objectName: soteria-restic__RESTIC_PASSWORD
key: RESTIC_PASSWORD
- objectName: soteria-restic__AWS_ENDPOINTS
key: AWS_ENDPOINTS

View File

@ -5,10 +5,26 @@ metadata:
name: soteria
namespace: maintenance
data:
SOTERIA_BACKUP_DRIVER: longhorn
SOTERIA_BACKUP_DRIVER: restic
SOTERIA_RESTIC_SECRET_NAME: soteria-restic
SOTERIA_RESTIC_REPOSITORY: s3:https://s3.us-west-004.backblazeb2.com/atlas-soteria/soteria
SOTERIA_RESTIC_BACKUP_ARGS: --compression auto
SOTERIA_S3_ENDPOINT: https://s3.us-west-004.backblazeb2.com
SOTERIA_S3_REGION: us-west-004
SOTERIA_LONGHORN_URL: http://longhorn-backend.longhorn-system.svc:9500
SOTERIA_LONGHORN_BACKUP_MODE: incremental
SOTERIA_AUTH_REQUIRED: "true"
SOTERIA_ALLOWED_GROUPS: admin,maintenance
SOTERIA_BACKUP_MAX_AGE_HOURS: "24"
SOTERIA_METRICS_REFRESH_SECONDS: "300"
SOTERIA_B2_ENABLED: "true"
SOTERIA_B2_SECRET_NAMESPACE: maintenance
SOTERIA_B2_SECRET_NAME: soteria-restic
SOTERIA_B2_ACCESS_KEY_FIELD: AWS_ACCESS_KEY_ID
SOTERIA_B2_SECRET_KEY_FIELD: AWS_SECRET_ACCESS_KEY
SOTERIA_B2_ENDPOINT: https://s3.us-west-004.backblazeb2.com
SOTERIA_B2_REGION: us-west-004
SOTERIA_B2_BUCKETS: atlas-soteria
SOTERIA_B2_ENDPOINT_FIELD: AWS_ENDPOINTS
SOTERIA_B2_SCAN_INTERVAL_SECONDS: "900"
SOTERIA_B2_SCAN_TIMEOUT_SECONDS: "120"

View File

@ -7,7 +7,7 @@ metadata:
labels:
app: soteria
spec:
replicas: 1
replicas: 0
revisionHistoryLimit: 3
selector:
matchLabels:
@ -16,6 +16,8 @@ spec:
metadata:
labels:
app: soteria
annotations:
soteria.bstein.dev/config-revision: "2026-04-13-restic-v1"
spec:
serviceAccountName: soteria
nodeSelector:
@ -23,7 +25,28 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-10"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
@ -46,18 +69,25 @@ spec:
ports:
- name: http
containerPort: 8080
startupProbe:
httpGet:
path: /healthz
port: http
periodSeconds: 5
failureThreshold: 24
timeoutSeconds: 2
livenessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 5
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 2
readinessProbe:
httpGet:
path: /readyz
port: http
initialDelaySeconds: 2
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 2
resources:
@ -73,4 +103,3 @@ spec:
drop: ["ALL"]
runAsNonRoot: true
runAsUser: 65532

View File

@ -7,6 +7,12 @@ rules:
- apiGroups: [""]
resources: ["persistentvolumeclaims", "persistentvolumes"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["pods/log"]
verbs: ["get"]
- apiGroups: [""]
resources: ["secrets"]
verbs: ["get", "list", "create", "update", "delete"]
@ -26,4 +32,3 @@ subjects:
- kind: ServiceAccount
name: soteria
namespace: maintenance

View File

@ -8,7 +8,7 @@ metadata:
app: soteria
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "80"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
@ -18,4 +18,3 @@ spec:
- name: http
port: 80
targetPort: http

View File

@ -13,6 +13,8 @@ spec:
metadata:
labels:
app: maintenance-vault-sync
annotations:
maintenance.bstein.dev/restart-at: "2026-04-13T05:57:00Z"
spec:
nodeSelector:
node-role.kubernetes.io/worker: "true"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -20,39 +20,9 @@
},
"targets": [
{
"expr": "label_replace(label_replace((max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Pyrphoros\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Pyrphoros\"}) / 100) or on() vector(0)), \"ups\", \"Pyrphoros\", \"__name__\", \".*\"), \"metric\", \"Draw\", \"__name__\", \".*\") or label_replace(label_replace((max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)), \"ups\", \"Pyrphoros\", \"__name__\", \".*\"), \"metric\", \"Runtime\", \"__name__\", \".*\") or label_replace(label_replace((max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100) or on() vector(0)), \"ups\", \"Statera\", \"__name__\", \".*\"), \"metric\", \"Draw\", \"__name__\", \".*\") or label_replace(label_replace((max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)), \"ups\", \"Statera\", \"__name__\", \".*\"), \"metric\", \"Runtime\", \"__name__\", \".*\")",
"refId": "A",
"expr": "max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Pyrphoros\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Pyrphoros\"}) / 100) or on() vector(0)",
"legendFormat": "Pyrphoros Draw (W)",
"instant": true
},
{
"refId": "B",
"expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)",
"legendFormat": "Pyrphoros Discharge",
"instant": true
},
{
"refId": "C",
"expr": "max(ananke_ups_on_battery{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)",
"legendFormat": "Pyrphoros Status",
"instant": true
},
{
"refId": "D",
"expr": "max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100) or on() vector(0)",
"legendFormat": "Statera Draw (W)",
"instant": true
},
{
"refId": "E",
"expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)",
"legendFormat": "Statera Discharge",
"instant": true
},
{
"refId": "F",
"expr": "max(ananke_ups_on_battery{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)",
"legendFormat": "Statera Status",
"legendFormat": "{{ups}} {{metric}}",
"instant": true
}
],
@ -84,121 +54,25 @@
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Pyrphoros Draw (W)"
"id": "byRegexp",
"options": ".*Draw$"
},
"properties": [
{
"id": "unit",
"value": "watt"
},
{
"id": "description",
"value": "Attached node: titan-db"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Statera Draw (W)"
},
"properties": [
{
"id": "unit",
"value": "watt"
},
{
"id": "description",
"value": "Attached node: titan-24"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Pyrphoros Discharge"
"id": "byRegexp",
"options": ".*Runtime$"
},
"properties": [
{
"id": "unit",
"value": "s"
},
{
"id": "description",
"value": "Attached node: titan-db"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Statera Discharge"
},
"properties": [
{
"id": "unit",
"value": "s"
},
{
"id": "description",
"value": "Attached node: titan-24"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Pyrphoros Status"
},
"properties": [
{
"id": "mappings",
"value": [
{
"type": "value",
"options": {
"0": {
"text": "\u26a1 Charging"
},
"1": {
"text": "\ud83d\udd0b Discharging"
}
}
}
]
},
{
"id": "description",
"value": "Attached node: titan-db"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Statera Status"
},
"properties": [
{
"id": "mappings",
"value": [
{
"type": "value",
"options": {
"0": {
"text": "\u26a1 Charging"
},
"1": {
"text": "\ud83d\udd0b Discharging"
}
}
}
]
},
{
"id": "description",
"value": "Attached node: titan-24"
}
]
}
@ -215,9 +89,15 @@
"fields": "",
"values": false
},
"textMode": "name_and_value"
"textMode": "name_and_value",
"orientation": "vertical",
"wideLayout": false,
"text": {
"titleSize": 14,
"valueSize": 24
}
},
"description": "Per-UPS live snapshot: current draw in watts, estimated battery runtime if discharge started now, and charging/discharging status."
"description": "Per-UPS live snapshot: draw, discharge runtime, and charging/discharging status."
},
{
"id": 2,
@ -243,11 +123,6 @@
"refId": "B",
"expr": "((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100)",
"legendFormat": "Statera"
},
{
"refId": "C",
"expr": "sum((ananke_ups_load_percent{job=\"ananke-power\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\"}) / 100)",
"legendFormat": "combined"
}
],
"fieldConfig": {
@ -265,7 +140,7 @@
"mode": "multi"
}
},
"description": "Historical UPS power consumption in watts for titan-db, tethys, and combined load."
"description": "Historical UPS power consumption in watts for titan-db and tethys."
},
{
"id": 3,
@ -283,27 +158,9 @@
},
"targets": [
{
"expr": "label_replace((max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) or on() vector(0)), \"metric\", \"Temp \u00b0C\", \"__name__\", \".*\") or label_replace((max((max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) * 9 / 5 + 32) or on() vector(0)), \"metric\", \"Temp \u00b0F\", \"__name__\", \".*\") or label_replace((max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)) or on() vector(0)), \"metric\", \"Humidity\", \"__name__\", \".*\") or label_replace((max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)) or on() vector(0)), \"metric\", \"Pressure\", \"__name__\", \".*\")",
"refId": "A",
"expr": "max(typhon_temperature_celsius) or on() vector(0)",
"legendFormat": "Tent Temp (\u00b0C)",
"instant": true
},
{
"refId": "B",
"expr": "max(typhon_vpd_kpa) or on() vector(0)",
"legendFormat": "Tent VPD (kPa)",
"instant": true
},
{
"refId": "C",
"expr": "max(typhon_relative_humidity_percent) or on() vector(0)",
"legendFormat": "Tent RH (%)",
"instant": true
},
{
"refId": "D",
"expr": "(243.5 * (ln(clamp_min(max(typhon_relative_humidity_percent) / 100, 0.01)) + ((17.67 * max(typhon_temperature_celsius)) / (243.5 + max(typhon_temperature_celsius))))) / (17.67 - ln(clamp_min(max(typhon_relative_humidity_percent) / 100, 0.01)) - ((17.67 * max(typhon_temperature_celsius)) / (243.5 + max(typhon_temperature_celsius)))) or on() vector(0)",
"legendFormat": "Dew Point (\u00b0C)",
"legendFormat": "{{metric}}",
"instant": true
}
],
@ -336,7 +193,7 @@
{
"matcher": {
"id": "byName",
"options": "Tent Temp (\u00b0C)"
"options": "Temp \u00b0C"
},
"properties": [
{
@ -348,19 +205,19 @@
{
"matcher": {
"id": "byName",
"options": "Tent VPD (kPa)"
"options": "Temp \u00b0F"
},
"properties": [
{
"id": "unit",
"value": "suffix:kPa"
"value": "fahrenheit"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Tent RH (%)"
"options": "Humidity"
},
"properties": [
{
@ -372,12 +229,12 @@
{
"matcher": {
"id": "byName",
"options": "Dew Point (\u00b0C)"
"options": "Pressure"
},
"properties": [
{
"id": "unit",
"value": "celsius"
"value": "suffix:kPa"
}
]
}
@ -394,9 +251,15 @@
"fields": "",
"values": false
},
"textMode": "name_and_value"
"textMode": "name_and_value",
"orientation": "vertical",
"wideLayout": false,
"text": {
"titleSize": 16,
"valueSize": 28
}
},
"description": "Current tent temperature, VPD, humidity, and dew point. These render once climate telemetry is online."
"description": "Current tent values: Temp \u00b0C, Temp \u00b0F, Humidity, Pressure."
},
{
"id": 4,
@ -415,30 +278,70 @@
"targets": [
{
"refId": "A",
"expr": "typhon_temperature_celsius",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)",
"legendFormat": "Temperature (\u00b0C)"
},
{
"refId": "B",
"expr": "typhon_relative_humidity_percent",
"legendFormat": "Humidity (%)"
"expr": "(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) * 9 / 5 + 32",
"legendFormat": "Temperature (\u00b0F)"
},
{
"refId": "C",
"expr": "typhon_vpd_kpa",
"legendFormat": "VPD (kPa)"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)",
"legendFormat": "Humidity (%)"
},
{
"refId": "D",
"expr": "(243.5 * (ln(clamp_min(typhon_relative_humidity_percent / 100, 0.01)) + ((17.67 * typhon_temperature_celsius) / (243.5 + typhon_temperature_celsius)))) / (17.67 - ln(clamp_min(typhon_relative_humidity_percent / 100, 0.01)) - ((17.67 * typhon_temperature_celsius) / (243.5 + typhon_temperature_celsius)))",
"legendFormat": "Dew Point (\u00b0C)"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)",
"legendFormat": "Pressure (VPD kPa)"
}
],
"fieldConfig": {
"defaults": {
"unit": "celsius"
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Temperature (\u00b0C)"
},
"properties": [
{
"id": "unit",
"value": "suffix:\u00b0C"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byName",
"options": "Temperature (\u00b0F)"
},
"properties": [
{
"id": "unit",
"value": "suffix:\u00b0F"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byName",
@ -447,31 +350,43 @@
"properties": [
{
"id": "unit",
"value": "percent"
}
]
},
{
"matcher": {
"id": "byName",
"options": "VPD (kPa)"
},
"properties": [
"value": "suffix:%"
},
{
"id": "unit",
"value": "none"
"id": "decimals",
"value": 2
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "custom.axisLabel",
"value": "kPa"
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byName",
"options": "Pressure (VPD kPa)"
},
"properties": [
{
"id": "unit",
"value": "suffix:kPa"
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
}
@ -486,7 +401,7 @@
"mode": "multi"
}
},
"description": "Two-axis chart: temperature/humidity/dew point (left axis) and VPD in kPa (right axis)."
"description": "Historical tent temperature (C/F), humidity, and pressure proxy (VPD kPa)."
},
{
"id": 5,
@ -504,27 +419,9 @@
},
"targets": [
{
"expr": "label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})) or on() vector(0))), \"metric\", \"Outlet\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})) or on() vector(0))), \"metric\", \"Inlet - In\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})) or on() vector(0))), \"metric\", \"Inlet - Out\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})) or on() vector(0))), \"metric\", \"Interior\", \"__name__\", \".*\")",
"refId": "A",
"expr": "round(max(typhon_fan_speed_level{fan_group=\"outlet\"}) or on() vector(0))",
"legendFormat": "Inside Outlet",
"instant": true
},
{
"refId": "B",
"expr": "round(max(typhon_fan_speed_level{fan_group=\"inside_inlet\"}) or on() vector(0))",
"legendFormat": "Inside Inlet",
"instant": true
},
{
"refId": "C",
"expr": "round(max(typhon_fan_speed_level{fan_group=\"outside_inlet\"}) or on() vector(0))",
"legendFormat": "Outside Inlet",
"instant": true
},
{
"refId": "D",
"expr": "round(max(typhon_fan_speed_level{fan_group=~\"interior|unknown\"}) or on() vector(0))",
"legendFormat": "Interior Fans",
"legendFormat": "{{metric}}",
"instant": true
}
],
@ -557,7 +454,56 @@
},
"decimals": 0
},
"overrides": []
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Outlet"
},
"properties": [
{
"id": "decimals",
"value": 0
}
]
},
{
"matcher": {
"id": "byName",
"options": "Inlet - In"
},
"properties": [
{
"id": "decimals",
"value": 0
}
]
},
{
"matcher": {
"id": "byName",
"options": "Inlet - Out"
},
"properties": [
{
"id": "decimals",
"value": 0
}
]
},
{
"matcher": {
"id": "byName",
"options": "Interior"
},
"properties": [
{
"id": "decimals",
"value": 0
}
]
}
]
},
"options": {
"colorMode": "value",
@ -570,9 +516,11 @@
"fields": "",
"values": false
},
"textMode": "name_and_value"
"textMode": "name_and_value",
"orientation": "vertical",
"wideLayout": false
},
"description": "Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans."
"description": "Current fan activity levels: outlet, inlet in, inlet out, interior (0-10)."
},
{
"id": 6,
@ -591,23 +539,23 @@
"targets": [
{
"refId": "A",
"expr": "typhon_fan_speed_level{fan_group=\"outlet\"}",
"legendFormat": "Inside Outlet"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})",
"legendFormat": "Outlet"
},
{
"refId": "B",
"expr": "typhon_fan_speed_level{fan_group=\"inside_inlet\"}",
"legendFormat": "Inside Inlet"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})",
"legendFormat": "Inlet - Inside"
},
{
"refId": "C",
"expr": "typhon_fan_speed_level{fan_group=\"outside_inlet\"}",
"legendFormat": "Outside Inlet"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})",
"legendFormat": "Inlet - Outside"
},
{
"refId": "D",
"expr": "typhon_fan_speed_level{fan_group=~\"interior|unknown\"}",
"legendFormat": "Interior Fans"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})",
"legendFormat": "Interior"
}
],
"fieldConfig": {

File diff suppressed because it is too large Load Diff

View File

@ -506,7 +506,7 @@ data:
to: 0
datasourceUid: atlas-vm
model:
expr: sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)
expr: sum((1 - pvc_backup_health) > bool 0) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: unhealthy-pvcs
@ -543,6 +543,54 @@ data:
summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
labels:
severity: warning
- uid: maint-soteria-b2-scan-unhealthy
title: "Soteria B2 usage scan failing or stale"
condition: C
for: "15m"
data:
- refId: A
relativeTimeRange:
from: 1800
to: 0
datasourceUid: atlas-vm
model:
expr: sum((((soteria_b2_scan_success < bool 1) and (time() - soteria_b2_scan_timestamp_seconds > 600)) or (time() - soteria_b2_scan_timestamp_seconds > 1800))) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: soteria-b2-scan-unhealthy
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [0]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Alerting
annotations:
summary: "Soteria B2 consumption scan is failing or stale for >15m"
labels:
severity: warning
- uid: maint-soteria-authz-denials
title: "Soteria authorization denials elevated"
condition: C
@ -591,6 +639,54 @@ data:
summary: "Soteria saw >10 authorization denials in 15m"
labels:
severity: warning
- uid: maint-soteria-backup-job-storm
title: "Soteria backup job creation spike"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
expr: sum(increase(kube_job_created{namespace="maintenance",job_name=~"soteria-backup-.*"}[10m])) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: soteria-backup-jobs-created-10m
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [8]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Alerting
annotations:
summary: "Soteria created >8 backup jobs in 10m (possible scheduler storm)"
labels:
severity: warning
- orgId: 1
name: ariadne
folder: Alerts

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -29,39 +29,9 @@ data:
},
"targets": [
{
"expr": "label_replace(label_replace((max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Pyrphoros\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Pyrphoros\"}) / 100) or on() vector(0)), \"ups\", \"Pyrphoros\", \"__name__\", \".*\"), \"metric\", \"Draw\", \"__name__\", \".*\") or label_replace(label_replace((max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)), \"ups\", \"Pyrphoros\", \"__name__\", \".*\"), \"metric\", \"Runtime\", \"__name__\", \".*\") or label_replace(label_replace((max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100) or on() vector(0)), \"ups\", \"Statera\", \"__name__\", \".*\"), \"metric\", \"Draw\", \"__name__\", \".*\") or label_replace(label_replace((max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)), \"ups\", \"Statera\", \"__name__\", \".*\"), \"metric\", \"Runtime\", \"__name__\", \".*\")",
"refId": "A",
"expr": "max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Pyrphoros\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Pyrphoros\"}) / 100) or on() vector(0)",
"legendFormat": "Pyrphoros Draw (W)",
"instant": true
},
{
"refId": "B",
"expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)",
"legendFormat": "Pyrphoros Discharge",
"instant": true
},
{
"refId": "C",
"expr": "max(ananke_ups_on_battery{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0)",
"legendFormat": "Pyrphoros Status",
"instant": true
},
{
"refId": "D",
"expr": "max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100) or on() vector(0)",
"legendFormat": "Statera Draw (W)",
"instant": true
},
{
"refId": "E",
"expr": "max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)",
"legendFormat": "Statera Discharge",
"instant": true
},
{
"refId": "F",
"expr": "max(ananke_ups_on_battery{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0)",
"legendFormat": "Statera Status",
"legendFormat": "{{ups}} {{metric}}",
"instant": true
}
],
@ -93,121 +63,25 @@ data:
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Pyrphoros Draw (W)"
"id": "byRegexp",
"options": ".*Draw$"
},
"properties": [
{
"id": "unit",
"value": "watt"
},
{
"id": "description",
"value": "Attached node: titan-db"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Statera Draw (W)"
},
"properties": [
{
"id": "unit",
"value": "watt"
},
{
"id": "description",
"value": "Attached node: titan-24"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Pyrphoros Discharge"
"id": "byRegexp",
"options": ".*Runtime$"
},
"properties": [
{
"id": "unit",
"value": "s"
},
{
"id": "description",
"value": "Attached node: titan-db"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Statera Discharge"
},
"properties": [
{
"id": "unit",
"value": "s"
},
{
"id": "description",
"value": "Attached node: titan-24"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Pyrphoros Status"
},
"properties": [
{
"id": "mappings",
"value": [
{
"type": "value",
"options": {
"0": {
"text": "\u26a1 Charging"
},
"1": {
"text": "\ud83d\udd0b Discharging"
}
}
}
]
},
{
"id": "description",
"value": "Attached node: titan-db"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Statera Status"
},
"properties": [
{
"id": "mappings",
"value": [
{
"type": "value",
"options": {
"0": {
"text": "\u26a1 Charging"
},
"1": {
"text": "\ud83d\udd0b Discharging"
}
}
}
]
},
{
"id": "description",
"value": "Attached node: titan-24"
}
]
}
@ -224,9 +98,15 @@ data:
"fields": "",
"values": false
},
"textMode": "name_and_value"
"textMode": "name_and_value",
"orientation": "vertical",
"wideLayout": false,
"text": {
"titleSize": 14,
"valueSize": 24
}
},
"description": "Per-UPS live snapshot: current draw in watts, estimated battery runtime if discharge started now, and charging/discharging status."
"description": "Per-UPS live snapshot: draw, discharge runtime, and charging/discharging status."
},
{
"id": 2,
@ -252,11 +132,6 @@ data:
"refId": "B",
"expr": "((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100)",
"legendFormat": "Statera"
},
{
"refId": "C",
"expr": "sum((ananke_ups_load_percent{job=\"ananke-power\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\"}) / 100)",
"legendFormat": "combined"
}
],
"fieldConfig": {
@ -274,7 +149,7 @@ data:
"mode": "multi"
}
},
"description": "Historical UPS power consumption in watts for titan-db, tethys, and combined load."
"description": "Historical UPS power consumption in watts for titan-db and tethys."
},
{
"id": 3,
@ -292,27 +167,9 @@ data:
},
"targets": [
{
"expr": "label_replace((max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) or on() vector(0)), \"metric\", \"Temp \u00b0C\", \"__name__\", \".*\") or label_replace((max((max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) * 9 / 5 + 32) or on() vector(0)), \"metric\", \"Temp \u00b0F\", \"__name__\", \".*\") or label_replace((max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)) or on() vector(0)), \"metric\", \"Humidity\", \"__name__\", \".*\") or label_replace((max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)) or on() vector(0)), \"metric\", \"Pressure\", \"__name__\", \".*\")",
"refId": "A",
"expr": "max(typhon_temperature_celsius) or on() vector(0)",
"legendFormat": "Tent Temp (\u00b0C)",
"instant": true
},
{
"refId": "B",
"expr": "max(typhon_vpd_kpa) or on() vector(0)",
"legendFormat": "Tent VPD (kPa)",
"instant": true
},
{
"refId": "C",
"expr": "max(typhon_relative_humidity_percent) or on() vector(0)",
"legendFormat": "Tent RH (%)",
"instant": true
},
{
"refId": "D",
"expr": "(243.5 * (ln(clamp_min(max(typhon_relative_humidity_percent) / 100, 0.01)) + ((17.67 * max(typhon_temperature_celsius)) / (243.5 + max(typhon_temperature_celsius))))) / (17.67 - ln(clamp_min(max(typhon_relative_humidity_percent) / 100, 0.01)) - ((17.67 * max(typhon_temperature_celsius)) / (243.5 + max(typhon_temperature_celsius)))) or on() vector(0)",
"legendFormat": "Dew Point (\u00b0C)",
"legendFormat": "{{metric}}",
"instant": true
}
],
@ -345,7 +202,7 @@ data:
{
"matcher": {
"id": "byName",
"options": "Tent Temp (\u00b0C)"
"options": "Temp \u00b0C"
},
"properties": [
{
@ -357,19 +214,19 @@ data:
{
"matcher": {
"id": "byName",
"options": "Tent VPD (kPa)"
"options": "Temp \u00b0F"
},
"properties": [
{
"id": "unit",
"value": "suffix:kPa"
"value": "fahrenheit"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Tent RH (%)"
"options": "Humidity"
},
"properties": [
{
@ -381,12 +238,12 @@ data:
{
"matcher": {
"id": "byName",
"options": "Dew Point (\u00b0C)"
"options": "Pressure"
},
"properties": [
{
"id": "unit",
"value": "celsius"
"value": "suffix:kPa"
}
]
}
@ -403,9 +260,15 @@ data:
"fields": "",
"values": false
},
"textMode": "name_and_value"
"textMode": "name_and_value",
"orientation": "vertical",
"wideLayout": false,
"text": {
"titleSize": 16,
"valueSize": 28
}
},
"description": "Current tent temperature, VPD, humidity, and dew point. These render once climate telemetry is online."
"description": "Current tent values: Temp \u00b0C, Temp \u00b0F, Humidity, Pressure."
},
{
"id": 4,
@ -424,30 +287,70 @@ data:
"targets": [
{
"refId": "A",
"expr": "typhon_temperature_celsius",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)",
"legendFormat": "Temperature (\u00b0C)"
},
{
"refId": "B",
"expr": "typhon_relative_humidity_percent",
"legendFormat": "Humidity (%)"
"expr": "(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) * 9 / 5 + 32",
"legendFormat": "Temperature (\u00b0F)"
},
{
"refId": "C",
"expr": "typhon_vpd_kpa",
"legendFormat": "VPD (kPa)"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)",
"legendFormat": "Humidity (%)"
},
{
"refId": "D",
"expr": "(243.5 * (ln(clamp_min(typhon_relative_humidity_percent / 100, 0.01)) + ((17.67 * typhon_temperature_celsius) / (243.5 + typhon_temperature_celsius)))) / (17.67 - ln(clamp_min(typhon_relative_humidity_percent / 100, 0.01)) - ((17.67 * typhon_temperature_celsius) / (243.5 + typhon_temperature_celsius)))",
"legendFormat": "Dew Point (\u00b0C)"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)",
"legendFormat": "Pressure (VPD kPa)"
}
],
"fieldConfig": {
"defaults": {
"unit": "celsius"
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Temperature (\u00b0C)"
},
"properties": [
{
"id": "unit",
"value": "suffix:\u00b0C"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byName",
"options": "Temperature (\u00b0F)"
},
"properties": [
{
"id": "unit",
"value": "suffix:\u00b0F"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byName",
@ -456,31 +359,43 @@ data:
"properties": [
{
"id": "unit",
"value": "percent"
}
]
},
{
"matcher": {
"id": "byName",
"options": "VPD (kPa)"
},
"properties": [
"value": "suffix:%"
},
{
"id": "unit",
"value": "none"
"id": "decimals",
"value": 2
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "custom.axisLabel",
"value": "kPa"
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byName",
"options": "Pressure (VPD kPa)"
},
"properties": [
{
"id": "unit",
"value": "suffix:kPa"
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
}
@ -495,7 +410,7 @@ data:
"mode": "multi"
}
},
"description": "Two-axis chart: temperature/humidity/dew point (left axis) and VPD in kPa (right axis)."
"description": "Historical tent temperature (C/F), humidity, and pressure proxy (VPD kPa)."
},
{
"id": 5,
@ -513,27 +428,9 @@ data:
},
"targets": [
{
"expr": "label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})) or on() vector(0))), \"metric\", \"Outlet\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})) or on() vector(0))), \"metric\", \"Inlet - In\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})) or on() vector(0))), \"metric\", \"Inlet - Out\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})) or on() vector(0))), \"metric\", \"Interior\", \"__name__\", \".*\")",
"refId": "A",
"expr": "round(max(typhon_fan_speed_level{fan_group=\"outlet\"}) or on() vector(0))",
"legendFormat": "Inside Outlet",
"instant": true
},
{
"refId": "B",
"expr": "round(max(typhon_fan_speed_level{fan_group=\"inside_inlet\"}) or on() vector(0))",
"legendFormat": "Inside Inlet",
"instant": true
},
{
"refId": "C",
"expr": "round(max(typhon_fan_speed_level{fan_group=\"outside_inlet\"}) or on() vector(0))",
"legendFormat": "Outside Inlet",
"instant": true
},
{
"refId": "D",
"expr": "round(max(typhon_fan_speed_level{fan_group=~\"interior|unknown\"}) or on() vector(0))",
"legendFormat": "Interior Fans",
"legendFormat": "{{metric}}",
"instant": true
}
],
@ -566,7 +463,56 @@ data:
},
"decimals": 0
},
"overrides": []
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Outlet"
},
"properties": [
{
"id": "decimals",
"value": 0
}
]
},
{
"matcher": {
"id": "byName",
"options": "Inlet - In"
},
"properties": [
{
"id": "decimals",
"value": 0
}
]
},
{
"matcher": {
"id": "byName",
"options": "Inlet - Out"
},
"properties": [
{
"id": "decimals",
"value": 0
}
]
},
{
"matcher": {
"id": "byName",
"options": "Interior"
},
"properties": [
{
"id": "decimals",
"value": 0
}
]
}
]
},
"options": {
"colorMode": "value",
@ -579,9 +525,11 @@ data:
"fields": "",
"values": false
},
"textMode": "name_and_value"
"textMode": "name_and_value",
"orientation": "vertical",
"wideLayout": false
},
"description": "Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans."
"description": "Current fan activity levels: outlet, inlet in, inlet out, interior (0-10)."
},
{
"id": 6,
@ -600,23 +548,23 @@ data:
"targets": [
{
"refId": "A",
"expr": "typhon_fan_speed_level{fan_group=\"outlet\"}",
"legendFormat": "Inside Outlet"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})",
"legendFormat": "Outlet"
},
{
"refId": "B",
"expr": "typhon_fan_speed_level{fan_group=\"inside_inlet\"}",
"legendFormat": "Inside Inlet"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})",
"legendFormat": "Inlet - Inside"
},
{
"refId": "C",
"expr": "typhon_fan_speed_level{fan_group=\"outside_inlet\"}",
"legendFormat": "Outside Inlet"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})",
"legendFormat": "Inlet - Outside"
},
{
"refId": "D",
"expr": "typhon_fan_speed_level{fan_group=~\"interior|unknown\"}",
"legendFormat": "Interior Fans"
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})",
"legendFormat": "Interior"
}
],
"fieldConfig": {

File diff suppressed because it is too large Load Diff

View File

@ -79,6 +79,7 @@ spec:
# keep 1 year; supports "d", "y"
extraArgs:
retentionPeriod: "1y" # VM flag -retentionPeriod=1y. :contentReference[oaicite:11]{index=11}
promscrape.configCheckInterval: "30s"
persistentVolume:
enabled: true
@ -91,7 +92,10 @@ spec:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-20
- titan-21
- titan-22
- titan-24
# Enable built-in Kubernetes scraping
scrape:
@ -180,6 +184,9 @@ spec:
- action: keep
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
regex: "true"
- action: keep
source_labels: [__meta_kubernetes_pod_container_port_number]
regex: ".+"
- action: drop
source_labels: [__meta_kubernetes_pod_container_port_name]
regex: ".*health.*"
@ -194,6 +201,7 @@ spec:
# --- kube-state-metrics (via its Service) ---
- job_name: "kube-state-metrics"
max_scrape_size: 67108864
kubernetes_sd_configs: [{ role: endpoints }]
relabel_configs:
- action: keep
@ -257,6 +265,14 @@ spec:
- action: keep
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
regex: flux-system;flux
- action: keep
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
regex: "true"
- action: replace
regex: (.+):(?:\d+);(\d+)
replacement: $1:$2
source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
target_label: __address__
---
@ -296,7 +312,7 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "11"
monitoring.bstein.dev/restart-rev: "12"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
@ -424,6 +440,7 @@ spec:
type: file
disableDeletion: false
editable: false
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/overview
- name: overview-public
@ -432,6 +449,7 @@ spec:
type: file
disableDeletion: false
editable: false
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/overview-public
- name: pods
@ -440,6 +458,7 @@ spec:
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/pods
- name: nodes
@ -448,6 +467,7 @@ spec:
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/nodes
- name: storage
@ -456,6 +476,7 @@ spec:
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/storage
- name: gpu
@ -464,6 +485,7 @@ spec:
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/gpu
- name: network
@ -472,6 +494,7 @@ spec:
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/network
- name: mail
@ -480,6 +503,7 @@ spec:
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/mail
- name: jobs
@ -488,14 +512,25 @@ spec:
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/jobs
- name: testing
orgId: 1
folder: Atlas Internal
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/testing
- name: power
orgId: 1
folder: Atlas Internal
type: file
disableDeletion: false
editable: true
updateIntervalSeconds: 10
options:
path: /var/lib/grafana/dashboards/power
dashboardsConfigMaps:
@ -508,6 +543,7 @@ spec:
network: grafana-dashboard-network
mail: grafana-dashboard-mail
jobs: grafana-dashboard-jobs
testing: grafana-dashboard-testing
power: grafana-dashboard-power
extraConfigmapMounts:
- name: grafana-folders

View File

@ -16,6 +16,7 @@ resources:
- grafana-dashboard-power.yaml
- grafana-dashboard-mail.yaml
- grafana-dashboard-jobs.yaml
- grafana-dashboard-testing.yaml
- dcgm-exporter.yaml
- jetson-tegrastats-exporter.yaml
- postmark-exporter-service.yaml

View File

@ -1,12 +1,12 @@
# services/monitoring/oneoffs/grafana-user-dedupe-job.yaml
# One-off job for monitoring/grafana-user-dedupe-api-v7.
# Purpose: grafana user dedupe api v7 (see container args/env in this file).
# One-off job for monitoring/grafana-user-dedupe-api-v8.
# Purpose: grafana user dedupe api v8 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: grafana-user-dedupe-api-v7
name: grafana-user-dedupe-api-v8
namespace: monitoring
spec:
suspend: true
@ -43,13 +43,13 @@ spec:
values: ["arm64"]
containers:
- name: dedupe
image: python:3.12-slim
image: registry.bstein.dev/bstein/python:3.12-slim
command:
- /bin/sh
- -c
args:
- |
set -euo pipefail
set -eu
for _ in $(seq 1 30); do
if [ -f /vault/secrets/grafana-env.sh ]; then
break

View File

@ -35,7 +35,7 @@ push_suite_counters() {
failed_count=$((failed_count + 1))
fi
cat <<METRICS | curl -fsS --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/platform-quality-suite-probe/suite/${suite}" >/dev/null
cat <<METRICS | curl -fsS -X PUT --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/platform-quality-suite-probe/suite/${suite}" >/dev/null
# TYPE platform_quality_gate_runs_total counter
platform_quality_gate_runs_total{suite="${suite}",status="ok"} ${ok_count}
platform_quality_gate_runs_total{suite="${suite}",status="failed"} ${failed_count}
@ -73,8 +73,8 @@ check_http_suite() {
failures=0
check_http_suite "atlasbot" "http://atlasbot.comms.svc.cluster.local:8090/health" "200" '"status": "ok"' || failures=$((failures + 1))
check_http_suite "pegasus-health" "http://pegasus.jellyfin.svc.cluster.local/healthz" "200" || failures=$((failures + 1))
check_http_suite "bstein-home" "http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local/api/healthz" "200" || failures=$((failures + 1))
check_http_suite "pegasus" "http://pegasus.jellyfin.svc.cluster.local/healthz" "200" || failures=$((failures + 1))
check_http_suite "bstein_home" "http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local/api/healthz" "200" || failures=$((failures + 1))
if [ "${failures}" -gt 0 ]; then
printf '[probe] completed with %s suite failure(s)\n' "${failures}" >&2

View File

@ -29,6 +29,36 @@ spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
serviceAccountName: sso-vault
containers:
- name: openldap

View File

@ -54,6 +54,34 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:

View File

@ -20,6 +20,34 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:

View File

@ -57,6 +57,34 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:

Some files were not shown because too many files have changed in this diff Show More