From 0a8e8e27daf8643cb7b093d100661ffa45ddbf4d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 6 Jan 2026 14:53:19 -0300 Subject: [PATCH] knowledge: add runbooks skeleton --- .gitignore | 1 + knowledge/INDEX.md | 22 +++++++++++++++++ knowledge/runbooks/ci-gitea-jenkins.md | 27 ++++++++++++++++++++ knowledge/runbooks/kb-authoring.md | 34 ++++++++++++++++++++++++++ knowledge/runbooks/observability.md | 26 ++++++++++++++++++++ knowledge/runbooks/template.md | 18 ++++++++++++++ 6 files changed, 128 insertions(+) create mode 100644 knowledge/INDEX.md create mode 100644 knowledge/runbooks/ci-gitea-jenkins.md create mode 100644 knowledge/runbooks/kb-authoring.md create mode 100644 knowledge/runbooks/observability.md create mode 100644 knowledge/runbooks/template.md diff --git a/.gitignore b/.gitignore index 7bf3646..1d2e516 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.md !README.md +!knowledge/**/*.md __pycache__/ *.py[cod] diff --git a/knowledge/INDEX.md b/knowledge/INDEX.md new file mode 100644 index 0000000..fac9153 --- /dev/null +++ b/knowledge/INDEX.md @@ -0,0 +1,22 @@ +Atlas Knowledge Base (KB) + +This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be: +- Accurate (grounded in GitOps + read-only cluster tools) +- Maintainable (small docs + deterministic generators) +- Safe (no secrets; refer to Secret/Vault paths by name only) + +Layout +- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown). +- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON). +- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog. + +Regeneration +- Update manifests/docs, then regenerate generated artifacts: + - `python scripts/knowledge_render_atlas.py --write` + +Authoring rules +- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`. 
+- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths. +- Keep each runbook small; one topic per file; use headings. +- When in doubt, link to the exact file path in this repo that configures the behavior. + diff --git a/knowledge/runbooks/ci-gitea-jenkins.md b/knowledge/runbooks/ci-gitea-jenkins.md new file mode 100644 index 0000000..48dc91f --- /dev/null +++ b/knowledge/runbooks/ci-gitea-jenkins.md @@ -0,0 +1,27 @@ +--- +title: "CI: Gitea → Jenkins pipeline" +tags: ["atlas", "ci", "gitea", "jenkins"] +owners: ["brad"] +entrypoints: ["scm.bstein.dev", "ci.bstein.dev"] +source_paths: ["services/gitea", "services/jenkins", "scripts/jenkins_cred_sync.sh", "scripts/gitea_cred_sync.sh"] +--- + +# CI: Gitea → Jenkins pipeline + +## What this is +Atlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO). + +## Where it is configured +- Gitea manifests: `services/gitea/` +- Jenkins manifests: `services/jenkins/` +- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh` + +## What users do (typical flow) +- Create a repo in Gitea. +- Create/update a Jenkins job/pipeline that can fetch the repo. +- Configure a webhook (or SCM polling) so pushes trigger builds. + +## Troubleshooting (common) +- “Webhook not firing”: confirm the ingress host and webhook URL are correct and that the Jenkins job is reachable. +- “Auth denied cloning”: confirm Keycloak group membership and that Jenkins has a valid token/credential configured. 
+ diff --git a/knowledge/runbooks/kb-authoring.md b/knowledge/runbooks/kb-authoring.md new file mode 100644 index 0000000..9378d1d --- /dev/null +++ b/knowledge/runbooks/kb-authoring.md @@ -0,0 +1,34 @@ +--- +title: "KB authoring: what to write (and what not to)" +tags: ["atlas", "kb", "runbooks"] +owners: ["brad"] +entrypoints: [] +source_paths: ["knowledge/runbooks", "scripts/knowledge_render_atlas.py"] +--- + +# KB authoring: what to write (and what not to) + +## The goal +Give Atlas assistants enough grounded, Atlas-specific context to answer “how do I…?” questions without guessing. + +## What to capture (high value) +- User workflows: “click here, set X, expected result” +- Operator workflows: “edit these files, reconcile this kustomization, verify with these commands” +- Wiring: “this host routes to this service; this service depends on Postgres/Vault/etc” +- Failure modes: exact error messages + the 2–5 checks that usually resolve them +- Permissions: Keycloak groups/roles and what they unlock + +## What to avoid (low value / fluff) +- Generic Kubernetes explanations (link to upstream docs instead) +- Copy-pasting large manifests (prefer file paths + small snippets) +- Anything that will drift quickly (render it from GitOps instead) +- Any secret values (reference Secret/Vault locations by name only) + +## Document pattern (recommended) +Each runbook should answer: +- “What is this?” +- “What do users do?” +- “What do operators change (where in Git)?” +- “How do we verify it works?” +- “What breaks and how to debug it?” + diff --git a/knowledge/runbooks/observability.md b/knowledge/runbooks/observability.md new file mode 100644 index 0000000..4c5be6e --- /dev/null +++ b/knowledge/runbooks/observability.md @@ -0,0 +1,26 @@ +--- +title: "Observability: Grafana + VictoriaMetrics (how to query safely)" +tags: ["atlas", "monitoring", "grafana", "victoriametrics"] +owners: ["brad"] +entrypoints: ["metrics.bstein.dev", "alerts.bstein.dev"] +source_paths: 
["services/monitoring"] +--- + +# Observability: Grafana + VictoriaMetrics (how to query safely) + +## Where it is configured +- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values) +- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL) + +## Using metrics as a “tool” for Atlas assistants +The safest pattern is: map a small set of intents → fixed PromQL queries, then summarize results. + +Examples (intents) +- “Is the cluster healthy?” → node readiness + pod restart rate +- “Why is Element Call failing?” → LiveKit/coturn pod restarts + synapse errors + ingress 5xx +- “Is Jenkins slow?” → pod CPU/memory + HTTP latency metrics (if exported) + +## Why dashboards are not the KB +Dashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the +KB focused on wiring, runbooks, and stable conventions. + diff --git a/knowledge/runbooks/template.md b/knowledge/runbooks/template.md new file mode 100644 index 0000000..086c65f --- /dev/null +++ b/knowledge/runbooks/template.md @@ -0,0 +1,18 @@ +--- +title: "" +tags: ["atlas", "", ""] +owners: ["brad"] +entrypoints: [""] +source_paths: ["services/", "clusters/atlas/<...>"] +--- + +# + +## What this is + +## For users (how to) + +## For operators (where configured) + +## Troubleshooting (symptoms → checks) +