# Source: titan-iac/services/monitoring/grafana-org-bootstrap.yaml
# (file-viewer metadata: 128 lines, 4.9 KiB, YAML)

# services/monitoring/grafana-org-bootstrap.yaml
#
# One-shot Job that bootstraps a Grafana organization over the HTTP API:
#   1. waits for GET /api/health to answer 200,
#   2. finds the org named by OVERVIEW_ORG_NAME, creating it if absent,
#   3. creates the "atlas-vm" datasource in that org if it does not exist,
#   4. asks Grafana to reload its datasource and dashboard provisioning.
# Every step checks for existing state first, so re-running the Job is safe.
apiVersion: batch/v1
kind: Job
metadata:
  # NOTE(review): the numeric suffix is presumably bumped to force a re-run
  # (a Job's pod template cannot be edited in place) - confirm.
  name: grafana-org-bootstrap-3
  namespace: monitoring
spec:
  backoffLimit: 2
  template:
    metadata:
      annotations:
        # Vault Agent Injector: render the Grafana admin credentials into a
        # file that the container sources below as /vault/secrets/grafana-env.
        # With pre-populate-only the agent runs as an init container only, so
        # no sidecar keeps the Job pod alive after the script exits.
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/agent-pre-populate-only: "true"
        vault.hashicorp.com/role: "monitoring"
        vault.hashicorp.com/agent-inject-secret-grafana-env: "kv/data/atlas/monitoring/grafana-admin"
        # NOTE(review): the values are interpolated into double-quoted shell
        # strings and then sourced by /bin/sh, so a credential containing
        # ", $, ` or \ will be altered or abort the script. Confirm the stored
        # secrets avoid those characters, or render a format that Python can
        # parse directly instead of a shell snippet.
        vault.hashicorp.com/agent-inject-template-grafana-env: |
          {{- with secret "kv/data/atlas/monitoring/grafana-admin" -}}
          export GRAFANA_USER="{{ index .Data.data "admin-user" }}"
          export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}"
          {{- end -}}
    spec:
      restartPolicy: OnFailure
      affinity:
        nodeAffinity:
          # Hard requirement: only nodes carrying the worker role label.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
          # Soft preference: arm64 nodes when one is available.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["arm64"]
      serviceAccountName: monitoring-vault-sync
      containers:
        - name: bootstrap
          image: python:3.11-alpine
          env:
            - name: GRAFANA_URL
              value: http://grafana
            - name: OVERVIEW_ORG_NAME
              value: Overview
          command: ["/bin/sh", "-c"]
          # The shell loads the Vault-rendered credentials into the
          # environment, then feeds the Python program to the interpreter on
          # stdin. The heredoc delimiter is quoted ('PY'), so the shell
          # performs no expansion inside the Python source.
          args:
            - |
              set -euo pipefail
              . /vault/secrets/grafana-env
              python - <<'PY'
              """Idempotently bootstrap a Grafana org and its datasource."""
              import base64
              import json
              import os
              import time
              import urllib.error
              import urllib.request

              GRAFANA_URL = os.environ["GRAFANA_URL"].rstrip("/")
              ORG_NAME = os.environ["OVERVIEW_ORG_NAME"]
              USER = os.environ["GRAFANA_USER"]
              PASSWORD = os.environ["GRAFANA_PASSWORD"]

              # Every call authenticates with HTTP basic auth as the admin user.
              BASIC_AUTH = base64.b64encode(f"{USER}:{PASSWORD}".encode()).decode()
              BASE_HEADERS = {
                  "Authorization": f"Basic {BASIC_AUTH}",
                  "Content-Type": "application/json",
              }

              # Datasource created in the target org when it is missing.
              DATASOURCE = {
                  "name": "VictoriaMetrics",
                  "type": "prometheus",
                  "access": "proxy",
                  "url": "http://victoria-metrics-single-server:8428",
                  "isDefault": True,
                  "uid": "atlas-vm",
                  "jsonData": {"timeInterval": "15s"},
              }


              def request(path, method="GET", data=None, org_id=None):
                  """Send one request to Grafana and return the open response.

                  path:   API path appended to GRAFANA_URL.
                  method: HTTP verb.
                  data:   optional object sent as the JSON body.
                  org_id: optional org to act in (X-Grafana-Org-Id header).

                  Raises urllib.error.HTTPError for 4xx/5xx answers and
                  urllib.error.URLError when the server cannot be reached.
                  The caller closes the response (use it in a ``with`` block).
                  """
                  headers = dict(BASE_HEADERS)
                  if org_id is not None:
                      headers["X-Grafana-Org-Id"] = str(org_id)
                  payload = None if data is None else json.dumps(data).encode()
                  req = urllib.request.Request(
                      f"{GRAFANA_URL}{path}",
                      data=payload,
                      headers=headers,
                      method=method,
                  )
                  return urllib.request.urlopen(req, timeout=10)


              def wait_for_grafana(attempts=60, delay=2):
                  """Poll /api/health until it answers 200; exit non-zero otherwise."""
                  last_error = None
                  for _ in range(attempts):
                      try:
                          with request("/api/health") as resp:
                              if resp.status == 200:
                                  return
                              last_error = f"unexpected HTTP status {resp.status}"
                      except Exception as exc:
                          # Best effort: any failure simply means "not ready yet".
                          last_error = exc
                      # Back off after every failed attempt, not only after
                      # exceptions, so an odd status cannot cause a tight loop.
                      time.sleep(delay)
                  raise SystemExit(
                      f"Grafana API did not become ready in time (last error: {last_error})"
                  )


              def resolve_org_id(name):
                  """Return the id of the org called ``name``, creating it if needed."""
                  with request("/api/orgs") as resp:
                      orgs = json.load(resp)
                  org_id = next((org["id"] for org in orgs if org["name"] == name), None)
                  if org_id is None:
                      with request("/api/orgs", method="POST", data={"name": name}) as resp:
                          org_id = json.load(resp).get("orgId")
                  if org_id is None:
                      raise SystemExit(f"Unable to resolve org ID for {name}")
                  return org_id


              def ensure_datasource(org_id):
                  """Create DATASOURCE in ``org_id`` unless its uid already exists."""
                  try:
                      with request(
                          f"/api/datasources/uid/{DATASOURCE['uid']}", org_id=org_id
                      ) as resp:
                          if resp.status != 200:
                              # Anything other than a plain 200 is treated as an error.
                              raise urllib.error.HTTPError(
                                  resp.url, resp.status, resp.reason, resp.headers, None
                              )
                      return
                  except urllib.error.HTTPError as err:
                      # 404 means "not there yet"; any other failure is fatal.
                      if err.code != 404:
                          raise
                  with request(
                      "/api/datasources", method="POST", data=DATASOURCE, org_id=org_id
                  ):
                      pass


              def reload_provisioning():
                  """Trigger Grafana's datasource, then dashboard, provisioning reload."""
                  # Sent without X-Grafana-Org-Id, i.e. in the admin user's current org.
                  for kind in ("datasources", "dashboards"):
                      with request(f"/api/admin/provisioning/{kind}/reload", method="POST"):
                          pass


              wait_for_grafana()
              ensure_datasource(resolve_org_id(ORG_NAME))
              reload_provisioning()
              PY