diff --git a/services/logging/fluent-bit-helmrelease.yaml b/services/logging/fluent-bit-helmrelease.yaml
index a3f1c26..df8051e 100644
--- a/services/logging/fluent-bit-helmrelease.yaml
+++ b/services/logging/fluent-bit-helmrelease.yaml
@@ -33,6 +33,10 @@ spec:
       - name: varlogjournal
         hostPath:
           path: /var/log/journal
+      - name: fluentbit-state
+        hostPath:
+          path: /var/lib/fluent-bit
+          type: DirectoryOrCreate
     extraVolumeMounts:
       - name: runlogjournal
         mountPath: /run/log/journal
@@ -40,6 +44,8 @@ spec:
       - name: varlogjournal
         mountPath: /var/log/journal
         readOnly: true
+      - name: fluentbit-state
+        mountPath: /var/lib/fluent-bit
     config:
       service: |
         [SERVICE]
@@ -51,6 +57,10 @@ spec:
           HTTP_Server  On
           HTTP_Listen  0.0.0.0
           HTTP_Port  2020
+          storage.path  /var/lib/fluent-bit/storage
+          storage.sync  normal
+          storage.checksum  on
+          storage.backlog.mem_limit  50M
       inputs: |
         [INPUT]
           Name  tail
@@ -63,14 +73,17 @@ spec:
           Refresh_Interval  10
           Rotate_Wait  30
           Inotify_Watcher  false
-          storage.type  memory
+          Read_from_Head  On
+          DB  /var/lib/fluent-bit/kube.db
+          storage.type  filesystem
 
         [INPUT]
           Name  systemd
           Tag  journald.*
           Path  /var/log/journal
-          Read_From_Tail  On
-          storage.type  memory
+          Read_From_Tail  Off
+          DB  /var/lib/fluent-bit/systemd.db
+          storage.type  filesystem
       filters: |
         [FILTER]
           Name  kubernetes
diff --git a/services/logging/kustomization.yaml b/services/logging/kustomization.yaml
index 9132b8e..d331308 100644
--- a/services/logging/kustomization.yaml
+++ b/services/logging/kustomization.yaml
@@ -6,7 +6,8 @@ resources:
   - opensearch-helmrelease.yaml
   - opensearch-dashboards-helmrelease.yaml
   - opensearch-ism-job.yaml
+  - opensearch-dashboards-setup-job.yaml
+  - opensearch-prune-cronjob.yaml
   - fluent-bit-helmrelease.yaml
-  - loki-helmrelease.yaml
   - oauth2-proxy.yaml
   - ingress.yaml
diff --git a/services/logging/loki-helmrelease.yaml b/services/logging/loki-helmrelease.yaml
deleted file mode 100644
index f14d80d..0000000
--- a/services/logging/loki-helmrelease.yaml
+++ /dev/null
@@ -1,113 +0,0 @@
-# services/logging/loki-helmrelease.yaml
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: loki
-  namespace: logging
-spec:
-  interval: 15m
-  chart:
-    spec:
-      chart: loki
-      version: "~6.6.0"
-      sourceRef:
-        kind: HelmRepository
-        name: grafana
-        namespace: flux-system
-  values:
-    fullnameOverride: loki
-    deploymentMode: SingleBinary
-    loki:
-      auth_enabled: false
-      commonConfig:
-        replication_factor: 1
-      storage:
-        type: filesystem
-      storageConfig:
-        filesystem:
-          directory: /var/loki/chunks
-        tsdb_shipper:
-          active_index_directory: /var/loki/index
-          cache_location: /var/loki/index_cache
-      schemaConfig:
-        configs:
-          - from: "2024-01-01"
-            store: tsdb
-            object_store: filesystem
-            schema: v13
-            index:
-              prefix: loki_index_
-              period: 24h
-      compactor:
-        working_directory: /var/loki/compactor
-        retention_enabled: true
-        delete_request_store: filesystem
-      limits_config:
-        retention_period: 4320h
-        reject_old_samples: true
-        reject_old_samples_max_age: 168h
-    read:
-      replicas: 0
-    write:
-      replicas: 0
-    backend:
-      replicas: 0
-    singleBinary:
-      replicas: 1
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
-      persistence:
-        enabled: true
-        size: 200Gi
-        storageClass: asteria
-    gateway:
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
-    chunksCache:
-      allocatedMemory: 512
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
-    resultsCache:
-      allocatedMemory: 256
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: hardware
-                    operator: In
-                    values:
-                      - rpi5
-                      - rpi4
-    lokiCanary:
-      nodeSelector:
-        hardware: rpi5
-        node-role.kubernetes.io/worker: "true"
-    service:
-      type: ClusterIP
-    ingress:
-      enabled: false
diff --git a/services/logging/oauth2-proxy.yaml b/services/logging/oauth2-proxy.yaml
index ef3621f..ecebfa7 100644
--- a/services/logging/oauth2-proxy.yaml
+++ b/services/logging/oauth2-proxy.yaml
@@ -55,6 +55,7 @@ spec:
         - --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
         - --scope=openid profile email
         - --email-domain=*
+        - --code-challenge-method=S256
         - --set-xauthrequest=true
         - --pass-access-token=true
        - --set-authorization-header=true
diff --git a/services/logging/opensearch-dashboards-setup-job.yaml b/services/logging/opensearch-dashboards-setup-job.yaml
new file mode 100644
index 0000000..fa308cc
--- /dev/null
+++ b/services/logging/opensearch-dashboards-setup-job.yaml
@@ -0,0 +1,63 @@
+# services/logging/opensearch-dashboards-setup-job.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: opensearch-dashboards-setup-1
+  namespace: logging
+spec:
+  backoffLimit: 3
+  ttlSecondsAfterFinished: 3600
+  template:
+    spec:
+      restartPolicy: OnFailure
+      nodeSelector:
+        node-role.kubernetes.io/worker: "true"
+        hardware: rpi5
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: hardware
+                    operator: In
+                    values:
+                      - rpi5
+      containers:
+        - name: setup
+          image: alpine:3.20
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -euo pipefail
+              apk add --no-cache curl >/dev/null
+
+              OSD_URL="http://opensearch-dashboards.logging.svc.cluster.local:5601"
+              for attempt in $(seq 1 60); do
+                code="$(curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" || true)"
+                if [ "${code}" = "200" ]; then
+                  break
+                fi
+                sleep 5
+              done
+
+              if ! curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" | grep -q "200"; then
+                echo "OpenSearch Dashboards did not become ready in time" >&2
+                exit 1
+              fi
+
+              create_view() {
+                view_id="$1"
+                title="$2"
+                curl -sS -X POST "${OSD_URL}/api/saved_objects/index-pattern/${view_id}?overwrite=true" \
+                  -H 'Content-Type: application/json' \
+                  -H 'osd-xsrf: true' \
+                  -d "{\"attributes\":{\"title\":\"${title}\",\"timeFieldName\":\"@timestamp\"}}" >/dev/null
+              }
+
+              create_view kube-logs "kube-*"
+              create_view journald-logs "journald-*"
+
+              curl -sS -X POST "${OSD_URL}/api/opensearch-dashboards/settings" \
+                -H 'Content-Type: application/json' \
+                -H 'osd-xsrf: true' \
+                -d '{"changes":{"defaultIndex":"kube-logs"}}' >/dev/null
diff --git a/services/logging/opensearch-helmrelease.yaml b/services/logging/opensearch-helmrelease.yaml
index 3d7dd6b..627dee4 100644
--- a/services/logging/opensearch-helmrelease.yaml
+++ b/services/logging/opensearch-helmrelease.yaml
@@ -32,7 +32,7 @@ spec:
     persistence:
       enabled: true
       storageClass: asteria
-      size: 500Gi
+      size: 1024Gi
     config:
       opensearch.yml: |
         cluster.name: opensearch
diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml
new file mode 100644
index 0000000..74e2837
--- /dev/null
+++ b/services/logging/opensearch-prune-cronjob.yaml
@@ -0,0 +1,132 @@
+# services/logging/opensearch-prune-cronjob.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: opensearch-prune-script
+  namespace: logging
+data:
+  prune.py: |
+    import json
+    import os
+    import re
+    import sys
+    import urllib.error
+    import urllib.request
+
+    os_url = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/")
+    limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4)))
+    patterns = [p.strip() for p in os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",") if p.strip()]
+
+    UNITS = {
+        "b": 1,
+        "kb": 1024,
+        "mb": 1024**2,
+        "gb": 1024**3,
+        "tb": 1024**4,
+    }
+
+    def parse_size(value: str) -> int:
+        if not value:
+            return 0
+        text = value.strip().lower()
+        if text in ("-", "0"):
+            return 0
+        match = re.match(r"^([0-9.]+)([a-z]+)$", text)
+        if not match:
+            return 0
+        number = float(match.group(1))
+        unit = match.group(2)
+        if unit not in UNITS:
+            return 0
+        return int(number * UNITS[unit])
+
+    def request_json(path: str):
+        url = f"{os_url}{path}"
+        with urllib.request.urlopen(url, timeout=30) as response:
+            payload = response.read().decode("utf-8")
+        return json.loads(payload)
+
+    def delete_index(index: str) -> None:
+        url = f"{os_url}/{index}"
+        req = urllib.request.Request(url, method="DELETE")
+        with urllib.request.urlopen(req, timeout=30) as response:
+            _ = response.read()
+        print(f"deleted {index}")
+
+    indices = []
+    for pattern in patterns:
+        try:
+            data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date")
+        except urllib.error.HTTPError as exc:
+            if exc.code == 404:
+                continue
+            raise
+        for item in data:
+            index = item.get("index")
+            if not index or index.startswith("."):
+                continue
+            size = parse_size(item.get("store.size", ""))
+            created = int(item.get("creation.date", "0") or 0)
+            indices.append({"index": index, "size": size, "created": created})
+
+    total = sum(item["size"] for item in indices)
+    print(f"total_log_bytes={total}")
+    if total <= limit_bytes:
+        print("within limit")
+        sys.exit(0)
+
+    indices.sort(key=lambda item: item["created"])
+    for item in indices:
+        if total <= limit_bytes:
+            break
+        delete_index(item["index"])
+        total -= item["size"]
+
+    print(f"remaining_log_bytes={total}")
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: opensearch-prune
+  namespace: logging
+spec:
+  schedule: "23 3 * * *"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      backoffLimit: 2
+      template:
+        spec:
+          restartPolicy: OnFailure
+          nodeSelector:
+            node-role.kubernetes.io/worker: "true"
+            hardware: rpi5
+          affinity:
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:
+                nodeSelectorTerms:
+                  - matchExpressions:
+                      - key: hardware
+                        operator: In
+                        values:
+                          - rpi5
+          containers:
+            - name: prune
+              image: python:3.11-alpine
+              command: ["python", "/scripts/prune.py"]
+              env:
+                - name: OPENSEARCH_URL
+                  value: http://opensearch-master.logging.svc.cluster.local:9200
+                - name: LOG_LIMIT_BYTES
+                  value: "1099511627776"
+                - name: LOG_INDEX_PATTERNS
+                  value: "kube-*,journald-*"
+              volumeMounts:
+                - name: scripts
+                  mountPath: /scripts
+          volumes:
+            - name: scripts
+              configMap:
+                name: opensearch-prune-script
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 873a323..ddd24e5 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -320,13 +320,6 @@ spec:
           timeInterval: "15s"
         uid: atlas-vm
         orgId: 2
-      - name: Loki
-        type: loki
-        access: proxy
-        url: http://loki.logging.svc.cluster.local:3100
-        isDefault: false
-        uid: atlas-loki
-        orgId: 1
     dashboardProviders:
       dashboardproviders.yaml:
         apiVersion: 1