logging: remove loki and backfill to opensearch

This commit is contained in:
Brad Stein 2026-01-09 18:08:39 -03:00
parent 456677cfbb
commit 0b78ec663d
8 changed files with 215 additions and 125 deletions

View File

@ -33,6 +33,10 @@ spec:
- name: varlogjournal
hostPath:
path: /var/log/journal
- name: fluentbit-state
hostPath:
path: /var/lib/fluent-bit
type: DirectoryOrCreate
extraVolumeMounts:
- name: runlogjournal
mountPath: /run/log/journal
@ -40,6 +44,8 @@ spec:
- name: varlogjournal
mountPath: /var/log/journal
readOnly: true
- name: fluentbit-state
mountPath: /var/lib/fluent-bit
config:
service: |
[SERVICE]
@ -51,6 +57,10 @@ spec:
HTTP_Server On
HTTP_Listen 0.0.0.0
HTTP_Port 2020
storage.path /var/lib/fluent-bit/storage
storage.sync normal
storage.checksum on
storage.backlog.mem_limit 50M
inputs: |
[INPUT]
Name tail
@ -63,14 +73,17 @@ spec:
Refresh_Interval 10
Rotate_Wait 30
Inotify_Watcher false
storage.type memory
Read_from_Head On
DB /var/lib/fluent-bit/kube.db
storage.type filesystem
[INPUT]
Name systemd
Tag journald.*
Path /var/log/journal
Read_From_Tail On
storage.type memory
Read_From_Tail Off
DB /var/lib/fluent-bit/systemd.db
storage.type filesystem
filters: |
[FILTER]
Name kubernetes

View File

@ -6,7 +6,8 @@ resources:
- opensearch-helmrelease.yaml
- opensearch-dashboards-helmrelease.yaml
- opensearch-ism-job.yaml
- opensearch-dashboards-setup-job.yaml
- opensearch-prune-cronjob.yaml
- fluent-bit-helmrelease.yaml
- loki-helmrelease.yaml
- oauth2-proxy.yaml
- ingress.yaml

View File

@ -1,113 +0,0 @@
# services/logging/loki-helmrelease.yaml
# NOTE(review): this entire file is deleted by this commit — log storage
# moves from Loki to OpenSearch. Annotations below record what is removed.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: loki
namespace: logging
spec:
interval: 15m
chart:
spec:
chart: loki
version: "~6.6.0"
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system
values:
fullnameOverride: loki
# Single-binary deployment: one process serves all Loki roles.
deploymentMode: SingleBinary
loki:
auth_enabled: false
commonConfig:
replication_factor: 1
storage:
type: filesystem
storageConfig:
filesystem:
directory: /var/loki/chunks
tsdb_shipper:
active_index_directory: /var/loki/index
cache_location: /var/loki/index_cache
schemaConfig:
configs:
- from: "2024-01-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
compactor:
working_directory: /var/loki/compactor
retention_enabled: true
delete_request_store: filesystem
limits_config:
# 4320h = 180 days of retention.
retention_period: 4320h
reject_old_samples: true
reject_old_samples_max_age: 168h
# read/write/backend scaled to zero: SingleBinary mode covers all roles.
read:
replicas: 0
write:
replicas: 0
backend:
replicas: 0
singleBinary:
replicas: 1
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
persistence:
enabled: true
size: 200Gi
storageClass: asteria
gateway:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
chunksCache:
allocatedMemory: 512
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
resultsCache:
allocatedMemory: 256
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
lokiCanary:
nodeSelector:
hardware: rpi5
node-role.kubernetes.io/worker: "true"
service:
type: ClusterIP
ingress:
enabled: false

View File

@ -55,6 +55,7 @@ spec:
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email
- --email-domain=*
- --code-challenge-method=S256
- --set-xauthrequest=true
- --pass-access-token=true
- --set-authorization-header=true

View File

@ -0,0 +1,63 @@
# services/logging/opensearch-dashboards-setup-job.yaml
# One-shot setup Job: waits until OpenSearch Dashboards answers /api/status,
# then creates the kube-* and journald-* index patterns and makes kube-logs
# the default index. The Job garbage-collects itself an hour after finishing.
apiVersion: batch/v1
kind: Job
metadata:
  # Versioned name (-1): bump the suffix to re-run, since Job specs are immutable.
  name: opensearch-dashboards-setup-1
  namespace: logging
spec:
  backoffLimit: 3
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      restartPolicy: OnFailure
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
        hardware: rpi5
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi5
      containers:
        - name: setup
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -euo pipefail
              apk add --no-cache curl >/dev/null
              OSD_URL="http://opensearch-dashboards.logging.svc.cluster.local:5601"
              # Poll readiness for up to 5 minutes (60 attempts x 5s).
              for attempt in $(seq 1 60); do
                code="$(curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" || true)"
                if [ "${code}" = "200" ]; then
                  break
                fi
                sleep 5
              done
              if ! curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" | grep -q "200"; then
                echo "OpenSearch Dashboards did not become ready in time" >&2
                exit 1
              fi
              # Create (or overwrite) a saved index-pattern object.
              create_view() {
                view_id="$1"
                title="$2"
                curl -sS -X POST "${OSD_URL}/api/saved_objects/index-pattern/${view_id}?overwrite=true" \
                  -H 'Content-Type: application/json' \
                  -H 'osd-xsrf: true' \
                  -d "{\"attributes\":{\"title\":\"${title}\",\"timeFieldName\":\"@timestamp\"}}" >/dev/null
              }
              create_view kube-logs "kube-*"
              create_view journald-logs "journald-*"
              curl -sS -X POST "${OSD_URL}/api/opensearch-dashboards/settings" \
                -H 'Content-Type: application/json' \
                -H 'osd-xsrf: true' \
                -d '{"changes":{"defaultIndex":"kube-logs"}}' >/dev/null

View File

@ -32,7 +32,7 @@ spec:
persistence:
enabled: true
storageClass: asteria
size: 500Gi
size: 1024Gi
config:
opensearch.yml: |
cluster.name: opensearch

View File

@ -0,0 +1,132 @@
# services/logging/opensearch-prune-cronjob.yaml
# Size-based retention for OpenSearch log indices: when the combined store
# size of indices matching LOG_INDEX_PATTERNS exceeds LOG_LIMIT_BYTES, the
# oldest indices (by creation date) are deleted until usage is under the cap.
apiVersion: v1
kind: ConfigMap
metadata:
  name: opensearch-prune-script
  namespace: logging
data:
  prune.py: |
    import json
    import os
    import re
    import sys
    import urllib.error
    import urllib.request

    # Cluster endpoint and limits come from the CronJob environment.
    os_url = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/")
    limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4)))
    patterns = [p.strip() for p in os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",") if p.strip()]

    # Multipliers for the human-readable sizes reported by _cat/indices.
    UNITS = {
        "b": 1,
        "kb": 1024,
        "mb": 1024**2,
        "gb": 1024**3,
        "tb": 1024**4,
    }

    def parse_size(value: str) -> int:
        """Convert a _cat size string such as '1.2gb' to bytes; 0 if unparseable."""
        if not value:
            return 0
        text = value.strip().lower()
        if text in ("-", "0"):
            return 0
        match = re.match(r"^([0-9.]+)([a-z]+)$", text)
        if not match:
            return 0
        number = float(match.group(1))
        unit = match.group(2)
        if unit not in UNITS:
            return 0
        return int(number * UNITS[unit])

    def request_json(path: str):
        """GET an OpenSearch API path and decode the JSON response body."""
        url = f"{os_url}{path}"
        with urllib.request.urlopen(url, timeout=30) as response:
            payload = response.read().decode("utf-8")
        return json.loads(payload)

    def delete_index(index: str) -> None:
        """Delete one index. A 404 (index already gone, e.g. removed by ISM
        between listing and deletion) is treated as success rather than
        aborting the whole prune run."""
        url = f"{os_url}/{index}"
        req = urllib.request.Request(url, method="DELETE")
        try:
            with urllib.request.urlopen(req, timeout=30) as response:
                _ = response.read()
        except urllib.error.HTTPError as exc:
            if exc.code == 404:
                print(f"already absent {index}")
                return
            raise
        print(f"deleted {index}")

    indices = []
    for pattern in patterns:
        try:
            data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date")
        except urllib.error.HTTPError as exc:
            # No index matches this pattern yet.
            if exc.code == 404:
                continue
            raise
        for item in data:
            index = item.get("index")
            # Defensively skip system/hidden indices (names starting with '.').
            if not index or index.startswith("."):
                continue
            size = parse_size(item.get("store.size", ""))
            created = int(item.get("creation.date", "0") or 0)
            indices.append({"index": index, "size": size, "created": created})

    total = sum(item["size"] for item in indices)
    print(f"total_log_bytes={total}")
    if total <= limit_bytes:
        print("within limit")
        sys.exit(0)

    # Oldest first, so the newest (actively written) indices survive.
    indices.sort(key=lambda item: item["created"])
    for item in indices:
        if total <= limit_bytes:
            break
        delete_index(item["index"])
        total -= item["size"]
    print(f"remaining_log_bytes={total}")
---
# Nightly CronJob that mounts the prune script from the ConfigMap above and
# runs it against the in-cluster OpenSearch endpoint.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: opensearch-prune
  namespace: logging
spec:
  # 03:23 every day; odd minute avoids top-of-hour load spikes.
  schedule: "23 3 * * *"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        spec:
          restartPolicy: OnFailure
          nodeSelector:
            node-role.kubernetes.io/worker: "true"
            hardware: rpi5
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: hardware
                        operator: In
                        values:
                          - rpi5
          containers:
            - name: prune
              image: python:3.11-alpine
              command: ["python", "/scripts/prune.py"]
              env:
                - name: OPENSEARCH_URL
                  value: http://opensearch-master.logging.svc.cluster.local:9200
                # 1099511627776 bytes = 1 TiB cap for all matched indices.
                - name: LOG_LIMIT_BYTES
                  value: "1099511627776"
                - name: LOG_INDEX_PATTERNS
                  value: "kube-*,journald-*"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
          volumes:
            - name: scripts
              configMap:
                name: opensearch-prune-script

View File

@ -320,13 +320,6 @@ spec:
timeInterval: "15s"
uid: atlas-vm
orgId: 2
- name: Loki
type: loki
access: proxy
url: http://loki.logging.svc.cluster.local:3100
isDefault: false
uid: atlas-loki
orgId: 1
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1