logging: remove loki and backfill to opensearch

This commit is contained in:
Brad Stein 2026-01-09 18:08:39 -03:00
parent 456677cfbb
commit 0b78ec663d
8 changed files with 215 additions and 125 deletions

View File

@ -33,6 +33,10 @@ spec:
- name: varlogjournal
hostPath:
path: /var/log/journal
- name: fluentbit-state
hostPath:
path: /var/lib/fluent-bit
type: DirectoryOrCreate
extraVolumeMounts:
- name: runlogjournal
mountPath: /run/log/journal
@ -40,6 +44,8 @@ spec:
- name: varlogjournal
mountPath: /var/log/journal
readOnly: true
- name: fluentbit-state
mountPath: /var/lib/fluent-bit
config:
service: |
[SERVICE]
@ -51,6 +57,10 @@ spec:
HTTP_Server On
HTTP_Listen 0.0.0.0
HTTP_Port 2020
storage.path /var/lib/fluent-bit/storage
storage.sync normal
storage.checksum on
storage.backlog.mem_limit 50M
inputs: |
[INPUT]
Name tail
@ -63,14 +73,17 @@ spec:
Refresh_Interval 10
Rotate_Wait 30
Inotify_Watcher false
storage.type memory
Read_from_Head On
DB /var/lib/fluent-bit/kube.db
storage.type filesystem
[INPUT]
Name systemd
Tag journald.*
Path /var/log/journal
Read_From_Tail On
storage.type memory
Read_From_Tail Off
DB /var/lib/fluent-bit/systemd.db
storage.type filesystem
filters: |
[FILTER]
Name kubernetes

View File

@ -6,7 +6,8 @@ resources:
- opensearch-helmrelease.yaml
- opensearch-dashboards-helmrelease.yaml
- opensearch-ism-job.yaml
- opensearch-dashboards-setup-job.yaml
- opensearch-prune-cronjob.yaml
- fluent-bit-helmrelease.yaml
- loki-helmrelease.yaml
- oauth2-proxy.yaml
- ingress.yaml

View File

@ -1,113 +0,0 @@
# services/logging/loki-helmrelease.yaml
# NOTE(review): this entire file is deleted by this commit — log storage
# moves from Loki to OpenSearch. Annotations below record what is removed.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: loki
namespace: logging
spec:
interval: 15m
chart:
spec:
chart: loki
version: "~6.6.0"
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system
values:
fullnameOverride: loki
# Single-binary deployment: one process serves all Loki roles.
deploymentMode: SingleBinary
loki:
auth_enabled: false
commonConfig:
replication_factor: 1
storage:
type: filesystem
storageConfig:
filesystem:
directory: /var/loki/chunks
tsdb_shipper:
active_index_directory: /var/loki/index
cache_location: /var/loki/index_cache
schemaConfig:
configs:
- from: "2024-01-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
compactor:
working_directory: /var/loki/compactor
retention_enabled: true
delete_request_store: filesystem
limits_config:
# 4320h = 180 days of retention.
retention_period: 4320h
reject_old_samples: true
reject_old_samples_max_age: 168h
# read/write/backend scaled to zero: SingleBinary mode covers all roles.
read:
replicas: 0
write:
replicas: 0
backend:
replicas: 0
singleBinary:
replicas: 1
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
persistence:
enabled: true
size: 200Gi
storageClass: asteria
gateway:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
chunksCache:
allocatedMemory: 512
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
resultsCache:
allocatedMemory: 256
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
lokiCanary:
nodeSelector:
hardware: rpi5
node-role.kubernetes.io/worker: "true"
service:
type: ClusterIP
ingress:
enabled: false

View File

@ -55,6 +55,7 @@ spec:
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email
- --email-domain=*
- --code-challenge-method=S256
- --set-xauthrequest=true
- --pass-access-token=true
- --set-authorization-header=true

View File

@ -0,0 +1,63 @@
# services/logging/opensearch-dashboards-setup-job.yaml
# One-shot setup Job: waits until OpenSearch Dashboards answers /api/status,
# then creates the kube-* and journald-* index patterns and makes kube-logs
# the default index. The Job garbage-collects itself an hour after finishing.
apiVersion: batch/v1
kind: Job
metadata:
  # Versioned name (-1): bump the suffix to re-run, since Job specs are immutable.
  name: opensearch-dashboards-setup-1
  namespace: logging
spec:
  backoffLimit: 3
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      restartPolicy: OnFailure
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
        hardware: rpi5
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi5
      containers:
        - name: setup
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -euo pipefail
              apk add --no-cache curl >/dev/null
              OSD_URL="http://opensearch-dashboards.logging.svc.cluster.local:5601"
              # Poll readiness for up to 5 minutes (60 attempts x 5s).
              for attempt in $(seq 1 60); do
                code="$(curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" || true)"
                if [ "${code}" = "200" ]; then
                  break
                fi
                sleep 5
              done
              if ! curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" | grep -q "200"; then
                echo "OpenSearch Dashboards did not become ready in time" >&2
                exit 1
              fi
              # Create (or overwrite) a saved index-pattern object.
              create_view() {
                view_id="$1"
                title="$2"
                curl -sS -X POST "${OSD_URL}/api/saved_objects/index-pattern/${view_id}?overwrite=true" \
                  -H 'Content-Type: application/json' \
                  -H 'osd-xsrf: true' \
                  -d "{\"attributes\":{\"title\":\"${title}\",\"timeFieldName\":\"@timestamp\"}}" >/dev/null
              }
              create_view kube-logs "kube-*"
              create_view journald-logs "journald-*"
              curl -sS -X POST "${OSD_URL}/api/opensearch-dashboards/settings" \
                -H 'Content-Type: application/json' \
                -H 'osd-xsrf: true' \
                -d '{"changes":{"defaultIndex":"kube-logs"}}' >/dev/null

View File

@ -32,7 +32,7 @@ spec:
persistence:
enabled: true
storageClass: asteria
size: 500Gi
size: 1024Gi
config:
opensearch.yml: |
cluster.name: opensearch

View File

@ -0,0 +1,132 @@
# services/logging/opensearch-prune-cronjob.yaml
# Size-based retention for OpenSearch log indices: when the combined store
# size of indices matching LOG_INDEX_PATTERNS exceeds LOG_LIMIT_BYTES, the
# oldest indices (by creation date) are deleted until usage is under the cap.
apiVersion: v1
kind: ConfigMap
metadata:
  name: opensearch-prune-script
  namespace: logging
data:
  prune.py: |
    import json
    import os
    import re
    import sys
    import urllib.error
    import urllib.request

    # Cluster endpoint and limits come from the CronJob environment.
    os_url = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/")
    limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4)))
    patterns = [p.strip() for p in os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",") if p.strip()]

    # Multipliers for the human-readable sizes reported by _cat/indices.
    UNITS = {
        "b": 1,
        "kb": 1024,
        "mb": 1024**2,
        "gb": 1024**3,
        "tb": 1024**4,
    }

    def parse_size(value: str) -> int:
        """Convert a _cat size string such as '1.2gb' to bytes; 0 if unparseable."""
        if not value:
            return 0
        text = value.strip().lower()
        if text in ("-", "0"):
            return 0
        match = re.match(r"^([0-9.]+)([a-z]+)$", text)
        if not match:
            return 0
        number = float(match.group(1))
        unit = match.group(2)
        if unit not in UNITS:
            return 0
        return int(number * UNITS[unit])

    def request_json(path: str):
        """GET an OpenSearch API path and decode the JSON response body."""
        url = f"{os_url}{path}"
        with urllib.request.urlopen(url, timeout=30) as response:
            payload = response.read().decode("utf-8")
        return json.loads(payload)

    def delete_index(index: str) -> None:
        """Delete one index. A 404 (index already gone, e.g. removed by ISM
        between listing and deletion) is treated as success rather than
        aborting the whole prune run."""
        url = f"{os_url}/{index}"
        req = urllib.request.Request(url, method="DELETE")
        try:
            with urllib.request.urlopen(req, timeout=30) as response:
                _ = response.read()
        except urllib.error.HTTPError as exc:
            if exc.code == 404:
                print(f"already absent {index}")
                return
            raise
        print(f"deleted {index}")

    indices = []
    for pattern in patterns:
        try:
            data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date")
        except urllib.error.HTTPError as exc:
            # No index matches this pattern yet.
            if exc.code == 404:
                continue
            raise
        for item in data:
            index = item.get("index")
            # Defensively skip system/hidden indices (names starting with '.').
            if not index or index.startswith("."):
                continue
            size = parse_size(item.get("store.size", ""))
            created = int(item.get("creation.date", "0") or 0)
            indices.append({"index": index, "size": size, "created": created})

    total = sum(item["size"] for item in indices)
    print(f"total_log_bytes={total}")
    if total <= limit_bytes:
        print("within limit")
        sys.exit(0)

    # Oldest first, so the newest (actively written) indices survive.
    indices.sort(key=lambda item: item["created"])
    for item in indices:
        if total <= limit_bytes:
            break
        delete_index(item["index"])
        total -= item["size"]
    print(f"remaining_log_bytes={total}")
---
# Nightly CronJob that mounts the prune script from the ConfigMap above and
# runs it against the in-cluster OpenSearch endpoint.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: opensearch-prune
  namespace: logging
spec:
  # 03:23 every day; odd minute avoids top-of-hour load spikes.
  schedule: "23 3 * * *"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        spec:
          restartPolicy: OnFailure
          nodeSelector:
            node-role.kubernetes.io/worker: "true"
            hardware: rpi5
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: hardware
                        operator: In
                        values:
                          - rpi5
          containers:
            - name: prune
              image: python:3.11-alpine
              command: ["python", "/scripts/prune.py"]
              env:
                - name: OPENSEARCH_URL
                  value: http://opensearch-master.logging.svc.cluster.local:9200
                # 1099511627776 bytes = 1 TiB cap for all matched indices.
                - name: LOG_LIMIT_BYTES
                  value: "1099511627776"
                - name: LOG_INDEX_PATTERNS
                  value: "kube-*,journald-*"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
          volumes:
            - name: scripts
              configMap:
                name: opensearch-prune-script

View File

@ -320,13 +320,6 @@ spec:
timeInterval: "15s"
uid: atlas-vm
orgId: 2
- name: Loki
type: loki
access: proxy
url: http://loki.logging.svc.cluster.local:3100
isDefault: false
uid: atlas-loki
orgId: 1
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1