logging: remove loki and backfill to opensearch
This commit is contained in:
parent
456677cfbb
commit
0b78ec663d
@ -33,6 +33,10 @@ spec:
|
||||
- name: varlogjournal
|
||||
hostPath:
|
||||
path: /var/log/journal
|
||||
- name: fluentbit-state
|
||||
hostPath:
|
||||
path: /var/lib/fluent-bit
|
||||
type: DirectoryOrCreate
|
||||
extraVolumeMounts:
|
||||
- name: runlogjournal
|
||||
mountPath: /run/log/journal
|
||||
@ -40,6 +44,8 @@ spec:
|
||||
- name: varlogjournal
|
||||
mountPath: /var/log/journal
|
||||
readOnly: true
|
||||
- name: fluentbit-state
|
||||
mountPath: /var/lib/fluent-bit
|
||||
config:
|
||||
service: |
|
||||
[SERVICE]
|
||||
@ -51,6 +57,10 @@ spec:
|
||||
HTTP_Server On
|
||||
HTTP_Listen 0.0.0.0
|
||||
HTTP_Port 2020
|
||||
storage.path /var/lib/fluent-bit/storage
|
||||
storage.sync normal
|
||||
storage.checksum on
|
||||
storage.backlog.mem_limit 50M
|
||||
inputs: |
|
||||
[INPUT]
|
||||
Name tail
|
||||
@ -63,14 +73,17 @@ spec:
|
||||
Refresh_Interval 10
|
||||
Rotate_Wait 30
|
||||
Inotify_Watcher false
|
||||
storage.type memory
|
||||
Read_from_Head On
|
||||
DB /var/lib/fluent-bit/kube.db
|
||||
storage.type filesystem
|
||||
|
||||
[INPUT]
|
||||
Name systemd
|
||||
Tag journald.*
|
||||
Path /var/log/journal
|
||||
Read_From_Tail On
|
||||
storage.type memory
|
||||
Read_From_Tail Off
|
||||
DB /var/lib/fluent-bit/systemd.db
|
||||
storage.type filesystem
|
||||
filters: |
|
||||
[FILTER]
|
||||
Name kubernetes
|
||||
|
||||
@ -6,7 +6,8 @@ resources:
|
||||
- opensearch-helmrelease.yaml
|
||||
- opensearch-dashboards-helmrelease.yaml
|
||||
- opensearch-ism-job.yaml
|
||||
- opensearch-dashboards-setup-job.yaml
|
||||
- opensearch-prune-cronjob.yaml
|
||||
- fluent-bit-helmrelease.yaml
|
||||
- loki-helmrelease.yaml
|
||||
- oauth2-proxy.yaml
|
||||
- ingress.yaml
|
||||
|
||||
@ -1,113 +0,0 @@
|
||||
# services/logging/loki-helmrelease.yaml
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: loki
|
||||
namespace: logging
|
||||
spec:
|
||||
interval: 15m
|
||||
chart:
|
||||
spec:
|
||||
chart: loki
|
||||
version: "~6.6.0"
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: grafana
|
||||
namespace: flux-system
|
||||
values:
|
||||
fullnameOverride: loki
|
||||
deploymentMode: SingleBinary
|
||||
loki:
|
||||
auth_enabled: false
|
||||
commonConfig:
|
||||
replication_factor: 1
|
||||
storage:
|
||||
type: filesystem
|
||||
storageConfig:
|
||||
filesystem:
|
||||
directory: /var/loki/chunks
|
||||
tsdb_shipper:
|
||||
active_index_directory: /var/loki/index
|
||||
cache_location: /var/loki/index_cache
|
||||
schemaConfig:
|
||||
configs:
|
||||
- from: "2024-01-01"
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: loki_index_
|
||||
period: 24h
|
||||
compactor:
|
||||
working_directory: /var/loki/compactor
|
||||
retention_enabled: true
|
||||
delete_request_store: filesystem
|
||||
limits_config:
|
||||
retention_period: 4320h
|
||||
reject_old_samples: true
|
||||
reject_old_samples_max_age: 168h
|
||||
read:
|
||||
replicas: 0
|
||||
write:
|
||||
replicas: 0
|
||||
backend:
|
||||
replicas: 0
|
||||
singleBinary:
|
||||
replicas: 1
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values:
|
||||
- rpi5
|
||||
- rpi4
|
||||
persistence:
|
||||
enabled: true
|
||||
size: 200Gi
|
||||
storageClass: asteria
|
||||
gateway:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values:
|
||||
- rpi5
|
||||
- rpi4
|
||||
chunksCache:
|
||||
allocatedMemory: 512
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values:
|
||||
- rpi5
|
||||
- rpi4
|
||||
resultsCache:
|
||||
allocatedMemory: 256
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values:
|
||||
- rpi5
|
||||
- rpi4
|
||||
lokiCanary:
|
||||
nodeSelector:
|
||||
hardware: rpi5
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
service:
|
||||
type: ClusterIP
|
||||
ingress:
|
||||
enabled: false
|
||||
@ -55,6 +55,7 @@ spec:
|
||||
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
|
||||
- --scope=openid profile email
|
||||
- --email-domain=*
|
||||
- --code-challenge-method=S256
|
||||
- --set-xauthrequest=true
|
||||
- --pass-access-token=true
|
||||
- --set-authorization-header=true
|
||||
|
||||
63
services/logging/opensearch-dashboards-setup-job.yaml
Normal file
63
services/logging/opensearch-dashboards-setup-job.yaml
Normal file
@ -0,0 +1,63 @@
|
||||
# services/logging/opensearch-dashboards-setup-job.yaml
# One-shot Job that registers the log index patterns in OpenSearch Dashboards.
apiVersion: batch/v1
kind: Job
metadata:
  name: opensearch-dashboards-setup-1
  namespace: logging
spec:
  backoffLimit: 3
  # Finished Jobs are garbage-collected after one hour.
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      restartPolicy: OnFailure
      # Pin the pod to rpi5 worker nodes (selector and required affinity
      # both constrain on the same "hardware" label).
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
        hardware: rpi5
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi5
      containers:
        - name: setup
          image: alpine:3.20
          # The script below is passed as the single argument to "sh -c".
          command: ["/bin/sh", "-c"]
          args:
            - |
|
||||
# Wait for OpenSearch Dashboards to answer, then register the kube-* and
# journald-* index patterns and make kube-logs the default index.
set -euo pipefail
apk add --no-cache curl >/dev/null

OSD_URL="http://opensearch-dashboards.logging.svc.cluster.local:5601"

# Poll /api/status up to 60 times, 5 seconds apart, and remember whether a
# 200 was ever seen so no second probe is needed after the loop.
ready=0
for attempt in $(seq 1 60); do
  # "|| true" keeps "set -e" from aborting the loop while the service is
  # still unreachable and curl exits non-zero.
  code="$(curl -s -o /dev/null -w "%{http_code}" "${OSD_URL}/api/status" || true)"
  if [ "${code}" = "200" ]; then
    ready=1
    break
  fi
  sleep 5
done

if [ "${ready}" -ne 1 ]; then
  echo "OpenSearch Dashboards did not become ready in time" >&2
  exit 1
fi

# create_view VIEW_ID TITLE
# Creates (or overwrites) the index-pattern saved object VIEW_ID whose title
# is the index glob TITLE, using @timestamp as the time field.
# --fail makes curl exit non-zero on an HTTP error status, so "set -e"
# fails the Job instead of reporting success without creating anything.
create_view() {
  view_id="$1"
  title="$2"
  curl -fsS -X POST "${OSD_URL}/api/saved_objects/index-pattern/${view_id}?overwrite=true" \
    -H 'Content-Type: application/json' \
    -H 'osd-xsrf: true' \
    -d "{\"attributes\":{\"title\":\"${title}\",\"timeFieldName\":\"@timestamp\"}}" >/dev/null
}

create_view kube-logs "kube-*"
create_view journald-logs "journald-*"

# Make kube-logs the default index pattern; HTTP errors fail the Job here too.
curl -fsS -X POST "${OSD_URL}/api/opensearch-dashboards/settings" \
  -H 'Content-Type: application/json' \
  -H 'osd-xsrf: true' \
  -d '{"changes":{"defaultIndex":"kube-logs"}}' >/dev/null
|
||||
@ -32,7 +32,7 @@ spec:
|
||||
persistence:
|
||||
enabled: true
|
||||
storageClass: asteria
|
||||
size: 500Gi
|
||||
size: 1024Gi
|
||||
config:
|
||||
opensearch.yml: |
|
||||
cluster.name: opensearch
|
||||
|
||||
132
services/logging/opensearch-prune-cronjob.yaml
Normal file
132
services/logging/opensearch-prune-cronjob.yaml
Normal file
@ -0,0 +1,132 @@
|
||||
# services/logging/opensearch-prune-cronjob.yaml
# ConfigMap carrying the Python script that the opensearch-prune CronJob
# mounts at /scripts and runs.
apiVersion: v1
kind: ConfigMap
metadata:
  name: opensearch-prune-script
  namespace: logging
data:
  # Lists the indices matching the configured patterns and deletes the
  # oldest ones until their combined size is within the byte limit.
  prune.py: |
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
# Base URL of the OpenSearch HTTP API. Trailing slashes are removed so that
# request paths beginning with "/" can be appended directly.
os_url = os.environ.get(
    "OPENSEARCH_URL",
    "http://opensearch-master.logging.svc.cluster.local:9200",
).rstrip("/")

# Combined size budget, in bytes, for all matching indices (default 1024**4).
limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4)))

# Comma-separated index patterns; surrounding whitespace is trimmed and
# empty entries are dropped.
patterns = list(
    filter(
        None,
        map(str.strip, os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",")),
    )
)
|
||||
|
||||
# Binary multipliers for the size suffixes accepted by parse_size.
UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024**2,
    "gb": 1024**3,
    "tb": 1024**4,
    "pb": 1024**5,
}

# One decimal number (at most a single ".") followed by an optional suffix.
_SIZE_RE = re.compile(r"^([0-9]+(?:\.[0-9]*)?|\.[0-9]+)\s*([a-z]*)$")


def parse_size(value: str) -> int:
    """Convert a size string such as ``"1.5gb"`` or ``"2048"`` to bytes.

    Args:
        value: Size text, case-insensitive. A number with no suffix is
            taken as a byte count. ``None``, ``""`` and ``"-"`` mean
            "no size".

    Returns:
        The size in bytes, truncated to an integer. Returns 0 for missing
        values, unknown suffixes, and text that is not a single number
        followed by an optional suffix (e.g. ``"1.2.3gb"``).
    """
    if not value:
        return 0
    text = str(value).strip().lower()
    if text in ("", "-"):
        return 0
    match = _SIZE_RE.match(text)
    if not match:
        return 0
    number = match.group(1)
    # A bare number carries no suffix; treat it as bytes.
    unit = match.group(2) or "b"
    if unit not in UNITS:
        return 0
    if "." not in number:
        # Pure integers stay exact instead of round-tripping through float.
        return int(number) * UNITS[unit]
    return int(float(number) * UNITS[unit])
|
||||
|
||||
def request_json(path: str):
    """Fetch ``path`` from the OpenSearch API and return the decoded JSON.

    Args:
        path: Request path (including any query string) appended to the
            module-level ``os_url``.

    Returns:
        The response body, read as UTF-8 and parsed with ``json.loads``.

    Errors raised by ``urllib`` are not caught here; the request uses a
    30 second timeout.
    """
    target = f"{os_url}{path}"
    with urllib.request.urlopen(target, timeout=30) as resp:
        raw_body = resp.read()
    return json.loads(raw_body.decode("utf-8"))
|
||||
|
||||
def delete_index(index: str) -> None:
    """Issue an HTTP DELETE for ``index`` and log the deletion to stdout.

    Args:
        index: Name of the index, appended to the module-level ``os_url``.

    Errors raised by ``urllib`` are not caught here; the request uses a
    30 second timeout.
    """
    request = urllib.request.Request(f"{os_url}/{index}", method="DELETE")
    with urllib.request.urlopen(request, timeout=30) as resp:
        # Read the body so the response is fully consumed; it is not used.
        resp.read()
    print(f"deleted {index}")
|
||||
|
||||
def _size_bytes(raw) -> int:
    """Return a store size as bytes.

    With ``bytes=b`` the _cat API reports plain integers; anything else
    (including a missing value) is handed to ``parse_size``.
    """
    try:
        return int(raw)
    except (TypeError, ValueError):
        return parse_size(raw)


# Collect every non-hidden index matching the configured patterns.
indices = []
seen = set()
for pattern in patterns:
    try:
        # bytes=b requests exact byte counts instead of rounded "1.2gb" text.
        data = request_json(
            f"/_cat/indices/{pattern}?format=json&bytes=b&h=index,store.size,creation.date"
        )
    except urllib.error.HTTPError as exc:
        # A pattern that matches nothing is not an error.
        if exc.code == 404:
            continue
        raise
    for item in data:
        index = item.get("index")
        if not index or index.startswith("."):
            continue
        # Overlapping patterns may list the same index twice; count and
        # delete it only once.
        if index in seen:
            continue
        seen.add(index)
        size = _size_bytes(item.get("store.size"))
        created = int(item.get("creation.date", "0") or 0)
        indices.append({"index": index, "size": size, "created": created})

total = sum(item["size"] for item in indices)
print(f"total_log_bytes={total}")
if total <= limit_bytes:
    print("within limit")
    sys.exit(0)

# Delete oldest-first until the combined size fits the budget.
indices.sort(key=lambda item: item["created"])
for item in indices:
    if total <= limit_bytes:
        break
    delete_index(item["index"])
    total -= item["size"]

print(f"remaining_log_bytes={total}")
|
||||
---
|
||||
# Daily CronJob that runs prune.py from the opensearch-prune-script ConfigMap.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: opensearch-prune
  namespace: logging
spec:
  schedule: "23 3 * * *"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        spec:
          restartPolicy: OnFailure
          # Pin the pod to rpi5 worker nodes.
          nodeSelector:
            node-role.kubernetes.io/worker: "true"
            hardware: rpi5
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: hardware
                        operator: In
                        values:
                          - rpi5
          containers:
            - name: prune
              image: python:3.11-alpine
              command: ["python", "/scripts/prune.py"]
              env:
                - name: OPENSEARCH_URL
                  value: http://opensearch-master.logging.svc.cluster.local:9200
                # 800 GiB. The OpenSearch volume is 1024Gi, so a 1 TiB
                # limit could never be exceeded before the disk was full.
                # NOTE(review): chosen to stay under OpenSearch's default
                # 85% low disk watermark - confirm against cluster settings.
                - name: LOG_LIMIT_BYTES
                  value: "858993459200"
                - name: LOG_INDEX_PATTERNS
                  value: "kube-*,journald-*"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
          volumes:
            - name: scripts
              configMap:
                name: opensearch-prune-script
|
||||
@ -320,13 +320,6 @@ spec:
|
||||
timeInterval: "15s"
|
||||
uid: atlas-vm
|
||||
orgId: 2
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
url: http://loki.logging.svc.cluster.local:3100
|
||||
isDefault: false
|
||||
uid: atlas-loki
|
||||
orgId: 1
|
||||
dashboardProviders:
|
||||
dashboardproviders.yaml:
|
||||
apiVersion: 1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user