logging: tune opensearch for single-node recovery
This commit is contained in:
parent
e3c05095f8
commit
939231dd6a
@ -18,6 +18,7 @@ resources:
|
||||
- oneoffs/opensearch-ism-job.yaml
|
||||
- oneoffs/opensearch-dashboards-setup-job.yaml
|
||||
- oneoffs/opensearch-observability-setup-job.yaml
|
||||
- opensearch-single-node-tune-cronjob.yaml
|
||||
- opensearch-prune-cronjob.yaml
|
||||
- fluent-bit-helmrelease.yaml
|
||||
- node-log-rotation-daemonset.yaml
|
||||
@ -52,6 +53,12 @@ configMapGenerator:
|
||||
- prune.py=scripts/opensearch_prune.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: opensearch-single-node-tune-script
|
||||
namespace: logging
|
||||
files:
|
||||
- tune.py=scripts/opensearch_single_node_tune.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: opensearch-observability-script
|
||||
namespace: logging
|
||||
files:
|
||||
|
||||
50
services/logging/opensearch-single-node-tune-cronjob.yaml
Normal file
50
services/logging/opensearch-single-node-tune-cronjob.yaml
Normal file
@ -0,0 +1,50 @@
|
||||
# services/logging/opensearch-single-node-tune-cronjob.yaml
#
# Periodically runs scripts/opensearch_single_node_tune.py (mounted from the
# opensearch-single-node-tune-script ConfigMap as /scripts/tune.py) against
# the in-cluster OpenSearch service.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: opensearch-single-node-tune
  namespace: logging
spec:
  # Every 30 minutes.
  schedule: "*/30 * * * *"
  # Never start a new run while the previous one is still active.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        spec:
          restartPolicy: OnFailure
          nodeSelector:
            node-role.kubernetes.io/worker: "true"
            hardware: rpi5
          # Repeats the hardware=rpi5 constraint already set in nodeSelector.
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: hardware
                        operator: In
                        values:
                          - rpi5
          containers:
            - name: tune
              image: python:3.11-alpine
              command: ["python", "/scripts/tune.py"]
              env:
                # Read by the script as its OpenSearch base URL.
                - name: OPENSEARCH_URL
                  value: http://opensearch-master.logging.svc.cluster.local:9200
              resources:
                requests:
                  cpu: 10m
                  memory: 32Mi
                limits:
                  cpu: 200m
                  memory: 128Mi
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
          volumes:
            - name: scripts
              configMap:
                name: opensearch-single-node-tune-script
|
||||
111
services/logging/scripts/opensearch_single_node_tune.py
Normal file
111
services/logging/scripts/opensearch_single_node_tune.py
Normal file
@ -0,0 +1,111 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
# Base URL of the OpenSearch HTTP endpoint, taken from OPENSEARCH_URL when set.
# Trailing slashes are stripped so request paths (which start with "/") can be
# appended directly.
OS_URL = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/")
|
||||
|
||||
|
||||
def request_json(path, method="GET", payload=None):
    """Send one HTTP request to OpenSearch and return the decoded JSON reply.

    Args:
        path: Request path (plus any query string) appended to ``OS_URL``.
        method: HTTP verb to use.
        payload: Optional JSON-serialisable object sent as the request body.

    Returns:
        The parsed JSON response, or an empty dict when the body is empty.

    Exceptions raised by ``urllib.request.urlopen`` or ``json.loads`` are not
    caught here and propagate to the caller.
    """
    if payload is None:
        body_bytes = None
        extra_headers = {}
    else:
        body_bytes = json.dumps(payload).encode("utf-8")
        extra_headers = {"Content-Type": "application/json"}

    req = urllib.request.Request(
        f"{OS_URL}{path}",
        data=body_bytes,
        headers=extra_headers,
        method=method,
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        text = resp.read().decode("utf-8")

    if not text:
        return {}
    return json.loads(text)
|
||||
|
||||
|
||||
def wait_for_opensearch():
    """Block until OpenSearch answers ``GET /`` or the retry budget runs out.

    Makes up to 60 attempts, sleeping 5 seconds after each failed one.

    Raises:
        RuntimeError: If no attempt succeeds; chained from the last error seen.
    """
    last_error = None
    for _ in range(60):
        try:
            request_json("/")
            return
        # urllib wraps only the errors raised while sending the request in
        # URLError. Timeouts and dropped connections that happen while the
        # response is being received surface as plain OSError subclasses, so
        # catch OSError too (URLError is itself an OSError subclass).
        except (urllib.error.URLError, OSError) as error:
            last_error = error
            time.sleep(5)
    raise RuntimeError("OpenSearch did not become reachable") from last_error
|
||||
|
||||
|
||||
def put(path, payload):
    """PUT ``payload`` to ``path`` and fail if OpenSearch does not acknowledge it.

    A response without an ``acknowledged`` key is treated as acknowledged.

    Raises:
        RuntimeError: If the response carries a falsy ``acknowledged`` value.
    """
    reply = request_json(path, method="PUT", payload=payload)
    acknowledged = reply.get("acknowledged", True)
    if acknowledged:
        return
    raise RuntimeError(f"OpenSearch did not acknowledge {path}: {reply}")
|
||||
|
||||
|
||||
def ensure_policy(policy_id, description, min_index_age):
    """Create or update an ISM policy that deletes indices after a given age.

    The policy has two states: ``hot`` (no actions) which transitions to
    ``delete`` once the index is at least ``min_index_age`` old, and ``delete``
    which removes the index.

    Args:
        policy_id: ISM policy identifier, used in the request path.
        description: Human-readable policy description.
        min_index_age: Age threshold for the hot -> delete transition
            (for example ``"180d"``).

    Raises:
        urllib.error.HTTPError: If looking up the policy fails with any status
            other than 404, or if the PUT itself is rejected.
        RuntimeError: Propagated from ``put`` when the request is not
            acknowledged.
    """
    payload = {
        "policy": {
            "description": description,
            "schema_version": 1,
            "default_state": "hot",
            "states": [
                {
                    "name": "hot",
                    "actions": [],
                    "transitions": [
                        {
                            "state_name": "delete",
                            "conditions": {"min_index_age": min_index_age},
                        }
                    ],
                },
                {
                    "name": "delete",
                    "actions": [{"delete": {}}],
                    "transitions": [],
                },
            ],
        }
    }
    path = f"/_plugins/_ism/policies/{policy_id}"

    # This script runs repeatedly. ISM rejects a plain PUT to an existing
    # policy id with a version conflict, so look the policy up first and, if
    # it exists, send its current sequence number and primary term so the PUT
    # is treated as an update.
    try:
        existing = request_json(path)
    except urllib.error.HTTPError as error:
        if error.code != 404:
            raise
        existing = None

    if existing is not None:
        seq_no = existing.get("_seq_no")
        primary_term = existing.get("_primary_term")
        if seq_no is not None and primary_term is not None:
            path = f"{path}?if_seq_no={seq_no}&if_primary_term={primary_term}"

    put(path, payload)
|
||||
|
||||
|
||||
def ensure_template(name, patterns, priority, policy_id=None):
    """PUT a composable index template with single-node-friendly settings.

    Every template sets one primary shard, zero replicas and a 30 second
    refresh interval. When ``policy_id`` is truthy, the ISM ``policy_id``
    index setting is added as well.

    Args:
        name: Index template name, used in the request path.
        patterns: List of index patterns the template applies to.
        priority: Template priority.
        policy_id: Optional ISM policy id to include in the index settings.
    """
    index_settings = {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "refresh_interval": "30s",
    }
    if policy_id:
        index_settings["plugins"] = {
            "index_state_management": {"policy_id": policy_id},
        }

    template_body = {
        "index_patterns": patterns,
        "priority": priority,
        "template": {"settings": {"index": index_settings}},
    }
    put(f"/_index_template/{name}", template_body)
|
||||
|
||||
|
||||
def ensure_single_node_replicas():
    """Set ``number_of_replicas`` to 0 on all indices matched by ``*``."""
    # A one-node OpenSearch cluster cannot allocate replicas; leaving them at 1
    # makes the cluster permanently yellow and keeps Data Prepper waiting.
    zero_replicas = {"index": {"number_of_replicas": 0}}
    put("/*/_settings?expand_wildcards=all", zero_replicas)
|
||||
|
||||
|
||||
def main():
    """Wait for OpenSearch, then apply policies, templates and replica settings.

    Steps run in a fixed order: retention policies first, then the index
    templates that reference them, then the replica update on existing
    indices. A marker line is printed once every step has completed.
    """
    # (policy_id, description, min_index_age)
    policies = (
        ("logging-180d", "Delete logs after 180 days", "180d"),
        ("trace-analytics-30d", "Delete trace analytics after 30 days", "30d"),
    )
    # (name, patterns, priority, policy_id)
    templates = (
        ("kube-logs", ["kube-*"], 200, "logging-180d"),
        ("journald-logs", ["journald-*"], 200, "logging-180d"),
        ("trace-analytics", ["trace-analytics-*"], 200, "trace-analytics-30d"),
        ("otel-v1-apm-span-index-template", ["otel-v1-apm-span-*"], 250, "trace-analytics-30d"),
        ("otel-v1-apm-service-map-index-template", ["otel-v1-apm-service-map"], 250, "trace-analytics-30d"),
        ("opendistro-ism-history", [".opendistro-ism-*"], 250, None),
    )

    wait_for_opensearch()
    for policy_args in policies:
        ensure_policy(*policy_args)
    for template_args in templates:
        ensure_template(*template_args)
    ensure_single_node_replicas()
    print("opensearch_single_node_tune_ok")


if __name__ == "__main__":
    main()
|
||||
Loading…
x
Reference in New Issue
Block a user