From af9ab30849620f8fa9ba5e01874e33bc872ce799 Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sat, 10 Jan 2026 00:12:55 -0300
Subject: [PATCH] logging: add trace analytics ingestion

---
 .../sources/helm/kustomization.yaml           |  1 +
 .../sources/helm/opentelemetry.yaml           |  9 ++
 scripts/logging_render_observability.py       |  2 +-
 .../logging/data-prepper-helmrelease.yaml     | 75 ++++++++++++++++
 services/logging/kustomization.yaml           |  6 +-
 .../logging/observability/applications.json   | 76 ++++++++++++----
 services/logging/opensearch-ism-job.yaml      | 12 ++-
 .../opensearch-observability-objects.yaml     | 76 ++++++++++++----
 .../opensearch-observability-setup-job.yaml   |  2 +-
 .../logging/opensearch-prune-cronjob.yaml     |  2 +-
 .../logging/otel-collector-helmrelease.yaml   | 90 +++++++++++++++++++
 11 files changed, 307 insertions(+), 44 deletions(-)
 create mode 100644 infrastructure/sources/helm/opentelemetry.yaml
 create mode 100644 services/logging/data-prepper-helmrelease.yaml
 create mode 100644 services/logging/otel-collector-helmrelease.yaml

diff --git a/infrastructure/sources/helm/kustomization.yaml b/infrastructure/sources/helm/kustomization.yaml
index 97fd70e..c8d20bb 100644
--- a/infrastructure/sources/helm/kustomization.yaml
+++ b/infrastructure/sources/helm/kustomization.yaml
@@ -8,6 +8,7 @@ resources:
   - jetstack.yaml
   - jenkins.yaml
   - mailu.yaml
+  - opentelemetry.yaml
   - opensearch.yaml
   - harbor.yaml
   - prometheus.yaml
diff --git a/infrastructure/sources/helm/opentelemetry.yaml b/infrastructure/sources/helm/opentelemetry.yaml
new file mode 100644
index 0000000..03d0b00
--- /dev/null
+++ b/infrastructure/sources/helm/opentelemetry.yaml
@@ -0,0 +1,9 @@
+# infrastructure/sources/helm/opentelemetry.yaml
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository
+metadata:
+  name: opentelemetry
+  namespace: flux-system
+spec:
+  interval: 1h
+  url: https://open-telemetry.github.io/opentelemetry-helm-charts
diff --git a/scripts/logging_render_observability.py b/scripts/logging_render_observability.py
index bd3455d..679e340 100755
--- a/scripts/logging_render_observability.py
+++ b/scripts/logging_render_observability.py
@@ -180,7 +180,7 @@ def build_objects() -> tuple[list[dict], list[dict], list[dict]]:
             "description": app.description,
             "baseQuery": app.base_query,
             "servicesEntities": [],
-            "traceGroups": [],
+            "traceGroups": [app.name],
         }
         for app in apps
     ]
diff --git a/services/logging/data-prepper-helmrelease.yaml b/services/logging/data-prepper-helmrelease.yaml
new file mode 100644
index 0000000..900b9ca
--- /dev/null
+++ b/services/logging/data-prepper-helmrelease.yaml
@@ -0,0 +1,75 @@
+# services/logging/data-prepper-helmrelease.yaml
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: data-prepper
+  namespace: logging
+spec:
+  interval: 15m
+  timeout: 10m
+  chart:
+    spec:
+      chart: data-prepper
+      version: "~0.3.1"
+      sourceRef:
+        kind: HelmRepository
+        name: opensearch
+        namespace: flux-system
+  values:
+    fullnameOverride: data-prepper
+    replicaCount: 1
+    config:
+      data-prepper-config.yaml: |
+        ssl: false
+    pipelineConfig:
+      enabled: true
+      config:
+        entry-pipeline:
+          delay: "100"
+          source:
+            otel_trace_source:
+              ssl: false
+          sink:
+            - pipeline:
+                name: "raw-pipeline"
+            - pipeline:
+                name: "service-map-pipeline"
+        raw-pipeline:
+          source:
+            pipeline:
+              name: "entry-pipeline"
+          processor:
+            - otel_traces:
+          sink:
+            - opensearch:
+                hosts: ["http://opensearch-master.logging.svc.cluster.local:9200"]
+                index_type: trace-analytics-raw
+        service-map-pipeline:
+          delay: "100"
+          source:
+            pipeline:
+              name: "entry-pipeline"
+          processor:
+            - service_map:
+          sink:
+            - opensearch:
+                hosts: ["http://opensearch-master.logging.svc.cluster.local:9200"]
+                index_type: trace-analytics-service-map
+    resources:
+      requests:
+        cpu: "200m"
+        memory: "512Mi"
+      limits:
+        memory: "1Gi"
+    nodeSelector:
+      node-role.kubernetes.io/worker: "true"
+      hardware: rpi5
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: hardware
+                  operator: In
+                  values:
+                    - rpi5
diff --git a/services/logging/kustomization.yaml b/services/logging/kustomization.yaml
index 94fc1a8..a4e0bab 100644
--- a/services/logging/kustomization.yaml
+++ b/services/logging/kustomization.yaml
@@ -3,11 +3,13 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - namespace.yaml
+  - opensearch-dashboards-objects.yaml
+  - opensearch-observability-objects.yaml
   - opensearch-pvc.yaml
   - opensearch-helmrelease.yaml
   - opensearch-dashboards-helmrelease.yaml
-  - opensearch-dashboards-objects.yaml
-  - opensearch-observability-objects.yaml
+  - data-prepper-helmrelease.yaml
+  - otel-collector-helmrelease.yaml
   - opensearch-ism-job.yaml
   - opensearch-dashboards-setup-job.yaml
   - opensearch-observability-setup-job.yaml
diff --git a/services/logging/observability/applications.json b/services/logging/observability/applications.json
index fc29e5a..8a0b397 100644
--- a/services/logging/observability/applications.json
+++ b/services/logging/observability/applications.json
@@ -4,132 +4,170 @@
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'bstein-dev-home'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "bstein-dev-home"
+    ]
   },
   {
     "name": "pegasus",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "pegasus"
+    ]
   },
   {
     "name": "jellyfin",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "jellyfin"
+    ]
   },
   {
     "name": "vaultwarden",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'vaultwarden'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "vaultwarden"
+    ]
   },
   {
     "name": "mailu",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'mailu-mailserver'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "mailu"
+    ]
   },
   {
     "name": "nextcloud",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'nextcloud'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "nextcloud"
+    ]
   },
   {
     "name": "gitea",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'gitea'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "gitea"
+    ]
   },
   {
     "name": "jenkins",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'jenkins'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "jenkins"
+    ]
   },
   {
     "name": "harbor",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'harbor'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "harbor"
+    ]
   },
   {
     "name": "vault",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'vault'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "vault"
+    ]
   },
   {
     "name": "keycloak",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'sso'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "keycloak"
+    ]
   },
   {
     "name": "flux-system",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'flux-system'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "flux-system"
+    ]
   },
   {
     "name": "comms",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "comms"
+    ]
   },
   {
     "name": "element-web",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "element-web"
+    ]
   },
   {
     "name": "element-call",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "element-call"
+    ]
   },
   {
     "name": "matrix-synapse",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "matrix-synapse"
+    ]
   },
   {
     "name": "livekit",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "livekit"
+    ]
   },
   {
     "name": "coturn",
     "description": "",
     "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "coturn"
+    ]
   },
   {
     "name": "lesavka",
     "description": "",
     "baseQuery": "source = journald-* | where _HOSTNAME = 'titan-jh'",
     "servicesEntities": [],
-    "traceGroups": []
+    "traceGroups": [
+      "lesavka"
+    ]
   }
 ]
diff --git a/services/logging/opensearch-ism-job.yaml b/services/logging/opensearch-ism-job.yaml
index c800677..3313571 100644
--- a/services/logging/opensearch-ism-job.yaml
+++ b/services/logging/opensearch-ism-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: opensearch-ism-setup-4
+  name: opensearch-ism-setup-5
   namespace: logging
 spec:
   backoffLimit: 3
@@ -48,6 +48,11 @@ spec:
                 -H 'Content-Type: application/json' \
                 -d "${policy}" >/dev/null
 
+              trace_policy='{"policy":{"description":"Delete trace analytics after 30 days","schema_version":1,"default_state":"hot","states":[{"name":"hot","actions":[],"transitions":[{"state_name":"delete","conditions":{"min_index_age":"30d"}}]},{"name":"delete","actions":[{"delete":{}}],"transitions":[]}]}}'
+              curl -sS -X PUT "${OS_URL}/_plugins/_ism/policies/trace-analytics-30d" \
+                -H 'Content-Type: application/json' \
+                -d "${trace_policy}" >/dev/null
+
               kube_template='{"index_patterns":["kube-*"],"priority":200,"template":{"settings":{"index.number_of_shards":1,"index.number_of_replicas":0,"index.refresh_interval":"30s","plugins.index_state_management.policy_id":"logging-180d"},"mappings":{"properties":{"@timestamp":{"type":"date"}}}}}'
               curl -sS -X PUT "${OS_URL}/_index_template/kube-logs" \
                 -H 'Content-Type: application/json' \
@@ -58,6 +63,11 @@ spec:
                 -H 'Content-Type: application/json' \
                 -d "${journal_template}" >/dev/null
 
+              trace_template='{"index_patterns":["trace-analytics-*"],"priority":200,"template":{"settings":{"index.number_of_shards":1,"index.number_of_replicas":0,"index.refresh_interval":"30s","plugins.index_state_management.policy_id":"trace-analytics-30d"}}}'
+              curl -sS -X PUT "${OS_URL}/_index_template/trace-analytics" \
+                -H 'Content-Type: application/json' \
+                -d "${trace_template}" >/dev/null
+
               curl -sS -X PUT "${OS_URL}/_all/_settings" \
                 -H 'Content-Type: application/json' \
                 -d '{"index":{"number_of_replicas":0}}' >/dev/null
diff --git a/services/logging/opensearch-observability-objects.yaml b/services/logging/opensearch-observability-objects.yaml
index 0fd31c5..19ed195 100644
--- a/services/logging/opensearch-observability-objects.yaml
+++ b/services/logging/opensearch-observability-objects.yaml
@@ -13,133 +13,171 @@ data:
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'bstein-dev-home'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "bstein-dev-home"
+        ]
       },
       {
         "name": "pegasus",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "pegasus"
+        ]
       },
       {
         "name": "jellyfin",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "jellyfin"
+        ]
       },
       {
         "name": "vaultwarden",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'vaultwarden'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "vaultwarden"
+        ]
       },
       {
         "name": "mailu",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'mailu-mailserver'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "mailu"
+        ]
       },
       {
         "name": "nextcloud",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'nextcloud'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "nextcloud"
+        ]
       },
       {
         "name": "gitea",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'gitea'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "gitea"
+        ]
       },
       {
         "name": "jenkins",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'jenkins'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "jenkins"
+        ]
       },
       {
         "name": "harbor",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'harbor'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "harbor"
+        ]
       },
       {
         "name": "vault",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'vault'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "vault"
+        ]
       },
       {
         "name": "keycloak",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'sso'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "keycloak"
+        ]
       },
       {
         "name": "flux-system",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'flux-system'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "flux-system"
+        ]
       },
       {
         "name": "comms",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "comms"
+        ]
       },
       {
         "name": "element-web",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "element-web"
+        ]
       },
       {
         "name": "element-call",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "element-call"
+        ]
       },
       {
         "name": "matrix-synapse",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "matrix-synapse"
+        ]
       },
       {
         "name": "livekit",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "livekit"
+        ]
       },
       {
         "name": "coturn",
         "description": "",
         "baseQuery": "source = kube-* | where kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "coturn"
+        ]
       },
       {
         "name": "lesavka",
         "description": "",
         "baseQuery": "source = journald-* | where _HOSTNAME = 'titan-jh'",
         "servicesEntities": [],
-        "traceGroups": []
+        "traceGroups": [
+          "lesavka"
+        ]
       }
     ]
   saved_queries.json: |
diff --git a/services/logging/opensearch-observability-setup-job.yaml b/services/logging/opensearch-observability-setup-job.yaml
index cf96878..75e65b2 100644
--- a/services/logging/opensearch-observability-setup-job.yaml
+++ b/services/logging/opensearch-observability-setup-job.yaml
@@ -150,7 +150,7 @@ data:
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: opensearch-observability-setup-1
+  name: opensearch-observability-setup-2
   namespace: logging
 spec:
   backoffLimit: 3
diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml
index 74e2837..83aee1a 100644
--- a/services/logging/opensearch-prune-cronjob.yaml
+++ b/services/logging/opensearch-prune-cronjob.yaml
@@ -122,7 +122,7 @@ spec:
                 - name: LOG_LIMIT_BYTES
                   value: "1099511627776"
                 - name: LOG_INDEX_PATTERNS
-                  value: "kube-*,journald-*"
+                  value: "kube-*,journald-*,trace-analytics-*"
               volumeMounts:
                 - name: scripts
                   mountPath: /scripts
diff --git a/services/logging/otel-collector-helmrelease.yaml b/services/logging/otel-collector-helmrelease.yaml
new file mode 100644
index 0000000..b6346cc
--- /dev/null
+++ b/services/logging/otel-collector-helmrelease.yaml
@@ -0,0 +1,90 @@
+# services/logging/otel-collector-helmrelease.yaml
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: otel-collector
+  namespace: logging
+spec:
+  interval: 15m
+  timeout: 10m
+  chart:
+    spec:
+      chart: opentelemetry-collector
+      version: "~0.143.0"
+      sourceRef:
+        kind: HelmRepository
+        name: opentelemetry
+        namespace: flux-system
+  values:
+    fullnameOverride: otel-collector
+    mode: deployment
+    # The chart ships no default image since 0.89.0; rendering fails unless image.repository is set.
+    image:
+      repository: otel/opentelemetry-collector-contrib
+    replicaCount: 1
+    ports:
+      otlp:
+        enabled: true
+        containerPort: 4317
+        servicePort: 4317
+        protocol: TCP
+      otlp-http:
+        enabled: true
+        containerPort: 4318
+        servicePort: 4318
+        protocol: TCP
+      jaeger-compact:
+        enabled: false
+      jaeger-thrift:
+        enabled: false
+      jaeger-grpc:
+        enabled: false
+      zipkin:
+        enabled: false
+      metrics:
+        enabled: false
+    config:
+      receivers:
+        otlp:
+          protocols:
+            grpc:
+              endpoint: ${env:MY_POD_IP}:4317
+            http:
+              endpoint: ${env:MY_POD_IP}:4318
+      processors:
+        memory_limiter:
+          check_interval: 5s
+          limit_percentage: 80
+          spike_limit_percentage: 25
+        batch: {}
+      exporters:
+        otlp/data-prepper:
+          endpoint: data-prepper.logging.svc.cluster.local:21890
+          tls:
+            insecure: true
+      service:
+        extensions:
+          - health_check
+        pipelines:
+          traces:
+            receivers: [otlp]
+            processors: [memory_limiter, batch]
+            exporters: [otlp/data-prepper]
+    resources:
+      requests:
+        cpu: "100m"
+        memory: "256Mi"
+      limits:
+        memory: "512Mi"
+    nodeSelector:
+      node-role.kubernetes.io/worker: "true"
+      hardware: rpi5
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: hardware
+                  operator: In
+                  values:
+                    - rpi5