# ariadne/ariadne/services/testing_triage.py
#
# File-viewer header from the original listing: 462 lines, 17 KiB, Python.

from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
import json
from typing import Any
import httpx
from ..db.storage import Storage
from ..settings import settings
from ..utils.logging import get_logger
from .cluster_state import collect_cluster_state
# Module-level logger named after this module.
logger = get_logger(__name__)
# Event type under which triage bundles are recorded in, and read back from, storage.
TRIAGE_EVENT_TYPE = "testing_triage_bundle"
# Regex alternation of status labels; interpolated into the `status!~"..."`
# matchers of the quality-gate queries so these statuses are excluded.
_SUCCESS_STATUS = "ok|passed|success|not_applicable|skipped|na|n/a"
# Value of the `tree` query parameter sent to the Jenkins JSON API: selects
# name/url/color plus lastBuild and lastFailedBuild fields for top-level jobs
# and for one nested level of child jobs.
_JENKINS_TREE = (
"jobs[name,url,color,lastBuild[number,result,timestamp,duration,url],"
"lastFailedBuild[number,timestamp,url],jobs[name,url,color,"
"lastBuild[number,result,timestamp,duration,url],lastFailedBuild[number,timestamp,url]]]"
)
# Console log tails keep at most this many trailing lines...
_MAX_JENKINS_LOG_LINES = 80
# ...and at most this many trailing characters after the line cut.
_MAX_JENKINS_LOG_CHARS = 12000
# Upper bound applied to evidence lists before they enter the bundle.
_MAX_EVIDENCE_ITEMS = 12
@dataclass(frozen=True)
class TestingTriageSummary:
    """Compact, immutable outcome of one testing triage run.

    Inputs: values taken from the ``summary`` section of a triage bundle.
    Outputs: the record returned (and logged) by ``run_testing_triage``.
    """

    # Overall status string copied from the bundle summary.
    status: str
    # Number of problems counted in the bundle summary.
    problem_count: int
    # Suite names reported as failed in the bundle summary.
    failed_suites: list[str]
def latest_testing_triage_bundle(storage: Storage) -> dict[str, Any] | None:
    """Fetch the newest persisted testing triage bundle.

    Inputs: a storage handle exposing ``list_events``.
    Outputs: the bundle dict, or ``None`` when no event exists or the event
    detail is neither a dict nor JSON text that decodes to a dict.
    """
    events = storage.list_events(limit=1, event_type=TRIAGE_EVENT_TYPE)
    if not events:
        return None
    detail = events[0].get("detail")
    if isinstance(detail, str):
        # The detail may be JSON text; decode it before the dict check below.
        try:
            detail = json.loads(detail)
        except json.JSONDecodeError:
            return None
    return detail if isinstance(detail, dict) else None
def run_testing_triage(storage: Storage) -> TestingTriageSummary:
    """Collect a triage bundle, record it as an event, and report its summary.

    Inputs: a storage handle used for evidence lookup and event recording.
    Outputs: a ``TestingTriageSummary`` built from the bundle's ``summary``
    section, with defaults when fields are missing.
    """
    bundle = collect_testing_triage(storage)
    storage.record_event(TRIAGE_EVENT_TYPE, bundle)

    raw_summary = bundle.get("summary")
    summary = raw_summary if isinstance(raw_summary, dict) else {}
    suite_names = [str(name) for name in summary.get("failed_suites") or []]
    result = TestingTriageSummary(
        status=str(summary.get("status") or "unknown"),
        problem_count=int(summary.get("problem_count") or 0),
        failed_suites=suite_names,
    )

    # Structured fields travel in ``extra``; suites are flattened to one string.
    log_fields = {
        "event": "testing_triage",
        "status": result.status,
        "problem_count": result.problem_count,
        "failed_suites": ",".join(result.failed_suites),
    }
    logger.info("testing triage bundle stored", extra=log_fields)
    return result
def collect_testing_triage(storage: Storage | None = None) -> dict[str, Any]:
    """Assemble a bounded evidence bundle for testing triage.

    Inputs: an optional storage handle for the latest persisted cluster
    snapshot; quality and Jenkins signals are read through their collectors.
    Outputs: a dict carrying the summary, the raw evidence sections, guidance
    for OpenClaw, the list of collection errors (``unknowns``), and a Markdown
    rendering of the same data.
    """
    errors: list[str] = []
    generated_at = datetime.now(timezone.utc).isoformat()

    # Collectors append their failures to ``errors`` instead of raising.
    snapshot = _latest_cluster_snapshot(storage, errors)
    quality = _quality_signals(errors)
    jenkins = _jenkins_signals(errors)
    cluster = _cluster_evidence(snapshot)
    summary = _summary(cluster, quality, jenkins, errors)

    evidence = {
        "cluster": cluster,
        "quality": quality,
        "jenkins": jenkins,
    }
    openclaw = {
        "ariadne_latest_url": "http://ariadne.maintenance.svc.cluster.local/api/internal/testing/triage/latest",
        "instructions": [
            "Treat this bundle as the primary evidence source.",
            "Summarize root cause, blast radius, and smallest Flux/IaC change.",
            "Do not read Kubernetes Secrets or run mutating kubectl commands.",
            "Only run extra read-only commands when the bundle is stale or ambiguous.",
        ],
    }
    bundle: dict[str, Any] = {
        "kind": "testing_triage_bundle",
        "generated_at": generated_at,
        "summary": summary,
        "evidence": evidence,
        "openclaw": openclaw,
        "unknowns": errors,
    }
    # Markdown is rendered last so it can read every other bundle field.
    bundle["markdown"] = _render_markdown(bundle)
    return bundle
def _latest_cluster_snapshot(storage: Storage | None, errors: list[str]) -> dict[str, Any]:
if storage is not None:
try:
snapshot = storage.latest_cluster_state()
if isinstance(snapshot, dict) and snapshot:
return snapshot
except Exception as exc:
errors.append(f"cluster_state_latest: {exc}")
try:
snapshot, _summary = collect_cluster_state()
return snapshot
except Exception as exc:
errors.append(f"cluster_state_collect: {exc}")
return {}
def _cluster_evidence(snapshot: dict[str, Any]) -> dict[str, Any]:
    """Extract the bounded cluster evidence section from a snapshot.

    Inputs: a cluster snapshot dict; sections that are missing or not dicts
    are treated as empty.
    Outputs: a dict of health bullets, node counts, and length-limited lists
    of Flux, pod, job, and event findings.
    """

    def section(key: str) -> dict[str, Any]:
        # Any non-dict section is replaced by an empty mapping.
        value = snapshot.get(key)
        return value if isinstance(value, dict) else {}

    summary = section("summary")
    flux = section("flux")
    pod_issues = section("pod_issues")
    jobs = section("jobs")
    events = section("events")
    nodes = section("nodes_summary")

    node_counts = {
        "total": nodes.get("total"),
        "ready": nodes.get("ready"),
        "not_ready": nodes.get("not_ready"),
        "not_ready_names": nodes.get("not_ready_names") or [],
    }
    return {
        "collected_at": snapshot.get("collected_at") or "",
        "health_bullets": _limit(summary.get("health_bullets")),
        "attention_ranked": _limit(summary.get("attention_ranked")),
        "nodes": node_counts,
        "flux_not_ready": _limit(flux.get("items")),
        "pod_issues": _limit(pod_issues.get("items")),
        "pending_oldest": _limit(pod_issues.get("pending_oldest")),
        "jobs_failing": _limit(jobs.get("failing")),
        "jobs_active_oldest": _limit(jobs.get("active_oldest")),
        "events_recent": _limit(events.get("warnings_recent")),
    }
def _quality_signals(errors: list[str]) -> dict[str, Any]:
    """Run the fixed set of quality queries and collect their rows.

    Inputs: the shared ``errors`` list, passed through to ``_vm_items``.
    Outputs: a dict keyed by signal name; each value holds the ``query`` text
    and the ``items`` returned for it.
    """
    failed_runs = (
        'topk(12, sum by (suite) (increase(platform_quality_gate_runs_total'
        f'{{exported_job="platform-quality-ci",status!~"{_SUCCESS_STATUS}"}}[24h])))'
    )
    failing_checks = (
        'topk(20, sum by (suite,check,status) (increase({__name__=~".*_quality_gate_checks_total",'
        f'exported_job="platform-quality-ci",status!~"{_SUCCESS_STATUS}"}}[24h])))'
    )
    problem_tests = (
        'topk(20, sum by (suite,test,status) (increase(platform_quality_gate_test_case_result'
        '{exported_job="platform-quality-ci",test!="",test!="__no_test_cases__",status="failed"}[24h])))'
    )
    weather_failures = (
        "topk(12, max by (exported_job,job_url,weather_icon) "
        "(ariadne_jenkins_build_weather_job_last_status != 1))"
    )
    named_queries = (
        ("failed_runs_24h", failed_runs),
        ("failing_checks_24h", failing_checks),
        ("problem_tests_24h", problem_tests),
        ("jenkins_weather_failures", weather_failures),
    )
    signals: dict[str, Any] = {}
    for name, query in named_queries:
        signals[name] = {"query": query, "items": _vm_items(query, errors)}
    return signals
def _vm_items(query: str, errors: list[str]) -> list[dict[str, Any]]:
    """Run one query against the configured metrics endpoint.

    Inputs: the query text sent to ``<vm_url>/api/v1/query`` and the shared
    ``errors`` list.
    Outputs: at most ``_MAX_EVIDENCE_ITEMS`` rows normalized by ``_vm_item``.
    Returns ``[]`` when no URL is configured, when the request fails, or when
    the response is not a successful, well-formed payload; failures are
    appended to ``errors`` rather than raised.
    """
    base_url = settings.vm_url.strip().rstrip("/")
    if not base_url:
        return []
    try:
        with httpx.Client(timeout=settings.cluster_state_vm_timeout_sec) as client:
            response = client.get(f"{base_url}/api/v1/query", params={"query": query})
            response.raise_for_status()
            payload = response.json()
    except Exception as exc:
        errors.append(f"victoria_metrics: {exc}")
        return []
    # The body is valid JSON here but not necessarily an object; a list or
    # scalar must be reported as a failed query instead of raising.
    if not isinstance(payload, dict) or payload.get("status") != "success":
        errors.append("victoria_metrics: query failed")
        return []
    # ``data`` may be absent or null; only a dict can carry ``result``.
    data = payload.get("data")
    result = data.get("result") if isinstance(data, dict) else None
    rows = result if isinstance(result, list) else []
    return [_vm_item(row) for row in rows[:_MAX_EVIDENCE_ITEMS] if isinstance(row, dict)]
def _vm_item(row: dict[str, Any]) -> dict[str, Any]:
    """Normalize one query result row.

    Inputs: a row dict with optional ``metric`` (dict) and ``value`` (list).
    Outputs: ``{"labels", "value"}`` where labels omit keys starting with
    ``__`` and value is the second element of ``value`` converted to float.
    """
    raw_metric = row.get("metric")
    raw_sample = row.get("value")
    metric = raw_metric if isinstance(raw_metric, dict) else {}
    sample = raw_sample if isinstance(raw_sample, list) else []

    visible: dict[str, Any] = {}
    for label, text in metric.items():
        if label.startswith("__"):
            continue
        visible[label] = text

    # The reading sits at index 1; a shorter list yields the fallback value.
    reading = sample[1] if len(sample) > 1 else None
    return {"labels": visible, "value": _float_value(reading)}
def _jenkins_signals(errors: list[str]) -> dict[str, Any]:
    """Collect Jenkins jobs that need attention.

    Inputs: the shared ``errors`` list.
    Outputs: ``{"failed_builds": [...]}`` holding jobs whose status is
    ``failure``, ``running``, or ``unknown``, most recent run first and
    limited to ``_MAX_EVIDENCE_ITEMS``. The first three get a console log
    tail attached. The list is empty when Jenkins is not configured or the
    job listing fails.
    """
    base_url = settings.jenkins_base_url.strip().rstrip("/")
    if not base_url:
        return {"failed_builds": []}
    try:
        jobs = _fetch_jenkins_jobs(base_url)
    except Exception as exc:
        errors.append(f"jenkins: {exc}")
        return {"failed_builds": []}

    attention_states = {"failure", "running", "unknown"}
    flagged = sorted(
        (job for job in jobs if job.get("status") in attention_states),
        key=lambda job: -(job.get("last_run_ts") or 0),
    )
    # Log tails are fetched only for the three most recent entries.
    for job in flagged[:3]:
        _attach_jenkins_log_tail(job, errors)
    return {"failed_builds": flagged[:_MAX_EVIDENCE_ITEMS]}
def _fetch_jenkins_jobs(base_url: str) -> list[dict[str, Any]]:
    """Fetch and normalize the Jenkins job listing.

    Inputs: the Jenkins base URL without a trailing slash.
    Outputs: one normalized dict per job that ``_jenkins_job`` accepts.
    Raises: whatever the HTTP client raises, including non-2xx responses via
    ``raise_for_status``; the caller handles it.
    """
    credentials = _jenkins_auth()
    client_options: dict[str, Any] = {
        "timeout": settings.jenkins_api_timeout_sec,
        "follow_redirects": True,
    }
    if credentials is not None:
        client_options["auth"] = credentials

    with httpx.Client(**client_options) as client:
        response = client.get(f"{base_url}/api/json", params={"tree": _JENKINS_TREE})
        response.raise_for_status()
        payload = response.json()

    listed = payload.get("jobs") if isinstance(payload, dict) else None
    items = listed if isinstance(listed, list) else []
    normalized = (_jenkins_job(row) for row in _flatten_jobs(items))
    return [job for job in normalized if job is not None]
def _flatten_jobs(items: list[Any], prefix: str = "") -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in items:
if not isinstance(item, dict):
continue
name = item.get("name")
if not isinstance(name, str) or not name:
continue
full_name = f"{prefix}/{name}" if prefix else name
children = item.get("jobs") if isinstance(item.get("jobs"), list) else []
if children:
output.extend(_flatten_jobs(children, full_name))
if isinstance(item.get("lastBuild"), dict):
entry = dict(item)
entry["name"] = full_name
output.append(entry)
return output
def _jenkins_job(raw: dict[str, Any]) -> dict[str, Any] | None:
    """Normalize one flattened Jenkins job entry.

    Inputs: a raw job dict carrying ``name``, ``url``, and optionally
    ``lastBuild``.
    Outputs: a dict with job identity, derived status, last-build number,
    timing in seconds, and the console text URL; ``None`` when ``name`` or
    ``url`` is not a string.
    """
    name, url = raw.get("name"), raw.get("url")
    if not (isinstance(name, str) and isinstance(url, str)):
        return None

    build = raw.get("lastBuild")
    if not isinstance(build, dict):
        build = {}
    result = str(build.get("result") or "").upper()
    status = _jenkins_status(raw, result)
    # The console URL uses the build URL when present, else the job URL.
    console_base = str(build.get("url") or url).rstrip("/")
    return {
        "job": name,
        "job_url": url,
        "status": status,
        "result": result or "UNKNOWN",
        "last_build_number": build.get("number"),
        "last_run_ts": _millis_to_seconds(build.get("timestamp")),
        "last_duration_seconds": _millis_to_seconds(build.get("duration")),
        "console_url": console_base + "/consoleText",
    }
def _jenkins_status(raw: dict[str, Any], result: str) -> str:
color = str(raw.get("color") or "").lower()
if color.endswith("_anime"):
return "running"
if result == "SUCCESS":
return "success"
if result in {"FAILURE", "ABORTED", "UNSTABLE", "NOT_BUILT"}:
return "failure"
if color.startswith(("blue", "green")):
return "success"
if color.startswith(("red", "yellow")):
return "failure"
return "unknown"
def _attach_jenkins_log_tail(job: dict[str, Any], errors: list[str]) -> None:
    """Fetch a job's console text and store its tail on the job dict.

    Inputs: a normalized job dict (``console_url`` is read, ``log_tail`` is
    written) and the shared ``errors`` list.
    Outputs: none; on any fetch failure a ``jenkins_log:<job>`` entry is
    appended to ``errors`` and the job is left without ``log_tail``.
    """
    console_url = job.get("console_url")
    if not (isinstance(console_url, str) and console_url):
        return

    credentials = _jenkins_auth()
    client_options: dict[str, Any] = {
        "timeout": settings.jenkins_api_timeout_sec,
        "follow_redirects": True,
    }
    if credentials is not None:
        client_options["auth"] = credentials

    try:
        with httpx.Client(**client_options) as client:
            response = client.get(console_url)
            response.raise_for_status()
            job["log_tail"] = _tail_text(response.text)
    except Exception as exc:
        errors.append(f"jenkins_log:{job.get('job')}: {exc}")
def _tail_text(text: str) -> str:
    """Return the bounded tail of a log.

    Inputs: the full log text.
    Outputs: the last ``_MAX_JENKINS_LOG_LINES`` lines joined by newlines,
    further cut to the last ``_MAX_JENKINS_LOG_CHARS`` characters if longer.
    """
    recent_lines = text.splitlines()[-_MAX_JENKINS_LOG_LINES:]
    joined = "\n".join(recent_lines)
    if len(joined) > _MAX_JENKINS_LOG_CHARS:
        joined = joined[-_MAX_JENKINS_LOG_CHARS:]
    return joined
def _summary(
    cluster: dict[str, Any],
    quality: dict[str, Any],
    jenkins: dict[str, Any],
    errors: list[str],
) -> dict[str, Any]:
    """Condense the evidence sections into headline numbers.

    Inputs: the cluster, quality, and Jenkins evidence plus the error list.
    Outputs: status (``needs_attention`` when any problem or error exists,
    else ``ok``), the problem count, sorted failed suite names, the cluster
    collection timestamp, and the number of errors.
    """
    failed_suites = sorted(_failed_suites(quality))
    # Only these six lists contribute to the problem count.
    counted = (
        cluster.get("flux_not_ready"),
        cluster.get("pod_issues"),
        cluster.get("jobs_failing"),
        quality.get("failed_runs_24h", {}).get("items"),
        quality.get("failing_checks_24h", {}).get("items"),
        jenkins.get("failed_builds"),
    )
    problem_count = sum(len(entries or []) for entries in counted)
    needs_attention = bool(problem_count or errors)
    return {
        "status": "needs_attention" if needs_attention else "ok",
        "problem_count": problem_count,
        "failed_suites": failed_suites,
        "cluster_collected_at": cluster.get("collected_at") or "",
        "unknown_count": len(errors),
    }
def _failed_suites(quality: dict[str, Any]) -> set[str]:
suites: set[str] = set()
for bucket in quality.values():
if not isinstance(bucket, dict):
continue
for item in bucket.get("items") or []:
labels = item.get("labels") if isinstance(item, dict) else {}
suite = labels.get("suite") if isinstance(labels, dict) else None
if isinstance(suite, str) and suite:
suites.add(suite)
return suites
def _render_markdown(bundle: dict[str, Any]) -> str:
    """Render a triage bundle as a short Markdown report.

    Inputs: the bundle dict; sections of the wrong type are treated as empty.
    Outputs: Markdown text with header, Cluster, Quality, and Jenkins
    sections, plus an Unknowns section when errors were recorded. The text
    always ends with a single newline.
    """

    def as_dict(value: Any) -> dict[str, Any]:
        return value if isinstance(value, dict) else {}

    summary = as_dict(bundle.get("summary"))
    evidence = as_dict(bundle.get("evidence"))
    cluster = as_dict(evidence.get("cluster"))
    quality = as_dict(evidence.get("quality"))
    jenkins = as_dict(evidence.get("jenkins"))

    lines: list[str] = [
        "# Testing Triage Evidence",
        "",
        f"- Generated: {bundle.get('generated_at')}",
        f"- Status: {summary.get('status')}",
        f"- Problem count: {summary.get('problem_count')}",
        f"- Failed suites: {', '.join(summary.get('failed_suites') or []) or 'none'}",
        "",
        "## Cluster",
    ]
    lines.extend(_markdown_items(cluster.get("health_bullets")))
    lines.extend(_markdown_named_items("Flux", cluster.get("flux_not_ready"), "name"))
    lines.extend(_markdown_named_items("Pods", cluster.get("pod_issues"), "pod"))
    lines.extend(["", "## Quality"])
    lines.extend(_markdown_quality(quality))
    lines.extend(["", "## Jenkins"])
    lines.extend(_markdown_named_items("Failed builds", jenkins.get("failed_builds"), "job"))

    raw_unknowns = bundle.get("unknowns")
    unknowns = raw_unknowns if isinstance(raw_unknowns, list) else []
    if unknowns:
        lines.append("")
        lines.append("## Unknowns")
        lines.extend(_markdown_items(unknowns))
    return "\n".join(lines).strip() + "\n"
def _markdown_items(items: Any) -> list[str]:
    """Render a list as Markdown bullets.

    Inputs: any value; only a non-empty list produces item bullets.
    Outputs: one ``- <item>`` line per entry, capped at
    ``_MAX_EVIDENCE_ITEMS``, or the single line ``- none``.
    """
    if not isinstance(items, list) or not items:
        return ["- none"]
    return [f"- {entry}" for entry in items[:_MAX_EVIDENCE_ITEMS]]
def _markdown_named_items(title: str, items: Any, key: str) -> list[str]:
    """Render dict entries as ``- <title>: [namespace/]<name>`` bullets.

    Inputs: the bullet title, the entries, and the preferred name key.
    Outputs: one bullet per dict entry, capped at ``_MAX_EVIDENCE_ITEMS``.
    The name falls back from ``key`` to ``name`` to ``job`` to ``unknown``.
    A single ``- <title>: none`` line is returned when nothing is rendered.
    """
    nothing = [f"- {title}: none"]
    if not isinstance(items, list) or not items:
        return nothing

    rendered: list[str] = []
    for entry in items[:_MAX_EVIDENCE_ITEMS]:
        if isinstance(entry, dict):
            label = entry.get(key) or entry.get("name") or entry.get("job") or "unknown"
            namespace = entry.get("namespace")
            qualified = f"{namespace}/{label}" if namespace else f"{label}"
            rendered.append(f"- {title}: {qualified}")
    return rendered if rendered else nothing
def _markdown_quality(quality: dict[str, Any]) -> list[str]:
lines: list[str] = []
for name, bucket in quality.items():
items = bucket.get("items") if isinstance(bucket, dict) else []
if not items:
lines.append(f"- {name}: none")
continue
for item in items[:5]:
labels = item.get("labels") if isinstance(item, dict) else {}
lines.append(f"- {name}: {labels} value={item.get('value')}")
return lines
def _limit(items: Any) -> list[Any]:
    """Cap a list at ``_MAX_EVIDENCE_ITEMS`` entries.

    Inputs: any value.
    Outputs: the leading slice of a list, or ``[]`` for any non-list value.
    """
    if not isinstance(items, list):
        return []
    return items[:_MAX_EVIDENCE_ITEMS]
def _float_value(value: Any) -> float:
try:
return float(value)
except (TypeError, ValueError):
return 0.0
def _millis_to_seconds(value: Any) -> float:
    """Convert a millisecond quantity to seconds.

    Inputs: any value accepted by ``_float_value``.
    Outputs: the value divided by 1000 as a float.
    """
    millis = _float_value(value)
    return millis / 1000.0
def _jenkins_auth() -> tuple[str, str] | None:
    """Build the Jenkins credential pair from settings.

    Inputs: none; reads ``settings.jenkins_api_user`` and
    ``settings.jenkins_api_token``.
    Outputs: ``(username, token)`` when both are non-blank after stripping,
    otherwise ``None``.
    """
    username = settings.jenkins_api_user.strip()
    token = settings.jenkins_api_token.strip()
    if not (username and token):
        return None
    return (username, token)