# ariadne/ariadne/services/testing_triage.py

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timezone
import json
import math
from typing import Any

import httpx

from ..db.storage import Storage
from ..settings import settings
from ..utils.logging import get_logger
from .cluster_state import collect_cluster_state


# Module-level logger, named after this module.
logger = get_logger(__name__)

# Event type under which triage bundles are recorded and later looked up.
TRIAGE_EVENT_TYPE = "testing_triage_bundle"
# Regex alternation of status label values treated as non-failing; it is
# interpolated into the `status!~"..."` matchers of the quality-gate queries.
_SUCCESS_STATUS = "ok|passed|success|not_applicable|skipped|na|n/a"
# Jenkins `tree` query parameter: selects name/url/color plus last-build and
# last-failed-build fields for top-level jobs and one level of nested jobs.
_JENKINS_TREE = (
    "jobs[name,url,color,lastBuild[number,result,timestamp,duration,url],"
    "lastFailedBuild[number,timestamp,url],jobs[name,url,color,"
    "lastBuild[number,result,timestamp,duration,url],lastFailedBuild[number,timestamp,url]]]"
)
# Bounds applied to a Jenkins console log tail: line count first, then characters.
_MAX_JENKINS_LOG_LINES = 80
_MAX_JENKINS_LOG_CHARS = 12000
# Maximum number of entries kept per evidence list.
_MAX_EVIDENCE_ITEMS = 12


@dataclass(frozen=True)
class TestingTriageSummary:
    """Represent one stored testing triage bundle.

    Inputs: bounded evidence counts gathered from Ariadne collectors.
    Outputs: compact scheduler details for metrics and audit records.

    The dataclass is frozen, so fields cannot be reassigned after creation;
    the ``failed_suites`` list object itself is still mutable.
    """

    # Overall bundle status string (e.g. "ok" or "needs_attention").
    status: str
    # Number of problem entries counted across the evidence sections.
    problem_count: int
    # Names of suites that appeared in the quality evidence.
    failed_suites: list[str]

def latest_testing_triage_bundle(storage: Storage) -> dict[str, Any] | None:
    """Fetch the newest persisted testing triage bundle.

    Inputs: a storage handle exposing ``list_events``.
    Outputs: the event's ``detail`` as a dict (decoded from JSON text when it
    is stored as a string), or ``None`` when there is no event or the detail
    is not a JSON object.
    """

    events = storage.list_events(limit=1, event_type=TRIAGE_EVENT_TYPE)
    if not events:
        return None

    raw_detail = events[0].get("detail")
    if isinstance(raw_detail, dict):
        return raw_detail
    if not isinstance(raw_detail, str):
        return None

    # String details are expected to hold a serialized JSON object.
    try:
        decoded = json.loads(raw_detail)
    except json.JSONDecodeError:
        return None
    if isinstance(decoded, dict):
        return decoded
    return None


def run_testing_triage(storage: Storage) -> TestingTriageSummary:
    """Collect a triage bundle, persist it as an event, and report a summary.

    Inputs: a storage handle used both for collection and for recording.
    Outputs: a ``TestingTriageSummary`` built from the bundle's summary
    section, with defaults substituted for missing fields.
    """

    bundle = collect_testing_triage(storage)
    storage.record_event(TRIAGE_EVENT_TYPE, bundle)

    details = bundle.get("summary")
    if not isinstance(details, dict):
        details = {}

    status = str(details.get("status") or "unknown")
    problem_count = int(details.get("problem_count") or 0)
    failed_suites = [str(name) for name in details.get("failed_suites") or []]
    outcome = TestingTriageSummary(
        status=status,
        problem_count=problem_count,
        failed_suites=failed_suites,
    )

    # Suites are joined into one string so the log record stays flat.
    log_fields = {
        "event": "testing_triage",
        "status": outcome.status,
        "problem_count": outcome.problem_count,
        "failed_suites": ",".join(outcome.failed_suites),
    }
    logger.info("testing triage bundle stored", extra=log_fields)
    return outcome


def collect_testing_triage(storage: Storage | None = None) -> dict[str, Any]:
    """Assemble the bounded evidence bundle used for agentic testing triage.

    Inputs: an optional storage handle for the latest persisted cluster
    state, plus VictoriaMetrics and Jenkins API reads.
    Outputs: a dict holding the summary, JSON evidence, OpenClaw guidance,
    collector errors (``unknowns``), and a Markdown rendering of the same data.
    """

    # Collector failures accumulate here and are surfaced as "unknowns".
    unknowns: list[str] = []
    generated_at = datetime.now(timezone.utc).isoformat()

    snapshot = _latest_cluster_snapshot(storage, unknowns)
    quality = _quality_signals(unknowns)
    jenkins = _jenkins_signals(unknowns)
    cluster = _cluster_evidence(snapshot)
    summary = _summary(cluster, quality, jenkins, unknowns)

    evidence = {
        "cluster": cluster,
        "quality": quality,
        "jenkins": jenkins,
    }
    openclaw = {
        "ariadne_latest_url": "http://ariadne.maintenance.svc.cluster.local/api/internal/testing/triage/latest",
        "instructions": [
            "Treat this bundle as the primary evidence source.",
            "Summarize root cause, blast radius, and smallest Flux/IaC change.",
            "Do not read Kubernetes Secrets or run mutating kubectl commands.",
            "Only run extra read-only commands when the bundle is stale or ambiguous.",
        ],
    }
    bundle: dict[str, Any] = {
        "kind": "testing_triage_bundle",
        "generated_at": generated_at,
        "summary": summary,
        "evidence": evidence,
        "openclaw": openclaw,
        "unknowns": unknowns,
    }
    # Rendered last so the Markdown reflects the finished bundle.
    bundle["markdown"] = _render_markdown(bundle)
    return bundle


def _latest_cluster_snapshot(storage: Storage | None, errors: list[str]) -> dict[str, Any]:
    """Return a cluster snapshot, preferring the persisted one.

    Inputs: an optional storage handle and the shared error list.
    Outputs: the stored snapshot when it is a non-empty dict, otherwise a
    freshly collected snapshot, otherwise ``{}``. Failures of either source
    are appended to ``errors`` instead of being raised.
    """

    if storage is not None:
        try:
            stored = storage.latest_cluster_state()
        except Exception as exc:
            errors.append(f"cluster_state_latest: {exc}")
        else:
            if isinstance(stored, dict) and stored:
                return stored

    # No usable stored snapshot: fall back to a live collection.
    try:
        fresh, _ = collect_cluster_state()
    except Exception as exc:
        errors.append(f"cluster_state_collect: {exc}")
        return {}
    return fresh


def _cluster_evidence(snapshot: dict[str, Any]) -> dict[str, Any]:
    """Extract the bounded cluster evidence section from a snapshot.

    Inputs: a cluster snapshot dict (may be empty).
    Outputs: a dict of node counts plus size-limited lists; any snapshot
    section that is missing or not a dict is treated as empty.
    """

    def section(key: str) -> dict[str, Any]:
        # Tolerate absent or malformed sections by substituting an empty dict.
        candidate = snapshot.get(key)
        return candidate if isinstance(candidate, dict) else {}

    overview = section("summary")
    flux = section("flux")
    pods = section("pod_issues")
    jobs = section("jobs")
    events = section("events")
    node_stats = section("nodes_summary")

    node_evidence = {
        "total": node_stats.get("total"),
        "ready": node_stats.get("ready"),
        "not_ready": node_stats.get("not_ready"),
        "not_ready_names": node_stats.get("not_ready_names") or [],
    }
    return {
        "collected_at": snapshot.get("collected_at") or "",
        "health_bullets": _limit(overview.get("health_bullets")),
        "attention_ranked": _limit(overview.get("attention_ranked")),
        "nodes": node_evidence,
        "flux_not_ready": _limit(flux.get("items")),
        "pod_issues": _limit(pods.get("items")),
        "pending_oldest": _limit(pods.get("pending_oldest")),
        "jobs_failing": _limit(jobs.get("failing")),
        "jobs_active_oldest": _limit(jobs.get("active_oldest")),
        "events_recent": _limit(events.get("warnings_recent")),
    }


def _quality_signals(errors: list[str]) -> dict[str, Any]:
    """Run the fixed set of quality queries and collect their results.

    Inputs: the shared error list, passed through to each query execution.
    Outputs: a dict keyed by signal name; each value holds the query text
    under ``query`` and the returned rows under ``items``.
    """

    # Selector fragment shared by the queries that exclude successful statuses.
    ci_not_success = f'exported_job="platform-quality-ci",status!~"{_SUCCESS_STATUS}"'

    failed_runs = (
        "topk(12, sum by (suite) (increase(platform_quality_gate_runs_total{"
        + ci_not_success
        + "}[24h])))"
    )
    failing_checks = (
        'topk(20, sum by (suite,check,status) (increase({__name__=~".*_quality_gate_checks_total",'
        + ci_not_success
        + "}[24h])))"
    )
    problem_tests = (
        "topk(20, sum by (suite,test,status) (increase(platform_quality_gate_test_case_result"
        '{exported_job="platform-quality-ci",test!="",test!="__no_test_cases__",status="failed"}[24h])))'
    )
    weather_failures = (
        "topk(12, max by (exported_job,job_url,weather_icon) "
        "(ariadne_jenkins_build_weather_job_last_status != 1))"
    )

    signals: dict[str, Any] = {}
    for signal_name, promql in (
        ("failed_runs_24h", failed_runs),
        ("failing_checks_24h", failing_checks),
        ("problem_tests_24h", problem_tests),
        ("jenkins_weather_failures", weather_failures),
    ):
        signals[signal_name] = {
            "query": promql,
            "items": _vm_items(promql, errors),
        }
    return signals


def _vm_items(query: str, errors: list[str]) -> list[dict[str, Any]]:
    """Run one instant query against VictoriaMetrics and return bounded rows.

    Inputs: the query text and the shared error list.
    Outputs: at most ``_MAX_EVIDENCE_ITEMS`` normalized rows. Returns ``[]``
    when no VictoriaMetrics URL is configured, when the request fails, or
    when the response does not report success; the latter two cases also
    append a message to ``errors``. This function does not raise for
    transport or payload-shape problems.
    """

    base_url = settings.vm_url.strip().rstrip("/")
    if not base_url:
        return []
    try:
        with httpx.Client(timeout=settings.cluster_state_vm_timeout_sec) as client:
            response = client.get(f"{base_url}/api/v1/query", params={"query": query})
            response.raise_for_status()
            payload = response.json()
    except Exception as exc:
        errors.append(f"victoria_metrics: {exc}")
        return []
    # A body that is valid JSON but not an object (list, null, string) must be
    # reported as a failed query rather than raising AttributeError here.
    if not isinstance(payload, dict) or payload.get("status") != "success":
        errors.append("victoria_metrics: query failed")
        return []
    # "data" may be missing or null; only a dict can carry a "result" list.
    data = payload.get("data")
    result = data.get("result") if isinstance(data, dict) else None
    rows = result if isinstance(result, list) else []
    return [_vm_item(row) for row in rows[:_MAX_EVIDENCE_ITEMS] if isinstance(row, dict)]


def _vm_item(row: dict[str, Any]) -> dict[str, Any]:
    """Normalize one query result row into labels plus a float value.

    Inputs: a result row with optional ``metric`` (dict) and ``value`` (list).
    Outputs: ``{"labels": ..., "value": ...}`` where labels whose name starts
    with ``__`` are dropped and the value is the second element of ``value``
    coerced through ``_float_value``.
    """

    raw_labels = row.get("metric")
    if not isinstance(raw_labels, dict):
        raw_labels = {}
    sample = row.get("value")
    if not isinstance(sample, list):
        sample = []

    visible_labels = {}
    for label_name, label_text in raw_labels.items():
        if label_name.startswith("__"):
            continue
        visible_labels[label_name] = label_text

    # The reading is taken from index 1; index 0 is not used here.
    reading = sample[1] if len(sample) > 1 else None
    return {
        "labels": visible_labels,
        "value": _float_value(reading),
    }


def _jenkins_signals(errors: list[str]) -> dict[str, Any]:
    """Collect Jenkins jobs whose latest build is not a plain success.

    Inputs: the shared error list.
    Outputs: ``{"failed_builds": [...]}`` holding jobs with status failure,
    running, or unknown, newest run first and capped at
    ``_MAX_EVIDENCE_ITEMS``. The three newest get a console log tail
    attached. An unset base URL or a fetch failure yields an empty list (the
    failure is appended to ``errors``).
    """

    root_url = settings.jenkins_base_url.strip().rstrip("/")
    if not root_url:
        return {"failed_builds": []}
    try:
        all_jobs = _fetch_jenkins_jobs(root_url)
    except Exception as exc:
        errors.append(f"jenkins: {exc}")
        return {"failed_builds": []}

    flagged_states = {"failure", "running", "unknown"}
    flagged = sorted(
        (entry for entry in all_jobs if entry.get("status") in flagged_states),
        key=lambda entry: entry.get("last_run_ts") or 0,
        reverse=True,
    )
    # Log tails cost one extra request each, so only the newest three get one.
    for entry in flagged[:3]:
        _attach_jenkins_log_tail(entry, errors)
    return {"failed_builds": flagged[:_MAX_EVIDENCE_ITEMS]}


def _fetch_jenkins_jobs(base_url: str) -> list[dict[str, Any]]:
    """Fetch the Jenkins job tree and return normalized job records.

    Inputs: the Jenkins base URL without a trailing slash.
    Outputs: one normalized record per job that has a last build; rows that
    cannot be normalized are dropped.
    Raises: whatever the HTTP client raises on transport errors, non-2xx
    responses, or undecodable JSON; the caller handles these.
    """

    credentials = _jenkins_auth()
    client_options: dict[str, Any] = {
        "timeout": settings.jenkins_api_timeout_sec,
        "follow_redirects": True,
    }
    if credentials is not None:
        client_options["auth"] = credentials

    with httpx.Client(**client_options) as client:
        response = client.get(f"{base_url}/api/json", params={"tree": _JENKINS_TREE})
        response.raise_for_status()
        payload = response.json()

    top_level = payload.get("jobs") if isinstance(payload, dict) else None
    if not isinstance(top_level, list):
        top_level = []

    normalized = (_jenkins_job(row) for row in _flatten_jobs(top_level))
    return [record for record in normalized if record is not None]


def _flatten_jobs(items: list[Any], prefix: str = "") -> list[dict[str, Any]]:
    output: list[dict[str, Any]] = []
    for item in items:
        if not isinstance(item, dict):
            continue
        name = item.get("name")
        if not isinstance(name, str) or not name:
            continue
        full_name = f"{prefix}/{name}" if prefix else name
        children = item.get("jobs") if isinstance(item.get("jobs"), list) else []
        if children:
            output.extend(_flatten_jobs(children, full_name))
        if isinstance(item.get("lastBuild"), dict):
            entry = dict(item)
            entry["name"] = full_name
            output.append(entry)
    return output


def _jenkins_job(raw: dict[str, Any]) -> dict[str, Any] | None:
    """Normalize one flattened Jenkins job entry.

    Inputs: a raw job dict as produced by the job-tree flattening step.
    Outputs: a record with job identity, derived status, last-build details
    (timestamp and duration converted from milliseconds to seconds) and the
    console text URL; ``None`` when ``name`` or ``url`` is not a string.
    """

    job_name = raw.get("name")
    job_url = raw.get("url")
    if not (isinstance(job_name, str) and isinstance(job_url, str)):
        return None

    build = raw.get("lastBuild")
    if not isinstance(build, dict):
        build = {}

    outcome = str(build.get("result") or "").upper()
    status = _jenkins_status(raw, outcome)
    # Fall back to the job URL when the build carries no URL of its own.
    console_base = str(build.get("url") or job_url).rstrip("/")
    return {
        "job": job_name,
        "job_url": job_url,
        "status": status,
        "result": outcome or "UNKNOWN",
        "last_build_number": build.get("number"),
        "last_run_ts": _millis_to_seconds(build.get("timestamp")),
        "last_duration_seconds": _millis_to_seconds(build.get("duration")),
        "console_url": console_base + "/consoleText",
    }


def _jenkins_status(raw: dict[str, Any], result: str) -> str:
    color = str(raw.get("color") or "").lower()
    if color.endswith("_anime"):
        return "running"
    if result == "SUCCESS":
        return "success"
    if result in {"FAILURE", "ABORTED", "UNSTABLE", "NOT_BUILT"}:
        return "failure"
    if color.startswith(("blue", "green")):
        return "success"
    if color.startswith(("red", "yellow")):
        return "failure"
    return "unknown"


def _attach_jenkins_log_tail(job: dict[str, Any], errors: list[str]) -> None:
    """Download a job's console text and store its tail on the job record.

    Inputs: a normalized job record (mutated in place) and the shared error
    list.
    Outputs: none. On success ``job["log_tail"]`` is set; on any failure a
    message is appended to ``errors`` and the record is left without a tail.
    Nothing happens when the record has no usable ``console_url``.
    """

    console_url = job.get("console_url")
    if not (isinstance(console_url, str) and console_url):
        return

    credentials = _jenkins_auth()
    client_options: dict[str, Any] = {
        "timeout": settings.jenkins_api_timeout_sec,
        "follow_redirects": True,
    }
    if credentials is not None:
        client_options["auth"] = credentials

    try:
        with httpx.Client(**client_options) as client:
            reply = client.get(console_url)
            reply.raise_for_status()
            job["log_tail"] = _tail_text(reply.text)
    except Exception as exc:
        errors.append(f"jenkins_log:{job.get('job')}: {exc}")


def _tail_text(text: str) -> str:
    """Return the end of ``text``, bounded by line count and then by length.

    The last ``_MAX_JENKINS_LOG_LINES`` lines are joined with newlines; if
    that is still longer than ``_MAX_JENKINS_LOG_CHARS`` characters, only the
    final characters up to that limit are kept.
    """

    recent_lines = text.splitlines()[-_MAX_JENKINS_LOG_LINES:]
    joined = "\n".join(recent_lines)
    within_budget = len(joined) <= _MAX_JENKINS_LOG_CHARS
    return joined if within_budget else joined[-_MAX_JENKINS_LOG_CHARS:]


def _summary(
    cluster: dict[str, Any],
    quality: dict[str, Any],
    jenkins: dict[str, Any],
    errors: list[str],
) -> dict[str, Any]:
    """Condense the evidence sections into the bundle summary.

    Inputs: the cluster, quality and Jenkins evidence plus collector errors.
    Outputs: status, problem count, sorted failed suite names, the cluster
    collection time and the number of collector errors. The status is
    ``needs_attention`` when any problem was counted or any error occurred.
    """

    suites = sorted(_failed_suites(quality))

    cluster_hits = sum(
        len(cluster.get(key) or [])
        for key in ("flux_not_ready", "pod_issues", "jobs_failing")
    )
    # Only these two quality buckets contribute to the count.
    quality_hits = sum(
        len(quality.get(key, {}).get("items") or [])
        for key in ("failed_runs_24h", "failing_checks_24h")
    )
    jenkins_hits = len(jenkins.get("failed_builds") or [])
    total = cluster_hits + quality_hits + jenkins_hits

    if total or errors:
        status = "needs_attention"
    else:
        status = "ok"
    return {
        "status": status,
        "problem_count": total,
        "failed_suites": suites,
        "cluster_collected_at": cluster.get("collected_at") or "",
        "unknown_count": len(errors),
    }


def _failed_suites(quality: dict[str, Any]) -> set[str]:
    suites: set[str] = set()
    for bucket in quality.values():
        if not isinstance(bucket, dict):
            continue
        for item in bucket.get("items") or []:
            labels = item.get("labels") if isinstance(item, dict) else {}
            suite = labels.get("suite") if isinstance(labels, dict) else None
            if isinstance(suite, str) and suite:
                suites.add(suite)
    return suites


def _render_markdown(bundle: dict[str, Any]) -> str:
    """Render a triage bundle as a Markdown document.

    Inputs: the bundle dict; sections that are missing or not dicts are
    rendered as empty.
    Outputs: Markdown text with header, Cluster, Quality and Jenkins
    sections, plus an Unknowns section when collector errors exist. The text
    always ends with exactly one newline.
    """

    def as_dict(value: Any) -> dict[str, Any]:
        # Substitute an empty mapping for anything that is not a dict.
        return value if isinstance(value, dict) else {}

    summary = as_dict(bundle.get("summary"))
    evidence = as_dict(bundle.get("evidence"))
    cluster = as_dict(evidence.get("cluster"))
    quality = as_dict(evidence.get("quality"))
    jenkins = as_dict(evidence.get("jenkins"))

    lines = [
        "# Testing Triage Evidence",
        "",
        f"- Generated: {bundle.get('generated_at')}",
        f"- Status: {summary.get('status')}",
        f"- Problem count: {summary.get('problem_count')}",
        f"- Failed suites: {', '.join(summary.get('failed_suites') or []) or 'none'}",
        "",
        "## Cluster",
    ]
    lines.extend(_markdown_items(cluster.get("health_bullets")))
    lines.extend(_markdown_named_items("Flux", cluster.get("flux_not_ready"), "name"))
    lines.extend(_markdown_named_items("Pods", cluster.get("pod_issues"), "pod"))

    lines.extend(["", "## Quality"])
    lines.extend(_markdown_quality(quality))

    lines.extend(["", "## Jenkins"])
    lines.extend(_markdown_named_items("Failed builds", jenkins.get("failed_builds"), "job"))

    unknowns = bundle.get("unknowns")
    if isinstance(unknowns, list) and unknowns:
        lines.extend(["", "## Unknowns"])
        lines.extend(_markdown_items(unknowns))

    document = "\n".join(lines)
    return document.strip() + "\n"


def _markdown_items(items: Any) -> list[str]:
    """Render a list as Markdown bullets, capped at ``_MAX_EVIDENCE_ITEMS``.

    Anything that is not a non-empty list renders as a single ``- none``.
    """

    if not isinstance(items, list) or not items:
        return ["- none"]
    bullets = []
    for entry in items[:_MAX_EVIDENCE_ITEMS]:
        bullets.append(f"- {entry}")
    return bullets


def _markdown_named_items(title: str, items: Any, key: str) -> list[str]:
    """Render dict entries as ``- <title>: [<namespace>/]<name>`` bullets.

    Inputs: the bullet title, a list of dict entries, and the preferred key
    holding each entry's name (``name`` and ``job`` are tried next, then the
    literal ``unknown``).
    Outputs: up to ``_MAX_EVIDENCE_ITEMS`` bullets; non-dict entries are
    skipped, and ``- <title>: none`` is returned when nothing renders.
    """

    placeholder = [f"- {title}: none"]
    if not isinstance(items, list) or not items:
        return placeholder

    bullets = []
    for entry in items[:_MAX_EVIDENCE_ITEMS]:
        if isinstance(entry, dict):
            label = entry.get(key) or entry.get("name") or entry.get("job") or "unknown"
            scope = entry.get("namespace")
            qualified = f"{scope}/{label}" if scope else f"{label}"
            bullets.append(f"- {title}: {qualified}")
    return bullets or placeholder


def _markdown_quality(quality: dict[str, Any]) -> list[str]:
    lines: list[str] = []
    for name, bucket in quality.items():
        items = bucket.get("items") if isinstance(bucket, dict) else []
        if not items:
            lines.append(f"- {name}: none")
            continue
        for item in items[:5]:
            labels = item.get("labels") if isinstance(item, dict) else {}
            lines.append(f"- {name}: {labels} value={item.get('value')}")
    return lines


def _limit(items: Any) -> list[Any]:
    """Return the first ``_MAX_EVIDENCE_ITEMS`` entries of a list.

    Any value that is not a list yields an empty list.
    """

    if not isinstance(items, list):
        return []
    return items[:_MAX_EVIDENCE_ITEMS]


def _float_value(value: Any) -> float:
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _millis_to_seconds(value: Any) -> float:
    """Convert a millisecond quantity to seconds.

    The input is first coerced through ``_float_value``, so values that
    cannot be converted come out as ``0.0``.
    """

    millis = _float_value(value)
    return millis / 1000.0


def _jenkins_auth() -> tuple[str, str] | None:
    """Return the Jenkins ``(user, token)`` pair from settings, if complete.

    Both values are stripped of surrounding whitespace; ``None`` is returned
    unless both are non-empty.
    """

    user = settings.jenkins_api_user.strip()
    secret = settings.jenkins_api_token.strip()
    return (user, secret) if user and secret else None