monitoring(testing): add per-test history panels and metric emitter

This commit is contained in:
jenkins 2026-04-20 12:03:27 -03:00
parent 9103cd22f2
commit c3cca8ad9a
5 changed files with 5790 additions and 1006 deletions

View File

@ -29,11 +29,11 @@ def _read_text(url: str) -> str:
def _post_text(url: str, payload: str) -> None:
"""POST a plain-text payload and fail on any 4xx/5xx response."""
"""PUT a plain-text payload and fail on any 4xx/5xx response."""
request = urllib.request.Request(
url,
data=payload.encode("utf-8"),
method="POST",
method="PUT",
headers={"Content-Type": "text/plain"},
)
with urllib.request.urlopen(request, timeout=10) as response:
@ -78,6 +78,48 @@ def _collect_junit_totals(pattern: str) -> dict[str, int]:
return totals
def _load_junit_cases(path: str) -> list[tuple[str, str]]:
"""Parse individual JUnit test case outcomes for flakiness panels."""
if not os.path.exists(path):
return []
tree = ET.parse(path)
root = tree.getroot()
suites: list[ET.Element]
if root.tag == "testsuite":
suites = [root]
elif root.tag == "testsuites":
suites = [elem for elem in root if elem.tag == "testsuite"]
else:
suites = []
cases: list[tuple[str, str]] = []
for suite in suites:
for case in suite.findall("testcase"):
name = (case.attrib.get("name") or "").strip()
classname = (case.attrib.get("classname") or "").strip()
if not name:
continue
test_id = f"{classname}::{name}" if classname else name
status = "passed"
if case.find("failure") is not None:
status = "failed"
elif case.find("error") is not None:
status = "error"
elif case.find("skipped") is not None:
status = "skipped"
cases.append((test_id, status))
return cases
def _collect_junit_cases(pattern: str) -> list[tuple[str, str]]:
    """Collect test-case statuses across all matching JUnit XML files.

    Files are visited in sorted order so the resulting metric series is
    deterministic across runs.
    """
    matching_paths = sorted(glob(pattern))
    return [
        case
        for junit_path in matching_paths
        for case in _load_junit_cases(junit_path)
    ]
def _read_exit_code(path: str) -> int:
"""Read the quality-gate exit code, defaulting to failure if missing."""
try:
@ -136,6 +178,7 @@ def _build_payload(
suite: str,
status: str,
tests: dict[str, int],
test_cases: list[tuple[str, str]],
ok_count: int,
failed_count: int,
branch: str,
@ -171,7 +214,12 @@ def _build_payload(
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {workspace_line_coverage_percent:.3f}',
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
"# TYPE platform_quality_gate_test_case_result gauge",
]
lines.extend(
f'platform_quality_gate_test_case_result{{suite="{suite}",test="{_escape_label(test_name)}",status="{_escape_label(test_status)}"}} 1'
for test_name, test_status in test_cases
)
results = summary.get("results", []) if isinstance(summary, dict) else []
if results:
lines.append("# TYPE titan_iac_quality_gate_checks_total gauge")
@ -188,7 +236,7 @@ def _build_payload(
def main() -> int:
"""Publish the quality-gate metrics and print a compact run summary."""
suite = os.getenv("SUITE_NAME", "titan-iac")
suite = os.getenv("SUITE_NAME", "titan_iac")
pushgateway_url = os.getenv("PUSHGATEWAY_URL", "http://platform-quality-gateway.monitoring.svc.cluster.local:9091")
job_name = os.getenv("QUALITY_GATE_JOB_NAME", "platform-quality-ci")
junit_glob = os.getenv("JUNIT_GLOB", os.getenv("JUNIT_PATH", "build/junit-*.xml"))
@ -198,6 +246,7 @@ def main() -> int:
build_number = os.getenv("BUILD_NUMBER", "")
tests = _collect_junit_totals(junit_glob)
test_cases = _collect_junit_cases(junit_glob)
exit_code = _read_exit_code(exit_code_path)
status = "ok" if exit_code == 0 else "failed"
summary = _load_summary(summary_path)
@ -227,6 +276,7 @@ def main() -> int:
suite=suite,
status=status,
tests=tests,
test_cases=test_cases,
ok_count=ok_count,
failed_count=failed_count,
branch=branch,

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -40,6 +40,27 @@ def test_collect_junit_totals_sums_multiple_files(tmp_path: Path):
assert totals == {"tests": 5, "failures": 1, "errors": 1, "skipped": 1}
def test_collect_junit_cases_tracks_individual_statuses(tmp_path: Path):
    """A file containing every outcome kind yields one tuple per test case."""
    junit = tmp_path / "junit.xml"
    junit.write_text(
        "<testsuite>"
        '<testcase classname="pkg.mod" name="test_ok" />'
        '<testcase classname="pkg.mod" name="test_fail"><failure /></testcase>'
        '<testcase classname="pkg.mod" name="test_error"><error /></testcase>'
        '<testcase classname="pkg.mod" name="test_skip"><skipped /></testcase>'
        "</testsuite>",
        encoding="utf-8",
    )
    cases = publish_test_metrics._collect_junit_cases(str(tmp_path / "junit*.xml"))
    expected = {
        ("pkg.mod::test_ok", "passed"),
        ("pkg.mod::test_fail", "failed"),
        ("pkg.mod::test_error", "error"),
        ("pkg.mod::test_skip", "skipped"),
    }
    # Every expected (test_id, status) pair must appear in the collected list.
    assert expected <= set(cases)
def test_parse_junit_handles_testsuites_and_invalid_counts(tmp_path: Path):
junit_path = tmp_path / "suite.xml"
junit_path.write_text(
@ -171,6 +192,7 @@ def test_build_payload_includes_summary_metrics():
suite="titan-iac",
status="ok",
tests={"tests": 4, "failures": 1, "errors": 0, "skipped": 1},
test_cases=[("pkg.mod::test_ok", "passed"), ("pkg.mod::test_fail", "failed")],
ok_count=7,
failed_count=2,
branch="main",
@ -190,6 +212,7 @@ def test_build_payload_includes_summary_metrics():
assert 'titan_iac_quality_gate_checks_total{suite="titan-iac",check="unit",result="failed"} 1' in payload
assert 'platform_quality_gate_workspace_line_coverage_percent{suite="titan-iac"} 97.125' in payload
assert 'platform_quality_gate_source_lines_over_500_total{suite="titan-iac"} 3' in payload
assert 'platform_quality_gate_test_case_result{suite="titan-iac",test="pkg.mod::test_fail",status="failed"} 1' in payload
def test_build_payload_skips_incomplete_results():
@ -197,6 +220,7 @@ def test_build_payload_skips_incomplete_results():
suite="titan-iac",
status="failed",
tests={"tests": 0, "failures": 0, "errors": 0, "skipped": 0},
test_cases=[],
ok_count=1,
failed_count=2,
branch="",