# lesavka/scripts/manual/client_rct_transport_layers.py

"""Layer attribution helpers for client-to-RCT transport summaries."""

from __future__ import annotations

import json
import math
import pathlib


def percentile(values: list[float], q: float) -> float | None:
    """Pick the nearest-rank percentile among the finite samples.

    Inputs: numeric samples (NaN and infinities are ignored) and a quantile,
    nominally in `[0, 1]`. Outputs: the sample sitting at the nearest rank, or
    `None` when no finite sample exists. Why: layer attribution reports the
    same p95 flavour as the main transport summary while staying independent
    of the CLI module.
    """

    ranked = [sample for sample in values if math.isfinite(sample)]
    ranked.sort()
    count = len(ranked)
    if count == 0:
        return None

    # Nearest-rank: the 1-based rank is ceil(n * q); shift to a 0-based index.
    rank = math.ceil(count * q) - 1
    # Clamp so out-of-range quantiles resolve to the first or last sample.
    if rank < 0:
        rank = 0
    elif rank > count - 1:
        rank = count - 1
    return ranked[rank]


def client_send_summary(report_path: pathlib.Path, joined: list[dict]) -> dict | None:
    """Summarize client-side bundle send timing from optional JSONL artifacts.

    Inputs: the final report path, used to find sibling
    `client-send-bundles.jsonl`, and joined RCT events. Outputs: local queue
    age plus post-send-to-RCT age percentiles, or `None` when the sibling file
    is absent or contains no usable send rows. Why: if final freshness fails,
    the next question is whether delay existed before the client wrote the gRPC
    bundle or appeared after the bundle left the client.

    The artifact is best-effort diagnostics, so malformed content is skipped
    rather than raised: lines that are not JSON objects, rows with a different
    schema, rows or events whose timing fields are missing or non-numeric, and
    non-finite samples never abort the summary.
    """

    send_path = report_path.parent / "client-send-bundles.jsonl"
    if not send_path.exists():
        return None

    # Keyed by video capture PTS; a later row with the same PTS replaces the
    # earlier one for joining, while every row still contributes a local age.
    local_age_by_video_pts: dict[int, float] = {}
    local_ages: list[float] = []
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    send_text = send_path.read_text(encoding="utf-8", errors="replace")
    for line in send_text.splitlines():
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A line can be valid JSON without being an object (`null`, `123`,
        # `[...]`); such values have no `.get` and carry no send record.
        if not isinstance(row, dict):
            continue
        if row.get("schema") != "lesavka.sync-probe-send.v1":
            continue
        try:
            video_pts = int(row["video_capture_pts_us"])
            local_age = float(row["local_age_ms"])
        except (KeyError, TypeError, ValueError, OverflowError):
            # OverflowError covers `Infinity`, which json.loads accepts.
            continue
        local_age_by_video_pts[video_pts] = local_age
        local_ages.append(local_age)

    if not local_age_by_video_pts:
        return None

    post_send_video_ages: list[float] = []
    post_send_audio_ages: list[float] = []
    joined_with_send_rows = 0
    for event in joined:
        planned_start_us = event.get("client_planned_start_us")
        if planned_start_us is None:
            continue
        try:
            send_key = int(planned_start_us)
        except (TypeError, ValueError, OverflowError):
            continue
        local_age = local_age_by_video_pts.get(send_key)
        # Compare against None: a local age of 0.0 is a legitimate match.
        if local_age is None:
            continue
        joined_with_send_rows += 1
        for field, bucket in (
            ("video_age_ms", post_send_video_ages),
            ("audio_age_ms", post_send_audio_ages),
        ):
            raw_age = event.get(field)
            if raw_age is None:
                continue
            try:
                # Subtracting the pre-send age leaves the age accumulated
                # after the bundle left the client.
                bucket.append(float(raw_age) - local_age)
            except (TypeError, ValueError):
                continue

    video_p95 = percentile(post_send_video_ages, 0.95)
    audio_p95 = percentile(post_send_audio_ages, 0.95)
    # Either percentile may be None (no samples, or only non-finite ones), so
    # take the max over what is present and fall back to None.
    post_send_worst = max(
        (value for value in (video_p95, audio_p95) if value is not None),
        default=None,
    )

    # Restrict the max to finite samples, matching the percentile helper;
    # max() over a list containing NaN depends on element order.
    finite_local_ages = [age for age in local_ages if math.isfinite(age)]

    return {
        "bundle_count": len(local_age_by_video_pts),
        "joined_event_count": joined_with_send_rows,
        "local_bundle_age_p95_ms": percentile(local_ages, 0.95),
        "local_bundle_age_max_ms": max(finite_local_ages, default=None),
        "post_client_send_video_age_p95_ms": video_p95,
        "post_client_send_audio_age_p95_ms": audio_p95,
        "post_client_send_worst_p95_ms": post_send_worst,
    }


def freshness_bottleneck(summary: dict) -> str:
    """Name the layer most likely responsible for a freshness failure.

    Inputs: the structured summary built from RCT capture, the client send
    log, and optional upstream-sync samples. Outputs: one short
    machine-readable label. Why: the first client->server pass stays
    black-box, yet a failed run should still hint at client queueing,
    transport/server receive, or the post-send output/RCT path before any
    invasive introspection is added.

    Checks run in a fixed order and the first threshold exceeded wins; a
    missing measurement never triggers its label.
    """

    def over(measured, limit_ms: float) -> bool:
        # Absent measurements (None) cannot exceed a limit.
        return measured is not None and measured > limit_ms

    if summary.get("freshness_passed"):
        return "within_limit"

    paired = summary.get("paired_event_count", 0)
    required = summary.get("min_paired_events", 0)
    if paired < required:
        return "evidence_incomplete"

    # Client layer: age accumulated before the bundle was sent.
    send_layer = summary.get("client_send") or {}
    queue_age_p95 = send_layer.get("local_bundle_age_p95_ms")
    after_send_p95 = send_layer.get("post_client_send_worst_p95_ms")
    if over(queue_age_p95, 500.0):
        return "client_queue_or_bundle_generation"

    # Upstream layer: server receive age first, then transport lag.
    sync_layer = summary.get("upstream_sync") or {}
    ingress_age_p95 = sync_layer.get("server_receive_age_p95_ms")
    lag_p95 = sync_layer.get("media_transport_lag_p95_ms")
    if over(ingress_age_p95, 500.0):
        return "server_receive_or_ingress_queue"
    if over(lag_p95, 1_000.0):
        return "client_to_server_transport"

    # Only blame the post-send path once the upstream checks are clear.
    if over(after_send_p95, 500.0):
        return "post_client_send_to_rct_path"
    return "needs_deeper_introspection"