# lesavka/scripts/manual/client_rct_transport_layers.py
#
# 129 lines
# 4.9 KiB
# Python
# Executable File

"""Layer attribution helpers for client-to-RCT transport summaries."""
from __future__ import annotations
import json
import math
import pathlib
def percentile(values: list[float], q: float) -> float | None:
"""Return a nearest-rank percentile for finite values.
Inputs: numeric samples and a quantile in `[0, 1]`. Outputs: the selected
percentile or `None`. Why: stage attribution should use the same p95 style
as the main transport summary without coupling this helper back to the CLI.
"""
finite = sorted(value for value in values if math.isfinite(value))
if not finite:
return None
index = min(len(finite) - 1, max(0, math.ceil(len(finite) * q) - 1))
return finite[index]
def client_send_summary(report_path: pathlib.Path, joined: list[dict]) -> dict | None:
    """Summarize client-side bundle send timing from optional JSONL artifacts.

    Inputs: the final report path, used to find sibling
    `client-send-bundles.jsonl`, and joined RCT events. Outputs: local queue
    age plus post-send-to-RCT age percentiles, or `None` when the send log is
    missing or holds no usable rows. Why: if final freshness fails, the next
    question is whether delay existed before the client wrote the gRPC bundle
    or appeared after the bundle left the client.
    """
    send_path = report_path.parent / "client-send-bundles.jsonl"
    if not send_path.exists():
        return None

    rows_by_video_pts: dict[int, dict] = {}
    local_ages: list[float] = []
    # Decode explicitly as UTF-8 so parsing does not depend on the host
    # locale; undecodable bytes are replaced and the affected line is then
    # skipped by the JSON parser or the field checks below.
    for line in send_path.read_text(encoding="utf-8", errors="replace").splitlines():
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A line can be valid JSON without being an object (e.g. `123`, `[]`,
        # `null`); such values have no `.get` and are treated as malformed.
        if not isinstance(row, dict):
            continue
        if row.get("schema") != "lesavka.sync-probe-send.v1":
            continue
        try:
            video_pts = int(row["video_capture_pts_us"])
            local_age = float(row["local_age_ms"])
        except (KeyError, TypeError, ValueError, OverflowError):
            # OverflowError covers int() of an infinite float such as 1e400.
            continue
        # Later rows with the same capture PTS replace earlier ones in the
        # lookup, while every accepted row contributes a local-age sample.
        rows_by_video_pts[video_pts] = row
        local_ages.append(local_age)
    if not rows_by_video_pts:
        return None

    post_send_video_ages: list[float] = []
    post_send_audio_ages: list[float] = []
    joined_with_send_rows = 0
    for event in joined:
        planned_start_us = event.get("client_planned_start_us")
        if planned_start_us is None:
            continue
        # Events are matched to send rows by planned start == video capture PTS.
        row = rows_by_video_pts.get(int(planned_start_us))
        if row is None:
            continue
        joined_with_send_rows += 1
        # Already validated as float-convertible when the row was accepted.
        local_age = float(row["local_age_ms"])
        if event.get("video_age_ms") is not None:
            post_send_video_ages.append(float(event["video_age_ms"]) - local_age)
        if event.get("audio_age_ms") is not None:
            post_send_audio_ages.append(float(event["audio_age_ms"]) - local_age)

    video_p95 = percentile(post_send_video_ages, 0.95)
    audio_p95 = percentile(post_send_audio_ages, 0.95)
    # Both percentiles are None when there are no samples OR when every sample
    # is non-finite; `default=None` keeps max() from raising on that case.
    post_send_worst = max(
        (value for value in (video_p95, audio_p95) if value is not None),
        default=None,
    )
    # Mirror the percentile helper by ignoring NaN/inf, so the max is not
    # order-dependent (NaN) or unrepresentable in strict JSON (inf).
    finite_local_ages = [age for age in local_ages if math.isfinite(age)]
    return {
        "bundle_count": len(rows_by_video_pts),
        "joined_event_count": joined_with_send_rows,
        "local_bundle_age_p95_ms": percentile(local_ages, 0.95),
        "local_bundle_age_max_ms": max(finite_local_ages, default=None),
        "post_client_send_video_age_p95_ms": video_p95,
        "post_client_send_audio_age_p95_ms": audio_p95,
        "post_client_send_worst_p95_ms": post_send_worst,
    }
def freshness_bottleneck(
    summary: dict,
    *,
    client_queue_limit_ms: float = 500.0,
    server_receive_limit_ms: float = 500.0,
    transport_lag_limit_ms: float = 1_000.0,
    post_send_limit_ms: float = 500.0,
) -> str:
    """Classify the most likely freshness bottleneck from available artifacts.

    Inputs: the structured summary assembled from RCT capture, client send log,
    and optional upstream-sync samples, plus optional keyword-only thresholds
    (milliseconds) for each stage; the defaults are 500 / 500 / 1000 / 500.
    Outputs: a short machine-readable label.
    Why: first-pass client->server work should remain black-box, but failed runs
    should still point us toward client queueing, transport/server receive, or
    post-send output/RCT delay before we add invasive introspection.

    Checks run in a fixed order and the first match wins: passed run,
    insufficient paired events, client-local age, server receive age,
    transport lag, post-send age. Missing or `None` metrics never match.
    """
    if summary.get("freshness_passed"):
        return "within_limit"

    # Treat a missing or None count as zero, matching how the nested
    # sections below tolerate None, instead of raising on `None < int`.
    paired_events = summary.get("paired_event_count") or 0
    required_events = summary.get("min_paired_events") or 0
    if paired_events < required_events:
        return "evidence_incomplete"

    client_send = summary.get("client_send") or {}
    local_age = client_send.get("local_bundle_age_p95_ms")
    post_send = client_send.get("post_client_send_worst_p95_ms")
    if local_age is not None and local_age > client_queue_limit_ms:
        return "client_queue_or_bundle_generation"

    upstream = summary.get("upstream_sync") or {}
    receive_age = upstream.get("server_receive_age_p95_ms")
    transport_lag = upstream.get("media_transport_lag_p95_ms")
    if receive_age is not None and receive_age > server_receive_limit_ms:
        return "server_receive_or_ingress_queue"
    if transport_lag is not None and transport_lag > transport_lag_limit_ms:
        return "client_to_server_transport"

    # Post-send delay is checked last, so upstream evidence takes precedence.
    if post_send is not None and post_send > post_send_limit_ms:
        return "post_client_send_to_rct_path"
    return "needs_deeper_introspection"