fix: refuse raw-only probe calibration by default

2026-05-02 20:05:24 -03:00 · 2026-05-02 20:05:24 -03:00 · 5634e7197d
commit 5634e7197d
parent 0188c8661b
7 changed files with 74 additions and 10 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -551,3 +551,33 @@ stayed empty, and client timing skew included a false cross-pipeline PTS offset.
 - [x] Add tests proving sink handoff survives large offset-compensated local PTS gaps.
 - [x] Add tests proving audio/video timing metadata no longer copies packet PTS domains into blind sidecar fields.
 - [ ] Next mirrored run should show non-zero `planner_sink_handoff_window_samples` and much smaller client send/capture p95 skew before trusting blind healing.
+## 0.17.29 Enqueue-Bound Client Timing Checklist
+Context: the first blind-healing runs showed huge client capture/send skew even though media packets
+were latest-only. The sidecar timestamps were being written in async sender tasks after queueing, so
+parallel scheduling delay leaked into the diagnostic clock and made blind healing distrust the wrong
+layer.
+- [x] Stamp client timing metadata at the capture/enqueue boundary instead of the async gRPC send boundary.
+- [x] Keep async sender updates limited to queue depth and queue age so scheduling delay stays observable but does not rewrite capture/send time.
+- [x] Pair server-side client timing samples by nearby enqueue/send time before reporting rolling skew windows.
+- [x] Add regression tests proving queue delay no longer changes capture/send timestamps.
+- [x] Push clean semver `0.17.29` for installed client/server testing.
+- [x] Use the next mirrored run to confirm client capture/send p95 drops from seconds to single-digit milliseconds.
+## 0.17.30 Raw-Failure Calibration Safety Checklist
+Context: the 0.17.29 mirrored run confirmed the client-side scheduling leak is fixed, but the probe
+then applied large opposite calibration nudges from analyzer failures with zero or one coded pair.
+Raw activity deltas are useful diagnostic breadcrumbs; they are not safe steering evidence when coded
+pairing collapses.
+- [x] Treat the 0.17.29 run as proof that client sidecar timing is now trustworthy enough to move the investigation downstream.
+- [x] Default raw analyzer-failure calibration to off instead of inheriting provisional calibration.
+- [x] Add `LESAVKA_SYNC_RAW_FAILURE_MIN_PAIRS` so even explicit raw-failure calibration refuses weak coded evidence.
+- [x] Print the raw-failure pair floor in calibration decisions and segment artifacts.
+- [x] Prefer server-side receive/sink blockers over probe-pairing blockers when root-cause evidence is available.
+- [x] Update manual probe contract coverage for the safer defaults and refusal reason.
+- [ ] Re-run the probe-calibrate-confirm flow; analyzer failures should diagnose but not mutate calibration unless raw fallback is explicitly enabled and has enough coded support.
+- [ ] If client send/capture p95 stays low and server receive p95 stays high, localize the transport/server-receive timing layer next.
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
 [[package]]
 name = "lesavka_client"
-version = "0.17.29"
+version = "0.17.30"
 dependencies = [
 "anyhow",
 "async-stream",
@@ -1686,7 +1686,7 @@ dependencies = [
 [[package]]
 name = "lesavka_common"
-version = "0.17.29"
+version = "0.17.30"
 dependencies = [
 "anyhow",
 "base64",
@@ -1698,7 +1698,7 @@ dependencies = [
 [[package]]
 name = "lesavka_server"
-version = "0.17.29"
+version = "0.17.30"
 dependencies = [
 "anyhow",
 "base64",
--- a/client/Cargo.toml
+++ b/client/Cargo.toml
@@ -4,7 +4,7 @@ path                    = "src/main.rs"
 [package]
 name                    = "lesavka_client"
-version                 = "0.17.29"
+version                 = "0.17.30"
 edition                 = "2024"
 [dependencies]
--- a/common/Cargo.toml
+++ b/common/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name                    = "lesavka_common"
-version                 = "0.17.29"
+version                 = "0.17.30"
 edition                 = "2024"
 build                   = "build.rs"
--- a/scripts/manual/run_upstream_mirrored_av_sync.sh
+++ b/scripts/manual/run_upstream_mirrored_av_sync.sh
@@ -39,7 +39,8 @@ LESAVKA_SYNC_PROVISIONAL_MAX_P95_MS=${LESAVKA_SYNC_PROVISIONAL_MAX_P95_MS:-350}
 LESAVKA_SYNC_PROVISIONAL_MAX_DRIFT_MS=${LESAVKA_SYNC_PROVISIONAL_MAX_DRIFT_MS:-250}
 LESAVKA_SYNC_PROVISIONAL_GAIN=${LESAVKA_SYNC_PROVISIONAL_GAIN:-0.5}
 LESAVKA_SYNC_PROVISIONAL_MAX_STEP_US=${LESAVKA_SYNC_PROVISIONAL_MAX_STEP_US:-150000}
-LESAVKA_SYNC_RAW_FAILURE_CALIBRATION=${LESAVKA_SYNC_RAW_FAILURE_CALIBRATION:-${LESAVKA_SYNC_PROVISIONAL_CALIBRATION}}
+LESAVKA_SYNC_RAW_FAILURE_CALIBRATION=${LESAVKA_SYNC_RAW_FAILURE_CALIBRATION:-0}
+LESAVKA_SYNC_RAW_FAILURE_MIN_PAIRS=${LESAVKA_SYNC_RAW_FAILURE_MIN_PAIRS:-3}
 LESAVKA_SYNC_RAW_FAILURE_MAX_ABS_DELTA_MS=${LESAVKA_SYNC_RAW_FAILURE_MAX_ABS_DELTA_MS:-350}
 LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION=${LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION:-${LESAVKA_SYNC_ADAPTIVE_CALIBRATION}}
 LESAVKA_SYNC_CONFIRMATION_SEGMENTS=${LESAVKA_SYNC_CONFIRMATION_SEGMENTS:-1}
@@ -88,6 +89,7 @@ export LESAVKA_SYNC_PROVISIONAL_MAX_DRIFT_MS
 export LESAVKA_SYNC_PROVISIONAL_GAIN
 export LESAVKA_SYNC_PROVISIONAL_MAX_STEP_US
 export LESAVKA_SYNC_RAW_FAILURE_CALIBRATION
+export LESAVKA_SYNC_RAW_FAILURE_MIN_PAIRS
 export LESAVKA_SYNC_RAW_FAILURE_MAX_ABS_DELTA_MS
 export LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION
 export LESAVKA_SYNC_CONFIRMATION_SEGMENTS
@@ -400,6 +402,7 @@ provisional_max_drift_ms = env_float("LESAVKA_SYNC_PROVISIONAL_MAX_DRIFT_MS", 25
 provisional_gain = env_float("LESAVKA_SYNC_PROVISIONAL_GAIN", 0.5)
 provisional_max_step_us = env_int("LESAVKA_SYNC_PROVISIONAL_MAX_STEP_US", 150000)
 raw_failure_enabled = env_bool("LESAVKA_SYNC_RAW_FAILURE_CALIBRATION", False)
+raw_failure_min_pairs = env_int("LESAVKA_SYNC_RAW_FAILURE_MIN_PAIRS", 3)
 raw_failure_max_abs_delta_ms = env_float("LESAVKA_SYNC_RAW_FAILURE_MAX_ABS_DELTA_MS", 350.0)
 ready_audio_recommendation = int(cal.get("recommended_audio_offset_adjust_us") or 0)
@@ -432,6 +435,11 @@ decision_note = "analyzer marked this report calibration-ready" if ready else "a
 if not report:
    if not raw_failure_enabled:
        decision_note = "raw analyzer-failure calibration disabled"
+    elif paired_pulses < raw_failure_min_pairs:
+        decision_note = (
+            "raw analyzer-failure calibration refused: "
+            f"paired_pulses {paired_pulses} < {raw_failure_min_pairs}"
+        )
    elif raw_activity_delta_ms is None:
        decision_note = "raw analyzer-failure calibration refused: no raw activity delta was reported"
    elif abs(raw_activity_delta_ms) > raw_failure_max_abs_delta_ms:
@@ -509,6 +517,7 @@ fields = {
    "provisional_gain": f"{provisional_gain:.3f}",
    "provisional_max_step_us": provisional_max_step_us,
    "raw_failure_calibration_enabled": str(raw_failure_enabled).lower(),
+    "raw_failure_min_pairs": raw_failure_min_pairs,
    "raw_failure_max_abs_delta_ms": f"{raw_failure_max_abs_delta_ms:.1f}",
    "raw_activity_delta_ms": "" if raw_activity_delta_ms is None else f"{raw_activity_delta_ms:+.1f}",
    "analysis_failure_reason": failure_reason,
@@ -552,6 +561,7 @@ PY
  echo "   ↪ provisional_gain=${provisional_gain}"
  echo "   ↪ provisional_max_step_us=${provisional_max_step_us}"
  echo "   ↪ raw_failure_calibration_enabled=${raw_failure_calibration_enabled}"
+  echo "   ↪ raw_failure_min_pairs=${raw_failure_min_pairs}"
  echo "   ↪ raw_failure_max_abs_delta_ms=${raw_failure_max_abs_delta_ms}"
  [[ -n "${raw_activity_delta_ms}" ]] && echo "   ↪ raw_activity_delta_ms=${raw_activity_delta_ms}"
  [[ -n "${analysis_failure_reason}" ]] && echo "   ↪ analysis_failure_reason=${analysis_failure_reason}"
@@ -905,7 +915,7 @@ def diagnose_segment(row):
            "blocker",
            "server_receive_skew_p95_high",
            "Audio/video timing becomes unstable between client send and server receive.",
-            "Treat this as network/gRPC receive jitter; heal freshness with drop/reanchor policy, not static calibration.",
+            "Treat this as transport/server receive jitter; heal freshness with drop/reanchor policy, not static calibration.",
        )
    if over(row, "planner_camera_sink_late_p95_ms_after", 120) or over(row, "planner_microphone_sink_late_p95_ms_after", 120):
        add_finding(
@@ -981,7 +991,26 @@ def diagnose_segment(row):
 def primary_finding(findings):
    severity_rank = {"blocker": 0, "warning": 1, "info": 2}
-    return sorted(findings, key=lambda item: severity_rank.get(item.get("severity"), 9))[0]
+    layer_rank = {
+        "client_uplink": 0,
+        "network_receive": 1,
+        "server_sink_scheduler": 2,
+        "server_sink_handoff": 3,
+        "server_evidence": 4,
+        "server_calibration": 5,
+        "external_boundary": 6,
+        "probe_video": 7,
+        "probe_pairing": 8,
+        "unknown": 9,
+        "none": 10,
+    }
+    return sorted(
+        findings,
+        key=lambda item: (
+            severity_rank.get(item.get("severity"), 9),
+            layer_rank.get(item.get("layer"), 9),
+        ),
+    )[0]
 rows = []
--- a/server/Cargo.toml
+++ b/server/Cargo.toml
@@ -10,7 +10,7 @@ bench                   = false
 [package]
 name                    = "lesavka_server"
-version                 = "0.17.29"
+version                 = "0.17.30"
 edition                 = "2024"
 autobins                = false
--- a/testing/tests/client_manual_sync_script_contract.rs
+++ b/testing/tests/client_manual_sync_script_contract.rs
@@ -140,7 +140,8 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
        "LESAVKA_SYNC_PROVISIONAL_MAX_DRIFT_MS=${LESAVKA_SYNC_PROVISIONAL_MAX_DRIFT_MS:-250}",
        "LESAVKA_SYNC_PROVISIONAL_GAIN=${LESAVKA_SYNC_PROVISIONAL_GAIN:-0.5}",
        "LESAVKA_SYNC_PROVISIONAL_MAX_STEP_US=${LESAVKA_SYNC_PROVISIONAL_MAX_STEP_US:-150000}",
-        "LESAVKA_SYNC_RAW_FAILURE_CALIBRATION=${LESAVKA_SYNC_RAW_FAILURE_CALIBRATION:-${LESAVKA_SYNC_PROVISIONAL_CALIBRATION}}",
+        "LESAVKA_SYNC_RAW_FAILURE_CALIBRATION=${LESAVKA_SYNC_RAW_FAILURE_CALIBRATION:-0}",
+        "LESAVKA_SYNC_RAW_FAILURE_MIN_PAIRS=${LESAVKA_SYNC_RAW_FAILURE_MIN_PAIRS:-3}",
        "LESAVKA_SYNC_RAW_FAILURE_MAX_ABS_DELTA_MS=${LESAVKA_SYNC_RAW_FAILURE_MAX_ABS_DELTA_MS:-350}",
        "LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION=${LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION:-${LESAVKA_SYNC_ADAPTIVE_CALIBRATION}}",
        "LESAVKA_SYNC_CONFIRMATION_SEGMENTS=${LESAVKA_SYNC_CONFIRMATION_SEGMENTS:-1}",
@@ -149,6 +150,7 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
        "LESAVKA_SYNC_TOTAL_SEGMENTS=$((LESAVKA_SYNC_CALIBRATION_SEGMENTS + LESAVKA_SYNC_CONFIRMATION_SEGMENTS))",
        "export LESAVKA_SYNC_PROVISIONAL_CALIBRATION",
        "export LESAVKA_SYNC_RAW_FAILURE_CALIBRATION",
+        "export LESAVKA_SYNC_RAW_FAILURE_MIN_PAIRS",
        "LESAVKA_SYNC_ADAPTIVE_CALIBRATION",
        "LESAVKA_SYNC_CALIBRATION_SEGMENTS=4",
        "browser_consumer_reuse_session=${reuse_browser_session}",
@@ -184,6 +186,7 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
        "decision_provisional_video_recommendation_us",
        "planner_live_lag_ms_after",
        "probe_p95_abs_skew_ms",
+        "transport/server receive jitter",
        "settling ${LESAVKA_SYNC_SEGMENT_SETTLE_SECONDS}s before next segment",
        "print_upstream_calibration_state \"before mirrored run\"",
        "maybe_apply_probe_calibration",
@@ -192,6 +195,8 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
        "bounded provisional correction from median skew",
        "bounded provisional correction from analyzer-failure raw activity",
        "raw_failure_calibration_enabled",
+        "raw analyzer-failure calibration refused: ",
+        "raw_failure_min_pairs",
        "provisional calibration not saved",
        "calibration apply refused: ${calibration_decision_note}",
        "calibrate \"${calibration_apply_audio_delta_us}\" \"${calibration_apply_video_delta_us}\"",