test: tighten adaptive probe evidence

This commit is contained in:
Brad Stein 2026-05-02 17:07:41 -03:00
parent c82c61c652
commit c874ddbf99
8 changed files with 98 additions and 18 deletions

View File

@ -479,3 +479,19 @@ low paired-pulse counts as product failure.
- [x] Update manual probe contract coverage for the audio-gain control.
- [x] Run focused analyzer/manual-probe tests and package checks.
- [x] Push clean semver `0.17.23` for installed client/server testing.
## 0.17.24 Probe Truthfulness And Localization Checklist
Context: the 0.17.23 run proved adaptive calibration is now live-editing the server,
but confirmation still failed. Segment 3 passed and triggered a provisional calibration
nudge, while the confirmation segment failed with a near-centered median skew but high p95 absolute skew and drift.
This means the fastest high-quality path forward is building localization tooling, not
attempting another static offset guess.
- [x] Treat the latest failure as timing instability/outlier drift until the probe proves otherwise.
- [x] Fix analyzer-failure raw activity delta parsing so bounded raw-delta calibration can use the evidence it prints.
- [x] Stop marking `blind-targets.json` ready from calibration-only passes when confirmation segments exist and fail.
- [x] Emit combined `segment-events.csv` and `segment-events.jsonl` artifacts so each run exposes per-pulse skew and confidence across segments.
- [ ] Use the next run to decide whether bad p95 is caused by low-confidence analyzer pairings, camera/mic capture instability, or server planner/output jitter.
- [ ] Add stage-local timing evidence for stimulus schedule, client capture onsets, server output timing, and browser/device capture if the event table still cannot isolate the source.
- [ ] Only save calibration defaults after a confirmation segment passes.

6
Cargo.lock generated
View File

@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "lesavka_client"
version = "0.17.23"
version = "0.17.24"
dependencies = [
"anyhow",
"async-stream",
@ -1686,7 +1686,7 @@ dependencies = [
[[package]]
name = "lesavka_common"
version = "0.17.23"
version = "0.17.24"
dependencies = [
"anyhow",
"base64",
@ -1698,7 +1698,7 @@ dependencies = [
[[package]]
name = "lesavka_server"
version = "0.17.23"
version = "0.17.24"
dependencies = [
"anyhow",
"base64",

View File

@ -4,7 +4,7 @@ path = "src/main.rs"
[package]
name = "lesavka_client"
version = "0.17.23"
version = "0.17.24"
edition = "2024"
[dependencies]

View File

@ -1,6 +1,6 @@
[package]
name = "lesavka_common"
version = "0.17.23"
version = "0.17.24"
edition = "2024"
build = "build.rs"

View File

@ -260,8 +260,8 @@ if not reason:
reason = lines[-1].strip() if lines else "analyzer failed"
raw_match = re.search(
r"raw activity delta was ([+-]?[0-9]+(?:\\.[0-9]+)?) ms "
r"\\(video=([0-9]+(?:\\.[0-9]+)?)s audio=([0-9]+(?:\\.[0-9]+)?)s\\)",
r"raw activity delta was ([+-]?[0-9]+(?:\.[0-9]+)?) ms "
r"\(video=([0-9]+(?:\.[0-9]+)?)s audio=([0-9]+(?:\.[0-9]+)?)s\)",
text,
)
paired_match = re.search(r"saw ([0-9]+)", reason)

View File

@ -789,6 +789,7 @@ def range_for(rows, key):
rows = []
event_rows = []
for segment in range(1, segment_count + 1):
segment_dir = root / f"segment-{segment}"
report_path = latest_report(segment_dir)
@ -857,6 +858,21 @@ for segment in range(1, segment_count + 1):
}
rows.append(row)
for event in report.get("paired_events", []):
if not isinstance(event, dict):
continue
event_rows.append({
"segment": segment,
"segment_phase": phase,
"probe_status": row["probe_status"],
"probe_passed": row["probe_passed"],
"event_id": event.get("event_id"),
"video_time_s": as_float(str(event.get("video_time_s", ""))),
"audio_time_s": as_float(str(event.get("audio_time_s", ""))),
"skew_ms": as_float(str(event.get("skew_ms", ""))),
"confidence": as_float(str(event.get("confidence", ""))),
})
csv_path = root / "segment-metrics.csv"
jsonl_path = root / "segment-metrics.jsonl"
fieldnames = list(rows[0].keys()) if rows else ["segment"]
@ -868,21 +884,50 @@ with jsonl_path.open("w", encoding="utf-8") as handle:
for row in rows:
handle.write(json.dumps(row, sort_keys=True) + "\n")
events_csv_path = root / "segment-events.csv"
events_jsonl_path = root / "segment-events.jsonl"
event_fieldnames = list(event_rows[0].keys()) if event_rows else [
"segment",
"segment_phase",
"probe_status",
"probe_passed",
"event_id",
"video_time_s",
"audio_time_s",
"skew_ms",
"confidence",
]
with events_csv_path.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=event_fieldnames)
writer.writeheader()
writer.writerows(event_rows)
with events_jsonl_path.open("w", encoding="utf-8") as handle:
for row in event_rows:
handle.write(json.dumps(row, sort_keys=True) + "\n")
good_rows = [row for row in rows if row.get("probe_passed")]
confirmation_rows = [row for row in rows if row.get("segment_phase") == "confirmation"]
passing_confirmation_rows = [row for row in confirmation_rows if row.get("probe_passed")]
target_path = root / "blind-targets.json"
if good_rows:
target_source_rows = passing_confirmation_rows if confirmation_rows else good_rows
if target_source_rows:
target = {
"ready": True,
"source": "probe-passing segmented mirrored run",
"good_segments": [row["segment"] for row in good_rows],
"planner_live_lag_ms_after": range_for(good_rows, "planner_live_lag_ms_after"),
"planner_skew_ms_after": range_for(good_rows, "planner_skew_ms_after"),
"active_audio_offset_us_after": range_for(good_rows, "active_audio_offset_us_after"),
"active_video_offset_us_after": range_for(good_rows, "active_video_offset_us_after"),
"probe_p95_abs_skew_ms": range_for(good_rows, "probe_p95_abs_skew_ms"),
"probe_median_skew_ms": range_for(good_rows, "probe_median_skew_ms"),
"source": (
"passing confirmation segment"
if confirmation_rows else
"probe-passing segmented mirrored run"
),
"good_segments": [row["segment"] for row in target_source_rows],
"candidate_good_calibration_segments": [
row["segment"] for row in good_rows if row.get("segment_phase") != "confirmation"
],
"planner_live_lag_ms_after": range_for(target_source_rows, "planner_live_lag_ms_after"),
"planner_skew_ms_after": range_for(target_source_rows, "planner_skew_ms_after"),
"active_audio_offset_us_after": range_for(target_source_rows, "active_audio_offset_us_after"),
"active_video_offset_us_after": range_for(target_source_rows, "active_video_offset_us_after"),
"probe_p95_abs_skew_ms": range_for(target_source_rows, "probe_p95_abs_skew_ms"),
"probe_median_skew_ms": range_for(target_source_rows, "probe_median_skew_ms"),
}
else:
sortable = [
@ -892,8 +937,15 @@ else:
best = min(sortable, key=lambda row: row["probe_p95_abs_skew_ms"], default=None)
target = {
"ready": False,
"reason": "no segment produced a passing probe verdict; refusing to invent blind targets",
"reason": (
"confirmation did not pass; refusing to promote calibration-only segments to blind targets"
if confirmation_rows else
"no segment produced a passing probe verdict; refusing to invent blind targets"
),
"segments_seen": len(rows),
"candidate_good_calibration_segments": [
row["segment"] for row in good_rows if row.get("segment_phase") != "confirmation"
],
"best_segment": best["segment"] if best else None,
"best_probe_status": best["probe_status"] if best else "missing",
"best_probe_p95_abs_skew_ms": best["probe_p95_abs_skew_ms"] if best else None,
@ -930,6 +982,8 @@ confirmation_path.write_text(json.dumps(confirmation, indent=2, sort_keys=True)
print(f" ↪ segment_metrics_csv={csv_path}")
print(f" ↪ segment_metrics_jsonl={jsonl_path}")
print(f" ↪ segment_events_csv={events_csv_path}")
print(f" ↪ segment_events_jsonl={events_jsonl_path}")
print(f" ↪ blind_targets_json={target_path}")
print(f" ↪ blind_targets_ready={str(bool(target.get('ready'))).lower()}")
print(f" ↪ confirmation_summary_json={confirmation_path}")

View File

@ -10,7 +10,7 @@ bench = false
[package]
name = "lesavka_server"
version = "0.17.23"
version = "0.17.24"
edition = "2024"
autobins = false

View File

@ -88,12 +88,18 @@ fn browser_sync_script_can_delegate_to_a_real_path_driver() {
"for attempt in 1 2 3 4 5",
"capture fetch attempt ${attempt} failed; retrying",
"failed to fetch browser capture from ${TETHYS_HOST}:${REMOTE_CAPTURE}",
r"raw activity delta was ([+-]?[0-9]+(?:\.[0-9]+)?) ms ",
r"\(video=([0-9]+(?:\.[0-9]+)?)s audio=([0-9]+(?:\.[0-9]+)?)s\)",
] {
assert!(
BROWSER_SYNC_SCRIPT.contains(expected),
"browser sync script should contain {expected}"
);
}
assert!(
!BROWSER_SYNC_SCRIPT.contains(r"(?:\\.[0-9]+)?"),
"browser sync raw-delta parser should not require a literal backslash before decimals"
);
}
#[test]
@ -162,6 +168,8 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
"calibration-decision.env",
"segment-metrics.csv",
"segment-metrics.jsonl",
"segment-events.csv",
"segment-events.jsonl",
"confirmation-summary.json",
"confirmation_passed",
"check_confirmation_result",
@ -170,6 +178,8 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
"probe_activity_start_delta_ms",
"blind-targets.json",
"no segment produced a passing probe verdict; refusing to invent blind targets",
"confirmation did not pass; refusing to promote calibration-only segments to blind targets",
"candidate_good_calibration_segments",
"decision_mode",
"decision_provisional_video_recommendation_us",
"planner_live_lag_ms_after",