test: tighten adaptive probe evidence

This commit is contained in:
Brad Stein 2026-05-02 17:07:41 -03:00
parent c82c61c652
commit c874ddbf99
8 changed files with 98 additions and 18 deletions

View File

@ -479,3 +479,19 @@ low paired-pulse counts as product failure.
- [x] Update manual probe contract coverage for the audio-gain control.
- [x] Run focused analyzer/manual-probe tests and package checks.
- [x] Push clean semver `0.17.23` for installed client/server testing.
## 0.17.24 Probe Truthfulness And Localization Checklist
Context: the 0.17.23 run proved adaptive calibration is now live-editing the server,
but confirmation still failed. Segment 3 passed and triggered a provisional calibration
nudge, while the confirmation segment failed with a near-centered median skew but high p95 absolute skew and drift.
This means the fastest high-quality path forward is building localization tooling, not
attempting another static offset guess.
- [x] Treat the latest failure as timing instability/outlier drift until the probe proves otherwise.
- [x] Fix analyzer-failure raw activity delta parsing so bounded raw-delta calibration can use the evidence it prints.
- [x] Stop marking `blind-targets.json` ready from calibration-only passes when confirmation segments exist and fail.
- [x] Emit combined `segment-events.csv` and `segment-events.jsonl` artifacts so each run exposes per-pulse skew and confidence across segments.
- [ ] Use the next run to decide whether bad p95 is caused by low-confidence analyzer pairings, camera/mic capture instability, or server planner/output jitter.
- [ ] Add stage-local timing evidence for stimulus schedule, client capture onsets, server output timing, and browser/device capture if the event table still cannot isolate the source.
- [ ] Only save calibration defaults after a confirmation segment passes.

6
Cargo.lock generated
View File

@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "lesavka_client"
version = "0.17.23"
version = "0.17.24"
dependencies = [
"anyhow",
"async-stream",
@ -1686,7 +1686,7 @@ dependencies = [
[[package]]
name = "lesavka_common"
version = "0.17.23"
version = "0.17.24"
dependencies = [
"anyhow",
"base64",
@ -1698,7 +1698,7 @@ dependencies = [
[[package]]
name = "lesavka_server"
version = "0.17.23"
version = "0.17.24"
dependencies = [
"anyhow",
"base64",

View File

@ -4,7 +4,7 @@ path = "src/main.rs"
[package]
name = "lesavka_client"
version = "0.17.23"
version = "0.17.24"
edition = "2024"
[dependencies]

View File

@ -1,6 +1,6 @@
[package]
name = "lesavka_common"
version = "0.17.23"
version = "0.17.24"
edition = "2024"
build = "build.rs"

View File

@ -260,8 +260,8 @@ if not reason:
reason = lines[-1].strip() if lines else "analyzer failed"
raw_match = re.search(
r"raw activity delta was ([+-]?[0-9]+(?:\\.[0-9]+)?) ms "
r"\\(video=([0-9]+(?:\\.[0-9]+)?)s audio=([0-9]+(?:\\.[0-9]+)?)s\\)",
r"raw activity delta was ([+-]?[0-9]+(?:\.[0-9]+)?) ms "
r"\(video=([0-9]+(?:\.[0-9]+)?)s audio=([0-9]+(?:\.[0-9]+)?)s\)",
text,
)
paired_match = re.search(r"saw ([0-9]+)", reason)

View File

@ -789,6 +789,7 @@ def range_for(rows, key):
rows = []
event_rows = []
for segment in range(1, segment_count + 1):
segment_dir = root / f"segment-{segment}"
report_path = latest_report(segment_dir)
@ -857,6 +858,21 @@ for segment in range(1, segment_count + 1):
}
rows.append(row)
for event in report.get("paired_events", []):
if not isinstance(event, dict):
continue
event_rows.append({
"segment": segment,
"segment_phase": phase,
"probe_status": row["probe_status"],
"probe_passed": row["probe_passed"],
"event_id": event.get("event_id"),
"video_time_s": as_float(str(event.get("video_time_s", ""))),
"audio_time_s": as_float(str(event.get("audio_time_s", ""))),
"skew_ms": as_float(str(event.get("skew_ms", ""))),
"confidence": as_float(str(event.get("confidence", ""))),
})
csv_path = root / "segment-metrics.csv"
jsonl_path = root / "segment-metrics.jsonl"
fieldnames = list(rows[0].keys()) if rows else ["segment"]
@ -868,21 +884,50 @@ with jsonl_path.open("w", encoding="utf-8") as handle:
for row in rows:
handle.write(json.dumps(row, sort_keys=True) + "\n")
events_csv_path = root / "segment-events.csv"
events_jsonl_path = root / "segment-events.jsonl"
event_fieldnames = list(event_rows[0].keys()) if event_rows else [
"segment",
"segment_phase",
"probe_status",
"probe_passed",
"event_id",
"video_time_s",
"audio_time_s",
"skew_ms",
"confidence",
]
with events_csv_path.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=event_fieldnames)
writer.writeheader()
writer.writerows(event_rows)
with events_jsonl_path.open("w", encoding="utf-8") as handle:
for row in event_rows:
handle.write(json.dumps(row, sort_keys=True) + "\n")
good_rows = [row for row in rows if row.get("probe_passed")]
confirmation_rows = [row for row in rows if row.get("segment_phase") == "confirmation"]
passing_confirmation_rows = [row for row in confirmation_rows if row.get("probe_passed")]
target_path = root / "blind-targets.json"
if good_rows:
target_source_rows = passing_confirmation_rows if confirmation_rows else good_rows
if target_source_rows:
target = {
"ready": True,
"source": "probe-passing segmented mirrored run",
"good_segments": [row["segment"] for row in good_rows],
"planner_live_lag_ms_after": range_for(good_rows, "planner_live_lag_ms_after"),
"planner_skew_ms_after": range_for(good_rows, "planner_skew_ms_after"),
"active_audio_offset_us_after": range_for(good_rows, "active_audio_offset_us_after"),
"active_video_offset_us_after": range_for(good_rows, "active_video_offset_us_after"),
"probe_p95_abs_skew_ms": range_for(good_rows, "probe_p95_abs_skew_ms"),
"probe_median_skew_ms": range_for(good_rows, "probe_median_skew_ms"),
"source": (
"passing confirmation segment"
if confirmation_rows else
"probe-passing segmented mirrored run"
),
"good_segments": [row["segment"] for row in target_source_rows],
"candidate_good_calibration_segments": [
row["segment"] for row in good_rows if row.get("segment_phase") != "confirmation"
],
"planner_live_lag_ms_after": range_for(target_source_rows, "planner_live_lag_ms_after"),
"planner_skew_ms_after": range_for(target_source_rows, "planner_skew_ms_after"),
"active_audio_offset_us_after": range_for(target_source_rows, "active_audio_offset_us_after"),
"active_video_offset_us_after": range_for(target_source_rows, "active_video_offset_us_after"),
"probe_p95_abs_skew_ms": range_for(target_source_rows, "probe_p95_abs_skew_ms"),
"probe_median_skew_ms": range_for(target_source_rows, "probe_median_skew_ms"),
}
else:
sortable = [
@ -892,8 +937,15 @@ else:
best = min(sortable, key=lambda row: row["probe_p95_abs_skew_ms"], default=None)
target = {
"ready": False,
"reason": "no segment produced a passing probe verdict; refusing to invent blind targets",
"reason": (
"confirmation did not pass; refusing to promote calibration-only segments to blind targets"
if confirmation_rows else
"no segment produced a passing probe verdict; refusing to invent blind targets"
),
"segments_seen": len(rows),
"candidate_good_calibration_segments": [
row["segment"] for row in good_rows if row.get("segment_phase") != "confirmation"
],
"best_segment": best["segment"] if best else None,
"best_probe_status": best["probe_status"] if best else "missing",
"best_probe_p95_abs_skew_ms": best["probe_p95_abs_skew_ms"] if best else None,
@ -930,6 +982,8 @@ confirmation_path.write_text(json.dumps(confirmation, indent=2, sort_keys=True)
print(f" ↪ segment_metrics_csv={csv_path}")
print(f" ↪ segment_metrics_jsonl={jsonl_path}")
print(f" ↪ segment_events_csv={events_csv_path}")
print(f" ↪ segment_events_jsonl={events_jsonl_path}")
print(f" ↪ blind_targets_json={target_path}")
print(f" ↪ blind_targets_ready={str(bool(target.get('ready'))).lower()}")
print(f" ↪ confirmation_summary_json={confirmation_path}")

View File

@ -10,7 +10,7 @@ bench = false
[package]
name = "lesavka_server"
version = "0.17.23"
version = "0.17.24"
edition = "2024"
autobins = false

View File

@ -88,12 +88,18 @@ fn browser_sync_script_can_delegate_to_a_real_path_driver() {
"for attempt in 1 2 3 4 5",
"capture fetch attempt ${attempt} failed; retrying",
"failed to fetch browser capture from ${TETHYS_HOST}:${REMOTE_CAPTURE}",
r"raw activity delta was ([+-]?[0-9]+(?:\.[0-9]+)?) ms ",
r"\(video=([0-9]+(?:\.[0-9]+)?)s audio=([0-9]+(?:\.[0-9]+)?)s\)",
] {
assert!(
BROWSER_SYNC_SCRIPT.contains(expected),
"browser sync script should contain {expected}"
);
}
assert!(
!BROWSER_SYNC_SCRIPT.contains(r"(?:\\.[0-9]+)?"),
"browser sync raw-delta parser should not require a literal backslash before decimals"
);
}
#[test]
@ -162,6 +168,8 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
"calibration-decision.env",
"segment-metrics.csv",
"segment-metrics.jsonl",
"segment-events.csv",
"segment-events.jsonl",
"confirmation-summary.json",
"confirmation_passed",
"check_confirmation_result",
@ -170,6 +178,8 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
"probe_activity_start_delta_ms",
"blind-targets.json",
"no segment produced a passing probe verdict; refusing to invent blind targets",
"confirmation did not pass; refusing to promote calibration-only segments to blind targets",
"candidate_good_calibration_segments",
"decision_mode",
"decision_provisional_video_recommendation_us",
"planner_live_lag_ms_after",