Breakout Env Fidelity Fix v3

objective

{
  "name": "Breakout env fidelity fix v3",
  "policy": "breakout_policy_v3",
  "go_episodes": "breakout/runs-env-fidelity-fix-v3/episodes.jsonl",
  "atari_trace": "breakout/calibration/traces/env_fidelity_fix_v3_atari.jsonl"
}

env_changes

[
  {
    "gap": "paddle/action dynamics mismatch",
    "change": "Widen effective paddle hit window by lowering paddle_hit_inset from 7.5 to 4.",
    "evidence": "Local sweep increased mean paddle contacts and raised first-10 Go score mean from 7.4 to 9.0."
  },
  {
    "gap": "ball speed and collision rhythm mismatch",
    "change": "Lower paddle_bounce_y from 10.0 to 3.75.",
    "evidence": "Local sweep reduced post-paddle vertical speed and extended mean first-10 Go episode length from 492.2 to 889.8 steps."
  },
  {
    "gap": "brick-state / score rhythm mismatch",
    "change": "Raise brick_cooldown from 12 to 90.",
    "evidence": "With the paddle/bounce fix, higher cooldown kept first-10 Go score mean at 9.0 instead of overshooting into the 12-20 range."
  }
]

rejected_change

{
  "gap": "episode horizon / life-reset termination mismatch",
  "candidate": "Continue episodes after lives are exhausted.",
  "reason": "It aligned the 1800-step horizon but inflated Go scores above 18 on the first 10 seeds, worsening score rhythm. The terminal semantics gap remains open for a targeted reset/life study."
}

comparison

{
  "baseline_go_20_seed": {
    "distribution": {
      "count": 20,
      "scores": [
        8.0,
        7.0,
        6.0,
        7.0,
        7.0,
        8.0,
        7.0,
        9.0,
        8.0,
        7.0,
        9.0,
        7.0,
        7.0,
        6.0,
        7.0,
        7.0,
        7.0,
        6.0,
        7.0,
        7.0
      ],
      "mean": 7.2,
      "median": 7.0,
      "min": 6.0,
      "max": 9.0,
      "stdev": 0.812403840463596,
      "low_score_threshold": 3.0,
      "low_score_count": 0
    },
    "rows": [
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 0,
        "episode": 0,
        "score": 8,
        "steps": 505,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep000_seed0.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 1,
        "episode": 1,
        "score": 7,
        "steps": 492,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep001_seed1.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 2,
        "episode": 2,
        "score": 6,
        "steps": 467,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep002_seed2.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 3,
        "episode": 3,
        "score": 7,
        "steps": 483,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep003_seed3.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 4,
        "episode": 4,
        "score": 7,
        "steps": 482,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep004_seed4.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 5,
        "episode": 5,
        "score": 8,
        "steps": 505,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep005_seed5.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 6,
        "episode": 6,
        "score": 7,
        "steps": 485,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep006_seed6.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 7,
        "episode": 7,
        "score": 9,
        "steps": 516,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep007_seed7.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 8,
        "episode": 8,
        "score": 8,
        "steps": 499,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep008_seed8.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 9,
        "episode": 9,
        "score": 7,
        "steps": 488,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep009_seed9.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 10,
        "episode": 10,
        "score": 9,
        "steps": 516,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep010_seed10.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 11,
        "episode": 11,
        "score": 7,
        "steps": 483,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep011_seed11.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 12,
        "episode": 12,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep012_seed12.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 13,
        "episode": 13,
        "score": 6,
        "steps": 465,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep013_seed13.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 14,
        "episode": 14,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep014_seed14.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 15,
        "episode": 15,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep015_seed15.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 16,
        "episode": 16,
        "score": 7,
        "steps": 477,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep016_seed16.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 17,
        "episode": 17,
        "score": 6,
        "steps": 468,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep017_seed17.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 18,
        "episode": 18,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep018_seed18.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 19,
        "episode": 19,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep019_seed19.jsonl"
      }
    ],
    "low_score_seeds": []
  },
  "after_go": {
    "distribution": {
      "count": 10,
      "scores": [
        7.0,
        6.0,
        15.0,
        13.0,
        9.0,
        7.0,
        9.0,
        8.0,
        8.0,
        8.0
      ],
      "mean": 9.0,
      "median": 8.0,
      "min": 6.0,
      "max": 15.0,
      "stdev": 2.6832815729997477,
      "low_score_count_lt3": 0
    },
    "steps_mean": 889.8,
    "done_count": 10,
    "event_means": {
      "life_lost": 5.0,
      "paddle": 4.0,
      "brick": 9.0,
      "wall": 13.9
    },
    "rows": [
      {
        "seed": 0,
        "score": 7,
        "steps": 688,
        "done": true,
        "life_lost": 5,
        "paddle": 2,
        "brick": 7,
        "wall": 13
      },
      {
        "seed": 1,
        "score": 6,
        "steps": 575,
        "done": true,
        "life_lost": 5,
        "paddle": 1,
        "brick": 6,
        "wall": 11
      },
      {
        "seed": 2,
        "score": 15,
        "steps": 1481,
        "done": true,
        "life_lost": 5,
        "paddle": 10,
        "brick": 15,
        "wall": 19
      },
      {
        "seed": 3,
        "score": 13,
        "steps": 1248,
        "done": true,
        "life_lost": 5,
        "paddle": 8,
        "brick": 13,
        "wall": 16
      },
      {
        "seed": 4,
        "score": 9,
        "steps": 924,
        "done": true,
        "life_lost": 5,
        "paddle": 4,
        "brick": 9,
        "wall": 13
      },
      {
        "seed": 5,
        "score": 7,
        "steps": 688,
        "done": true,
        "life_lost": 5,
        "paddle": 2,
        "brick": 7,
        "wall": 13
      },
      {
        "seed": 6,
        "score": 9,
        "steps": 866,
        "done": true,
        "life_lost": 5,
        "paddle": 4,
        "brick": 9,
        "wall": 14
      },
      {
        "seed": 7,
        "score": 8,
        "steps": 809,
        "done": true,
        "life_lost": 5,
        "paddle": 3,
        "brick": 8,
        "wall": 14
      },
      {
        "seed": 8,
        "score": 8,
        "steps": 810,
        "done": true,
        "life_lost": 5,
        "paddle": 3,
        "brick": 8,
        "wall": 14
      },
      {
        "seed": 9,
        "score": 8,
        "steps": 809,
        "done": true,
        "life_lost": 5,
        "paddle": 3,
        "brick": 8,
        "wall": 12
      }
    ]
  },
  "atari": {
    "distribution": {
      "count": 10,
      "scores": [
        2.0,
        13.0,
        13.0,
        13.0,
        0.0,
        13.0,
        2.0,
        13.0,
        2.0,
        0.0
      ],
      "mean": 7.1,
      "median": 7.5,
      "min": 0.0,
      "max": 13.0,
      "stdev": 5.940538696111658,
      "low_score_count_lt3": 5
    },
    "steps_mean": 1800.0,
    "done_count": 0,
    "rows": [
      {
        "episode": 0,
        "seed": 0,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "reward_count": 2,
        "first_reward_step": 32,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 1,
        "seed": 1,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 2,
        "seed": 2,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 3,
        "seed": 3,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 4,
        "seed": 4,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "reward_count": 0,
        "first_reward_step": null,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 5,
        "seed": 5,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 6,
        "seed": 6,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "reward_count": 2,
        "first_reward_step": 32,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 7,
        "seed": 7,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 8,
        "seed": 8,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "reward_count": 2,
        "first_reward_step": 32,
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 9,
        "seed": 9,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "reward_count": 0,
        "first_reward_step": null,
        "life_ram_start": 5,
        "life_ram_end": 4
      }
    ]
  },
  "first10_before_after": {
    "baseline_go_scores": [
      8.0,
      7.0,
      6.0,
      7.0,
      7.0,
      8.0,
      7.0,
      9.0,
      8.0,
      7.0
    ],
    "baseline_go_mean_first10": 7.4,
    "baseline_atari_scores": [
      2.0,
      13.0,
      13.0,
      13.0,
      0.0,
      13.0,
      2.0,
      13.0,
      2.0,
      0.0
    ],
    "baseline_atari_mean_first10": 7.1,
    "after_go_scores": [
      7.0,
      6.0,
      15.0,
      13.0,
      9.0,
      7.0,
      9.0,
      8.0,
      8.0,
      8.0
    ],
    "after_go_mean": 9.0,
    "atari_scores": [
      2.0,
      13.0,
      13.0,
      13.0,
      0.0,
      13.0,
      2.0,
      13.0,
      2.0,
      0.0
    ],
    "atari_mean": 7.1
  },
  "seed_score_correlation": {
    "baseline_first10_pearson": 0.033666980425682384,
    "baseline_first10_spearman": 0.07829231071388786,
    "after_first10_pearson": 0.28857968451109245,
    "after_first10_spearman": -0.05026881914166648,
    "interpretation": "Seed-level Go-vs-Atari score correlation over the first 10 seeds is only a proxy; it is not candidate-rank correlation. Candidate-rank correlation must be remeasured in the next Atari-gated search goal."
  }
}

decision

{
  "modified_go_env": true,
  "handled_gap_count": 3,
  "remaining_primary_risk": "Episode horizon/life-reset semantics still differ; Go episodes still terminate before the Atari 1800-step horizon on all 10 evaluated seeds (longest: 1481 steps).",
  "rank_correlation_status": "Seed-level correlation is reported as a proxy; candidate-rank correlation is not re-estimated because this goal validates one frozen policy, not a candidate set.",
  "next_recommendation": "Run Atari-gated search on this env only after WSL validation; if Atari transfer worsens, either revert the paddle/bounce/cooldown changes or test each one in isolation."
}

evidence

{
  "r2_video_url": "https://file.aimusic.win/gemma/breakout/env_fidelity_fix_v3_atari_seed1.mp4"
}