Breakout Env Gap Audit v1

objective

{
  "name": "Breakout env gap audit v1",
  "policy": "breakout_policy_v3",
  "go_episodes": "breakout/runs-policy-search-v2/episodes.jsonl",
  "atari_trace": "breakout/calibration/traces/search_harness_v1_atari_best.jsonl"
}

comparison

{
  "go": {
    "episodes": 6,
    "score": {
      "scores": [
        8.0,
        7.0,
        6.0,
        7.0,
        7.0,
        8.0
      ],
      "mean": 7.166666666666667,
      "stdev": 0.6871842709362768,
      "min": 6.0,
      "max": 8.0
    },
    "rows": [
      {
        "episode": 0,
        "seed": 0,
        "score": 8,
        "steps": 505,
        "done": true,
        "reward_count": 8,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 5.801587301587301,
        "paddle_abs_delta_max": 10.0,
        "ball_abs_vx_mean": 3.0638613861386137,
        "ball_abs_vy_mean": 3.8049504950495048,
        "ball_x_range": [
          0.0,
          157.0
        ],
        "ball_y_range": [
          71.5,
          211.75
        ]
      },
      {
        "episode": 1,
        "seed": 1,
        "score": 7,
        "steps": 492,
        "done": true,
        "reward_count": 7,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 5.181262729124236,
        "paddle_abs_delta_max": 10.0,
        "ball_abs_vx_mean": 3.1864837398373984,
        "ball_abs_vy_mean": 3.4791666666666665,
        "ball_x_range": [
          0.0,
          157.0
        ],
        "ball_y_range": [
          66.0,
          218.0
        ]
      },
      {
        "episode": 2,
        "seed": 2,
        "score": 6,
        "steps": 467,
        "done": true,
        "reward_count": 6,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 5.76824034334764,
        "paddle_abs_delta_max": 10.0,
        "ball_abs_vx_mean": 3.3228051391862956,
        "ball_abs_vy_mean": 3.1145610278372593,
        "ball_x_range": [
          0.0,
          157.0
        ],
        "ball_y_range": [
          68.75,
          211.75
        ]
      },
      {
        "episode": 3,
        "seed": 3,
        "score": 7,
        "steps": 483,
        "done": true,
        "reward_count": 7,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 5.518672199170124,
        "paddle_abs_delta_max": 10.0,
        "ball_abs_vx_mean": 3.1749482401656315,
        "ball_abs_vy_mean": 3.5077639751552794,
        "ball_x_range": [
          0.0,
          157.0
        ],
        "ball_y_range": [
          68.0,
          211.75
        ]
      },
      {
        "episode": 4,
        "seed": 4,
        "score": 7,
        "steps": 482,
        "done": true,
        "reward_count": 7,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 5.609147609147609,
        "paddle_abs_delta_max": 10.0,
        "ball_abs_vx_mean": 3.1856846473029043,
        "ball_abs_vy_mean": 3.479253112033195,
        "ball_x_range": [
          0.0,
          157.0
        ],
        "ball_y_range": [
          71.5,
          211.75
        ]
      },
      {
        "episode": 5,
        "seed": 5,
        "score": 8,
        "steps": 505,
        "done": true,
        "reward_count": 8,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 5.801587301587301,
        "paddle_abs_delta_max": 10.0,
        "ball_abs_vx_mean": 3.0638613861386137,
        "ball_abs_vy_mean": 3.8049504950495048,
        "ball_x_range": [
          0.0,
          157.0
        ],
        "ball_y_range": [
          71.5,
          211.75
        ]
      }
    ],
    "done_count": 6,
    "steps_mean": 489.0
  },
  "atari": {
    "episodes": 6,
    "score": {
      "scores": [
        2.0,
        13.0,
        13.0,
        13.0,
        0.0,
        13.0
      ],
      "mean": 9.0,
      "stdev": 5.686240703077327,
      "min": 0.0,
      "max": 13.0
    },
    "rows": [
      {
        "episode": 0,
        "seed": 0,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "reward_count": 2,
        "first_reward_step": 32,
        "paddle_abs_delta_mean": 0.575319622012229,
        "paddle_abs_delta_max": 24.0,
        "ball_abs_vx_mean": 0.2784880489160645,
        "ball_abs_vy_mean": 0.25472747497219134,
        "ball_x_range": [
          57.0,
          206.0
        ],
        "ball_y_range": [
          0.0,
          207.0
        ],
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 1,
        "seed": 1,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 11.436909394107838,
        "paddle_abs_delta_max": 24.0,
        "ball_abs_vx_mean": 1.7237354085603114,
        "ball_abs_vy_mean": 1.4021134593993325,
        "ball_x_range": [
          56.0,
          200.0
        ],
        "ball_y_range": [
          0.0,
          206.0
        ],
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 2,
        "seed": 2,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 11.436909394107838,
        "paddle_abs_delta_max": 24.0,
        "ball_abs_vx_mean": 1.7237354085603114,
        "ball_abs_vy_mean": 1.4021134593993325,
        "ball_x_range": [
          56.0,
          200.0
        ],
        "ball_y_range": [
          0.0,
          206.0
        ],
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 3,
        "seed": 3,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 11.436909394107838,
        "paddle_abs_delta_max": 24.0,
        "ball_abs_vx_mean": 1.7237354085603114,
        "ball_abs_vy_mean": 1.4021134593993325,
        "ball_x_range": [
          56.0,
          200.0
        ],
        "ball_y_range": [
          0.0,
          206.0
        ],
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 4,
        "seed": 4,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "reward_count": 0,
        "first_reward_step": null,
        "paddle_abs_delta_mean": 12.503057254030017,
        "paddle_abs_delta_max": 24.0,
        "ball_abs_vx_mean": 0.0528071150639244,
        "ball_abs_vy_mean": 0.05116796440489433,
        "ball_x_range": [
          96.0,
          191.0
        ],
        "ball_y_range": [
          0.0,
          205.0
        ],
        "life_ram_start": 5,
        "life_ram_end": 4
      },
      {
        "episode": 5,
        "seed": 5,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "reward_count": 13,
        "first_reward_step": 40,
        "paddle_abs_delta_mean": 11.436909394107838,
        "paddle_abs_delta_max": 24.0,
        "ball_abs_vx_mean": 1.7237354085603114,
        "ball_abs_vy_mean": 1.4021134593993325,
        "ball_x_range": [
          56.0,
          200.0
        ],
        "ball_y_range": [
          0.0,
          206.0
        ],
        "life_ram_start": 5,
        "life_ram_end": 4
      }
    ],
    "done_count": 0,
    "steps_mean": 1800.0
  },
  "paired_by_seed": [
    {
      "seed": 0,
      "go_score": 8,
      "atari_score": 2.0,
      "score_delta_atari_minus_go": -6.0,
      "go_steps": 505,
      "atari_steps": 1800,
      "go_first_reward_step": 40,
      "atari_first_reward_step": 32,
      "go_reward_count": 8,
      "atari_reward_count": 2
    },
    {
      "seed": 1,
      "go_score": 7,
      "atari_score": 13.0,
      "score_delta_atari_minus_go": 6.0,
      "go_steps": 492,
      "atari_steps": 1800,
      "go_first_reward_step": 40,
      "atari_first_reward_step": 40,
      "go_reward_count": 7,
      "atari_reward_count": 13
    },
    {
      "seed": 2,
      "go_score": 6,
      "atari_score": 13.0,
      "score_delta_atari_minus_go": 7.0,
      "go_steps": 467,
      "atari_steps": 1800,
      "go_first_reward_step": 40,
      "atari_first_reward_step": 40,
      "go_reward_count": 6,
      "atari_reward_count": 13
    },
    {
      "seed": 3,
      "go_score": 7,
      "atari_score": 13.0,
      "score_delta_atari_minus_go": 6.0,
      "go_steps": 483,
      "atari_steps": 1800,
      "go_first_reward_step": 40,
      "atari_first_reward_step": 40,
      "go_reward_count": 7,
      "atari_reward_count": 13
    },
    {
      "seed": 4,
      "go_score": 7,
      "atari_score": 0.0,
      "score_delta_atari_minus_go": -7.0,
      "go_steps": 482,
      "atari_steps": 1800,
      "go_first_reward_step": 40,
      "atari_first_reward_step": null,
      "go_reward_count": 7,
      "atari_reward_count": 0
    },
    {
      "seed": 5,
      "go_score": 8,
      "atari_score": 13.0,
      "score_delta_atari_minus_go": 5.0,
      "go_steps": 505,
      "atari_steps": 1800,
      "go_first_reward_step": 40,
      "atari_first_reward_step": 40,
      "go_reward_count": 8,
      "atari_reward_count": 13
    }
  ]
}

gap_audit

[
  {
    "gap": "Go/Atari candidate ranking mismatch",
    "evidence": {
      "pearson": -0.32584242221528265,
      "spearman": -0.13989092759813318,
      "sample_size": 8
    },
    "impact_direction": "Go score can promote candidates that Atari ranks poorly; Atari validation remains mandatory.",
    "confidence": "high"
  },
  {
    "gap": "Score distribution and seed variance mismatch",
    "evidence": {
      "go": {
        "scores": [
          8.0,
          7.0,
          6.0,
          7.0,
          7.0,
          8.0
        ],
        "mean": 7.166666666666667,
        "stdev": 0.6871842709362768,
        "min": 6.0,
        "max": 8.0
      },
      "atari": {
        "scores": [
          2.0,
          13.0,
          13.0,
          13.0,
          0.0,
          13.0
        ],
        "mean": 9.0,
        "stdev": 5.686240703077327,
        "min": 0.0,
        "max": 13.0
      },
      "paired_scores": [
        {
          "seed": 0,
          "go_score": 8,
          "atari_score": 2.0,
          "score_delta_atari_minus_go": -6.0,
          "go_steps": 505,
          "atari_steps": 1800,
          "go_first_reward_step": 40,
          "atari_first_reward_step": 32,
          "go_reward_count": 8,
          "atari_reward_count": 2
        },
        {
          "seed": 1,
          "go_score": 7,
          "atari_score": 13.0,
          "score_delta_atari_minus_go": 6.0,
          "go_steps": 492,
          "atari_steps": 1800,
          "go_first_reward_step": 40,
          "atari_first_reward_step": 40,
          "go_reward_count": 7,
          "atari_reward_count": 13
        },
        {
          "seed": 2,
          "go_score": 6,
          "atari_score": 13.0,
          "score_delta_atari_minus_go": 7.0,
          "go_steps": 467,
          "atari_steps": 1800,
          "go_first_reward_step": 40,
          "atari_first_reward_step": 40,
          "go_reward_count": 6,
          "atari_reward_count": 13
        },
        {
          "seed": 3,
          "go_score": 7,
          "atari_score": 13.0,
          "score_delta_atari_minus_go": 6.0,
          "go_steps": 483,
          "atari_steps": 1800,
          "go_first_reward_step": 40,
          "atari_first_reward_step": 40,
          "go_reward_count": 7,
          "atari_reward_count": 13
        },
        {
          "seed": 4,
          "go_score": 7,
          "atari_score": 0.0,
          "score_delta_atari_minus_go": -7.0,
          "go_steps": 482,
          "atari_steps": 1800,
          "go_first_reward_step": 40,
          "atari_first_reward_step": null,
          "go_reward_count": 7,
          "atari_reward_count": 0
        },
        {
          "seed": 5,
          "go_score": 8,
          "atari_score": 13.0,
          "score_delta_atari_minus_go": 5.0,
          "go_steps": 505,
          "atari_steps": 1800,
          "go_first_reward_step": 40,
          "atari_first_reward_step": 40,
          "go_reward_count": 8,
          "atari_reward_count": 13
        }
      ]
    },
    "impact_direction": "Atari has much higher seed-to-seed variance than Go, with both high-scoring seeds (13) and low-scoring seeds (0 and 2) that Go does not reproduce, so the Go mean underestimates some Atari candidates while also failing to expose the low-scoring seeds.",
    "confidence": "high"
  },
  {
    "gap": "Episode horizon / life-reset termination mismatch",
    "evidence": {
      "go_done_count": 6,
      "atari_done_count": 0,
      "go_steps_mean": 489.0,
      "atari_steps_mean": 1800.0
    },
    "impact_direction": "Go episodes terminate early at varying lengths (467-505 steps, all 6 done) while Atari traces run to the 1800-step evaluation horizon without terminating (0 done), changing late-rally exposure and score opportunity.",
    "confidence": "high"
  },
  {
    "gap": "Ball speed and collision rhythm mismatch",
    "evidence": {
      "go_abs_vx_mean": 3.1662740897949093,
      "atari_abs_vx_mean": 1.204372799703539,
      "go_abs_vy_mean": 3.5317742952985682,
      "atari_abs_vy_mean": 0.985724879495736
    },
    "impact_direction": "Different observed velocity scales/rhythms make lead and intercept policies transfer unevenly after paddle or brick contacts.",
    "confidence": "medium"
  },
  {
    "gap": "Paddle/action dynamics mismatch",
    "evidence": {
      "go_paddle_abs_delta_mean": 5.613416247327368,
      "atari_paddle_abs_delta_mean": 9.804335742078933
    },
    "impact_direction": "The same action stream can produce different paddle displacement/lag, especially near misses and serve-reset first active frames.",
    "confidence": "medium"
  },
  {
    "gap": "Brick-state / score RAM abstraction mismatch",
    "evidence": {
      "go_reward_counts": [
        8,
        7,
        6,
        7,
        7,
        8
      ],
      "atari_reward_counts": [
        2,
        13,
        13,
        13,
        0,
        13
      ],
      "atari_life_ram": [
        [
          5,
          4
        ],
        [
          5,
          4
        ],
        [
          5,
          4
        ],
        [
          5,
          4
        ],
        [
          5,
          4
        ],
        [
          5,
          4
        ]
      ]
    },
    "impact_direction": "Reward count and RAM life/score progression do not map cleanly to the Go brick grid yet, so score improvements in Go do not guarantee Atari brick progress.",
    "confidence": "medium"
  }
]

decision

{
  "modified_go_env": false,
  "reason_no_env_change": "This audit finds multiple plausible gaps, but does not isolate a single env change with a before/after metric. Per goal criteria, no Go env change is applied without targeted evidence.",
  "primary_failure_class": "env_gap",
  "secondary_failure_class": "policy_timing",
  "next_recommendation": "Run targeted audits for action repeat/lag and terminal/life-reset semantics before changing physics or brick parameters."
}

evidence

{
  "r2_video_url": "https://file.aimusic.win/gemma/breakout/search_harness_v1_atari_best.mp4"
}