Breakout Env Gap Audit v1
objective
{
"name": "Breakout env gap audit v1",
"policy": "breakout_policy_v3",
"go_episodes": "breakout/runs-policy-search-v2/episodes.jsonl",
"atari_trace": "breakout/calibration/traces/search_harness_v1_atari_best.jsonl"
}
comparison
{
"go": {
"episodes": 6,
"score": {
"scores": [
8.0,
7.0,
6.0,
7.0,
7.0,
8.0
],
"mean": 7.166666666666667,
"stdev": 0.6871842709362768,
"min": 6.0,
"max": 8.0
},
"rows": [
{
"episode": 0,
"seed": 0,
"score": 8,
"steps": 505,
"done": true,
"reward_count": 8,
"first_reward_step": 40,
"paddle_abs_delta_mean": 5.801587301587301,
"paddle_abs_delta_max": 10.0,
"ball_abs_vx_mean": 3.0638613861386137,
"ball_abs_vy_mean": 3.8049504950495048,
"ball_x_range": [
0.0,
157.0
],
"ball_y_range": [
71.5,
211.75
]
},
{
"episode": 1,
"seed": 1,
"score": 7,
"steps": 492,
"done": true,
"reward_count": 7,
"first_reward_step": 40,
"paddle_abs_delta_mean": 5.181262729124236,
"paddle_abs_delta_max": 10.0,
"ball_abs_vx_mean": 3.1864837398373984,
"ball_abs_vy_mean": 3.4791666666666665,
"ball_x_range": [
0.0,
157.0
],
"ball_y_range": [
66.0,
218.0
]
},
{
"episode": 2,
"seed": 2,
"score": 6,
"steps": 467,
"done": true,
"reward_count": 6,
"first_reward_step": 40,
"paddle_abs_delta_mean": 5.76824034334764,
"paddle_abs_delta_max": 10.0,
"ball_abs_vx_mean": 3.3228051391862956,
"ball_abs_vy_mean": 3.1145610278372593,
"ball_x_range": [
0.0,
157.0
],
"ball_y_range": [
68.75,
211.75
]
},
{
"episode": 3,
"seed": 3,
"score": 7,
"steps": 483,
"done": true,
"reward_count": 7,
"first_reward_step": 40,
"paddle_abs_delta_mean": 5.518672199170124,
"paddle_abs_delta_max": 10.0,
"ball_abs_vx_mean": 3.1749482401656315,
"ball_abs_vy_mean": 3.5077639751552794,
"ball_x_range": [
0.0,
157.0
],
"ball_y_range": [
68.0,
211.75
]
},
{
"episode": 4,
"seed": 4,
"score": 7,
"steps": 482,
"done": true,
"reward_count": 7,
"first_reward_step": 40,
"paddle_abs_delta_mean": 5.609147609147609,
"paddle_abs_delta_max": 10.0,
"ball_abs_vx_mean": 3.1856846473029043,
"ball_abs_vy_mean": 3.479253112033195,
"ball_x_range": [
0.0,
157.0
],
"ball_y_range": [
71.5,
211.75
]
},
{
"episode": 5,
"seed": 5,
"score": 8,
"steps": 505,
"done": true,
"reward_count": 8,
"first_reward_step": 40,
"paddle_abs_delta_mean": 5.801587301587301,
"paddle_abs_delta_max": 10.0,
"ball_abs_vx_mean": 3.0638613861386137,
"ball_abs_vy_mean": 3.8049504950495048,
"ball_x_range": [
0.0,
157.0
],
"ball_y_range": [
71.5,
211.75
]
}
],
"done_count": 6,
"steps_mean": 489.0
},
"atari": {
"episodes": 6,
"score": {
"scores": [
2.0,
13.0,
13.0,
13.0,
0.0,
13.0
],
"mean": 9.0,
"stdev": 5.686240703077327,
"min": 0.0,
"max": 13.0
},
"rows": [
{
"episode": 0,
"seed": 0,
"score": 2.0,
"steps": 1800,
"done": false,
"reward_count": 2,
"first_reward_step": 32,
"paddle_abs_delta_mean": 0.575319622012229,
"paddle_abs_delta_max": 24.0,
"ball_abs_vx_mean": 0.2784880489160645,
"ball_abs_vy_mean": 0.25472747497219134,
"ball_x_range": [
57.0,
206.0
],
"ball_y_range": [
0.0,
207.0
],
"life_ram_start": 5,
"life_ram_end": 4
},
{
"episode": 1,
"seed": 1,
"score": 13.0,
"steps": 1800,
"done": false,
"reward_count": 13,
"first_reward_step": 40,
"paddle_abs_delta_mean": 11.436909394107838,
"paddle_abs_delta_max": 24.0,
"ball_abs_vx_mean": 1.7237354085603114,
"ball_abs_vy_mean": 1.4021134593993325,
"ball_x_range": [
56.0,
200.0
],
"ball_y_range": [
0.0,
206.0
],
"life_ram_start": 5,
"life_ram_end": 4
},
{
"episode": 2,
"seed": 2,
"score": 13.0,
"steps": 1800,
"done": false,
"reward_count": 13,
"first_reward_step": 40,
"paddle_abs_delta_mean": 11.436909394107838,
"paddle_abs_delta_max": 24.0,
"ball_abs_vx_mean": 1.7237354085603114,
"ball_abs_vy_mean": 1.4021134593993325,
"ball_x_range": [
56.0,
200.0
],
"ball_y_range": [
0.0,
206.0
],
"life_ram_start": 5,
"life_ram_end": 4
},
{
"episode": 3,
"seed": 3,
"score": 13.0,
"steps": 1800,
"done": false,
"reward_count": 13,
"first_reward_step": 40,
"paddle_abs_delta_mean": 11.436909394107838,
"paddle_abs_delta_max": 24.0,
"ball_abs_vx_mean": 1.7237354085603114,
"ball_abs_vy_mean": 1.4021134593993325,
"ball_x_range": [
56.0,
200.0
],
"ball_y_range": [
0.0,
206.0
],
"life_ram_start": 5,
"life_ram_end": 4
},
{
"episode": 4,
"seed": 4,
"score": 0.0,
"steps": 1800,
"done": false,
"reward_count": 0,
"first_reward_step": null,
"paddle_abs_delta_mean": 12.503057254030017,
"paddle_abs_delta_max": 24.0,
"ball_abs_vx_mean": 0.0528071150639244,
"ball_abs_vy_mean": 0.05116796440489433,
"ball_x_range": [
96.0,
191.0
],
"ball_y_range": [
0.0,
205.0
],
"life_ram_start": 5,
"life_ram_end": 4
},
{
"episode": 5,
"seed": 5,
"score": 13.0,
"steps": 1800,
"done": false,
"reward_count": 13,
"first_reward_step": 40,
"paddle_abs_delta_mean": 11.436909394107838,
"paddle_abs_delta_max": 24.0,
"ball_abs_vx_mean": 1.7237354085603114,
"ball_abs_vy_mean": 1.4021134593993325,
"ball_x_range": [
56.0,
200.0
],
"ball_y_range": [
0.0,
206.0
],
"life_ram_start": 5,
"life_ram_end": 4
}
],
"done_count": 0,
"steps_mean": 1800.0
},
"paired_by_seed": [
{
"seed": 0,
"go_score": 8,
"atari_score": 2.0,
"score_delta_atari_minus_go": -6.0,
"go_steps": 505,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 32,
"go_reward_count": 8,
"atari_reward_count": 2
},
{
"seed": 1,
"go_score": 7,
"atari_score": 13.0,
"score_delta_atari_minus_go": 6.0,
"go_steps": 492,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 40,
"go_reward_count": 7,
"atari_reward_count": 13
},
{
"seed": 2,
"go_score": 6,
"atari_score": 13.0,
"score_delta_atari_minus_go": 7.0,
"go_steps": 467,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 40,
"go_reward_count": 6,
"atari_reward_count": 13
},
{
"seed": 3,
"go_score": 7,
"atari_score": 13.0,
"score_delta_atari_minus_go": 6.0,
"go_steps": 483,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 40,
"go_reward_count": 7,
"atari_reward_count": 13
},
{
"seed": 4,
"go_score": 7,
"atari_score": 0.0,
"score_delta_atari_minus_go": -7.0,
"go_steps": 482,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": null,
"go_reward_count": 7,
"atari_reward_count": 0
},
{
"seed": 5,
"go_score": 8,
"atari_score": 13.0,
"score_delta_atari_minus_go": 5.0,
"go_steps": 505,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 40,
"go_reward_count": 8,
"atari_reward_count": 13
}
]
}
gap_audit
[
{
"gap": "Go/Atari candidate ranking mismatch",
"evidence": {
"pearson": -0.32584242221528265,
"spearman": -0.13989092759813318,
"sample_size": 8
},
"impact_direction": "Go score can promote candidates that Atari ranks poorly; Atari validation remains mandatory.",
"confidence": "high"
},
{
"gap": "Score distribution and seed variance mismatch",
"evidence": {
"go": {
"scores": [
8.0,
7.0,
6.0,
7.0,
7.0,
8.0
],
"mean": 7.166666666666667,
"stdev": 0.6871842709362768,
"min": 6.0,
"max": 8.0
},
"atari": {
"scores": [
2.0,
13.0,
13.0,
13.0,
0.0,
13.0
],
"mean": 9.0,
"stdev": 5.686240703077327,
"min": 0.0,
"max": 13.0
},
"paired_scores": [
{
"seed": 0,
"go_score": 8,
"atari_score": 2.0,
"score_delta_atari_minus_go": -6.0,
"go_steps": 505,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 32,
"go_reward_count": 8,
"atari_reward_count": 2
},
{
"seed": 1,
"go_score": 7,
"atari_score": 13.0,
"score_delta_atari_minus_go": 6.0,
"go_steps": 492,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 40,
"go_reward_count": 7,
"atari_reward_count": 13
},
{
"seed": 2,
"go_score": 6,
"atari_score": 13.0,
"score_delta_atari_minus_go": 7.0,
"go_steps": 467,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 40,
"go_reward_count": 6,
"atari_reward_count": 13
},
{
"seed": 3,
"go_score": 7,
"atari_score": 13.0,
"score_delta_atari_minus_go": 6.0,
"go_steps": 483,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 40,
"go_reward_count": 7,
"atari_reward_count": 13
},
{
"seed": 4,
"go_score": 7,
"atari_score": 0.0,
"score_delta_atari_minus_go": -7.0,
"go_steps": 482,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": null,
"go_reward_count": 7,
"atari_reward_count": 0
},
{
"seed": 5,
"go_score": 8,
"atari_score": 13.0,
"score_delta_atari_minus_go": 5.0,
"go_steps": 505,
"atari_steps": 1800,
"go_first_reward_step": 40,
"atari_first_reward_step": 40,
"go_reward_count": 8,
"atari_reward_count": 13
}
]
},
"impact_direction": "Atari has much higher variance, with high-scoring seeds that Go does not reproduce, so the Go mean underestimates some Atari candidates while also failing to reproduce the low-scoring Atari seeds (seeds 0 and 4).",
"confidence": "high"
},
{
"gap": "Episode horizon / life-reset termination mismatch",
"evidence": {
"go_done_count": 6,
"atari_done_count": 0,
"go_steps_mean": 489.0,
"atari_steps_mean": 1800.0
},
"impact_direction": "Go episodes terminate early (done after 467-505 steps) while Atari traces run the full 1800-step evaluation horizon without terminating, changing late-rally exposure and score opportunity.",
"confidence": "high"
},
{
"gap": "Ball speed and collision rhythm mismatch",
"evidence": {
"go_abs_vx_mean": 3.1662740897949093,
"atari_abs_vx_mean": 1.204372799703539,
"go_abs_vy_mean": 3.5317742952985682,
"atari_abs_vy_mean": 0.985724879495736
},
"impact_direction": "Different observed velocity scales/rhythms make lead and intercept policies transfer unevenly after paddle or brick contacts.",
"confidence": "medium"
},
{
"gap": "Paddle/action dynamics mismatch",
"evidence": {
"go_paddle_abs_delta_mean": 5.613416247327368,
"atari_paddle_abs_delta_mean": 9.804335742078933
},
"impact_direction": "The same action stream can produce different paddle displacement/lag, especially around near misses and on the first active frames after a serve reset.",
"confidence": "medium"
},
{
"gap": "Brick-state / score RAM abstraction mismatch",
"evidence": {
"go_reward_counts": [
8,
7,
6,
7,
7,
8
],
"atari_reward_counts": [
2,
13,
13,
13,
0,
13
],
"atari_life_ram": [
[
5,
4
],
[
5,
4
],
[
5,
4
],
[
5,
4
],
[
5,
4
],
[
5,
4
]
]
},
"impact_direction": "Reward count and RAM life/score progression do not map cleanly to the Go brick grid yet, so score improvements in Go do not guarantee Atari brick progress.",
"confidence": "medium"
}
]
decision
{
"modified_go_env": false,
"reason_no_env_change": "This audit finds multiple plausible gaps, but does not isolate a single env change with a before/after metric. Per goal criteria, no Go env change is applied without targeted evidence.",
"primary_failure_class": "env_gap",
"secondary_failure_class": "policy_timing",
"next_recommendation": "Run targeted audits for action repeat/lag and terminal/life-reset semantics before changing physics or brick parameters."
}
evidence
{
"r2_video_url": "https://file.aimusic.win/gemma/breakout/search_harness_v1_atari_best.mp4"
}