Breakout Robust Policy Benchmark v1

objective

{
  "name": "Breakout robust policy benchmark v1",
  "frozen_policy": "breakout_policy_v3",
  "frozen_env": "current Go shadow env after env_gap_audit_v1",
  "selection_reason": "breakout_policy_v3 has the best prior Atari six-seed mean; planner_v1 reduced low-score count but lowered mean."
}

benchmark

{
  "go": {
    "distribution": {
      "count": 20,
      "scores": [
        8.0,
        7.0,
        6.0,
        7.0,
        7.0,
        8.0,
        7.0,
        9.0,
        8.0,
        7.0,
        9.0,
        7.0,
        7.0,
        6.0,
        7.0,
        7.0,
        7.0,
        6.0,
        7.0,
        7.0
      ],
      "mean": 7.2,
      "median": 7.0,
      "min": 6.0,
      "max": 9.0,
      "stdev": 0.812403840463596,
      "low_score_threshold": 3.0,
      "low_score_count": 0
    },
    "rows": [
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 0,
        "episode": 0,
        "score": 8,
        "steps": 505,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep000_seed0.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 1,
        "episode": 1,
        "score": 7,
        "steps": 492,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep001_seed1.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 2,
        "episode": 2,
        "score": 6,
        "steps": 467,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep002_seed2.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 3,
        "episode": 3,
        "score": 7,
        "steps": 483,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep003_seed3.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 4,
        "episode": 4,
        "score": 7,
        "steps": 482,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep004_seed4.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 5,
        "episode": 5,
        "score": 8,
        "steps": 505,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep005_seed5.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 6,
        "episode": 6,
        "score": 7,
        "steps": 485,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep006_seed6.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 7,
        "episode": 7,
        "score": 9,
        "steps": 516,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep007_seed7.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 8,
        "episode": 8,
        "score": 8,
        "steps": 499,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep008_seed8.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 9,
        "episode": 9,
        "score": 7,
        "steps": 488,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep009_seed9.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 10,
        "episode": 10,
        "score": 9,
        "steps": 516,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep010_seed10.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 11,
        "episode": 11,
        "score": 7,
        "steps": 483,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep011_seed11.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 12,
        "episode": 12,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep012_seed12.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 13,
        "episode": 13,
        "score": 6,
        "steps": 465,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep013_seed13.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 14,
        "episode": 14,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep014_seed14.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 15,
        "episode": 15,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep015_seed15.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 16,
        "episode": 16,
        "score": 7,
        "steps": 477,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep016_seed16.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 17,
        "episode": 17,
        "score": 6,
        "steps": 468,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep017_seed17.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 18,
        "episode": 18,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep018_seed18.jsonl"
      },
      {
        "timestamp": "20260510T131817Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_v3",
        "seed": 19,
        "episode": 19,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "breakout/runs-robust-policy-benchmark-v1/replays/20260510T131817Z_ep019_seed19.jsonl"
      }
    ],
    "low_score_seeds": []
  },
  "atari": {
    "distribution": {
      "count": 20,
      "scores": [
        2.0,
        13.0,
        13.0,
        13.0,
        0.0,
        13.0,
        2.0,
        13.0,
        2.0,
        0.0,
        0.0,
        0.0,
        13.0,
        2.0,
        0.0,
        13.0,
        2.0,
        2.0,
        0.0,
        0.0
      ],
      "mean": 5.15,
      "median": 2.0,
      "min": 0.0,
      "max": 13.0,
      "stdev": 5.816141332533109,
      "low_score_threshold": 3.0,
      "low_score_count": 13
    },
    "rows": [
      {
        "episode": 0,
        "seed": 0,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 32,
        "reward_count": 2
      },
      {
        "episode": 1,
        "seed": 1,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 40,
        "reward_count": 13
      },
      {
        "episode": 2,
        "seed": 2,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 40,
        "reward_count": 13
      },
      {
        "episode": 3,
        "seed": 3,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 40,
        "reward_count": 13
      },
      {
        "episode": 4,
        "seed": 4,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": null,
        "reward_count": 0
      },
      {
        "episode": 5,
        "seed": 5,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 40,
        "reward_count": 13
      },
      {
        "episode": 6,
        "seed": 6,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 32,
        "reward_count": 2
      },
      {
        "episode": 7,
        "seed": 7,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 40,
        "reward_count": 13
      },
      {
        "episode": 8,
        "seed": 8,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 32,
        "reward_count": 2
      },
      {
        "episode": 9,
        "seed": 9,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": null,
        "reward_count": 0
      },
      {
        "episode": 10,
        "seed": 10,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": null,
        "reward_count": 0
      },
      {
        "episode": 11,
        "seed": 11,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": null,
        "reward_count": 0
      },
      {
        "episode": 12,
        "seed": 12,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 40,
        "reward_count": 13
      },
      {
        "episode": 13,
        "seed": 13,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 32,
        "reward_count": 2
      },
      {
        "episode": 14,
        "seed": 14,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": null,
        "reward_count": 0
      },
      {
        "episode": 15,
        "seed": 15,
        "score": 13.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 40,
        "reward_count": 13
      },
      {
        "episode": 16,
        "seed": 16,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 32,
        "reward_count": 2
      },
      {
        "episode": 17,
        "seed": 17,
        "score": 2.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": 32,
        "reward_count": 2
      },
      {
        "episode": 18,
        "seed": 18,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": null,
        "reward_count": 0
      },
      {
        "episode": 19,
        "seed": 19,
        "score": 0.0,
        "steps": 1800,
        "done": false,
        "first_reward_step": null,
        "reward_count": 0
      }
    ],
    "low_score_seeds": [
      0,
      4,
      6,
      8,
      9,
      10,
      11,
      13,
      14,
      16,
      17,
      18,
      19
    ]
  },
  "paired_by_seed": [
    {
      "seed": 0,
      "go_score": 8.0,
      "atari_score": 2.0,
      "delta_atari_minus_go": -6.0
    },
    {
      "seed": 1,
      "go_score": 7.0,
      "atari_score": 13.0,
      "delta_atari_minus_go": 6.0
    },
    {
      "seed": 2,
      "go_score": 6.0,
      "atari_score": 13.0,
      "delta_atari_minus_go": 7.0
    },
    {
      "seed": 3,
      "go_score": 7.0,
      "atari_score": 13.0,
      "delta_atari_minus_go": 6.0
    },
    {
      "seed": 4,
      "go_score": 7.0,
      "atari_score": 0.0,
      "delta_atari_minus_go": -7.0
    },
    {
      "seed": 5,
      "go_score": 8.0,
      "atari_score": 13.0,
      "delta_atari_minus_go": 5.0
    },
    {
      "seed": 6,
      "go_score": 7.0,
      "atari_score": 2.0,
      "delta_atari_minus_go": -5.0
    },
    {
      "seed": 7,
      "go_score": 9.0,
      "atari_score": 13.0,
      "delta_atari_minus_go": 4.0
    },
    {
      "seed": 8,
      "go_score": 8.0,
      "atari_score": 2.0,
      "delta_atari_minus_go": -6.0
    },
    {
      "seed": 9,
      "go_score": 7.0,
      "atari_score": 0.0,
      "delta_atari_minus_go": -7.0
    },
    {
      "seed": 10,
      "go_score": 9.0,
      "atari_score": 0.0,
      "delta_atari_minus_go": -9.0
    },
    {
      "seed": 11,
      "go_score": 7.0,
      "atari_score": 0.0,
      "delta_atari_minus_go": -7.0
    },
    {
      "seed": 12,
      "go_score": 7.0,
      "atari_score": 13.0,
      "delta_atari_minus_go": 6.0
    },
    {
      "seed": 13,
      "go_score": 6.0,
      "atari_score": 2.0,
      "delta_atari_minus_go": -4.0
    },
    {
      "seed": 14,
      "go_score": 7.0,
      "atari_score": 0.0,
      "delta_atari_minus_go": -7.0
    },
    {
      "seed": 15,
      "go_score": 7.0,
      "atari_score": 13.0,
      "delta_atari_minus_go": 6.0
    },
    {
      "seed": 16,
      "go_score": 7.0,
      "atari_score": 2.0,
      "delta_atari_minus_go": -5.0
    },
    {
      "seed": 17,
      "go_score": 6.0,
      "atari_score": 2.0,
      "delta_atari_minus_go": -4.0
    },
    {
      "seed": 18,
      "go_score": 7.0,
      "atari_score": 0.0,
      "delta_atari_minus_go": -7.0
    },
    {
      "seed": 19,
      "go_score": 7.0,
      "atari_score": 0.0,
      "delta_atari_minus_go": -7.0
    }
  ]
}

low_score_analysis

{
  "threshold": 3.0,
  "atari_low_score_seeds": [
    0,
    4,
    6,
    8,
    9,
    10,
    11,
    13,
    14,
    16,
    17,
    18,
    19
  ],
  "go_low_score_seeds": [],
  "interpretation": "Atari low-score seeds remain frequent across the 20-seed window, so six-seed gains were not robust."
}

r2_samples

{
  "high_seed": {
    "seed": 1,
    "score": 13,
    "url": "https://file.aimusic.win/gemma/breakout/robust_policy_benchmark_v1_atari_seed1_high.mp4"
  },
  "low_seed": {
    "seed": 4,
    "score": 0,
    "url": "https://file.aimusic.win/gemma/breakout/robust_policy_benchmark_v1_atari_seed4_low.mp4"
  },
  "median_seed": {
    "seed": 0,
    "score": 2,
    "url": "https://file.aimusic.win/gemma/breakout/robust_policy_benchmark_v1_atari_seed0_median.mp4"
  }
}

decision

{
  "go_no_go": "no_go",
  "can_train_mainly_in_go": false,
  "reason": "Atari 20-seed distribution is too unstable for Go-primary training; continue using Go as a coarse filter with substantial Atari validation.",
  "next_recommendation": "Before a training loop, target the env gaps from env_gap_audit_v1 or make the training loop explicitly Atari-validated."
}