Breakout Training Loop MVP v1

objective

{
  "name": "Breakout training loop MVP v1",
  "new_policy": "breakout_policy_training_v1",
  "training_location": "Go shadow env",
  "training_entry": "breakout/scripts/train_policy_mvp.py",
  "fixed_config": "breakout/calibration/training_loop_mvp_v1_policy_config.json",
  "execution_environment": "local_fallback_pending_wsl"
}

training

{
  "policy_name": "breakout_policy_training_v1",
  "training_source": "Go shadow env parameter search",
  "selection_rule": "low_score_count asc, score_min desc, score_mean desc",
  "searched_candidates": 216,
  "selected_candidate": {
    "candidate_id": "c182",
    "params": {
      "policy_far_cap": 10.0,
      "policy_near_cap": 18.0,
      "policy_base_deadzone": 2.0,
      "policy_near_y": 165,
      "policy_far_lead": 2.0,
      "policy_near_lead": 3.0,
      "policy_far_deadzone": 2.0,
      "policy_near_deadzone": 1.0,
      "policy_panic_y": 175
    },
    "scores": [
      9.0,
      7.0,
      6.0,
      8.0,
      7.0,
      9.0
    ],
    "score_mean": 7.666666666666667,
    "score_min": 6.0,
    "score_max": 9.0,
    "low_score_count": 0
  },
  "config_path": "breakout/calibration/training_loop_mvp_v1_policy_config.json",
  "rerun_policy_args": {
    "policy": "breakout_policy_training_v1",
    "config": "breakout/calibration/training_loop_mvp_v1_policy_config.json"
  }
}

evaluation

{
  "go": {
    "distribution": {
      "count": 20,
      "scores": [
        9.0,
        7.0,
        6.0,
        8.0,
        7.0,
        9.0,
        8.0,
        9.0,
        8.0,
        7.0,
        9.0,
        8.0,
        7.0,
        6.0,
        7.0,
        7.0,
        7.0,
        6.0,
        7.0,
        7.0
      ],
      "mean": 7.45,
      "median": 7.0,
      "min": 6.0,
      "max": 9.0,
      "stdev": 0.9733961166965892,
      "low_score_count_lt3": 0
    },
    "rows": [
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 0,
        "episode": 0,
        "score": 9,
        "steps": 524,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep000_seed0.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 1,
        "episode": 1,
        "score": 7,
        "steps": 492,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep001_seed1.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 2,
        "episode": 2,
        "score": 6,
        "steps": 467,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep002_seed2.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 3,
        "episode": 3,
        "score": 8,
        "steps": 504,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep003_seed3.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 4,
        "episode": 4,
        "score": 7,
        "steps": 482,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep004_seed4.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 5,
        "episode": 5,
        "score": 9,
        "steps": 524,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep005_seed5.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 6,
        "episode": 6,
        "score": 8,
        "steps": 510,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep006_seed6.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 7,
        "episode": 7,
        "score": 9,
        "steps": 516,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep007_seed7.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 8,
        "episode": 8,
        "score": 8,
        "steps": 499,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep008_seed8.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 9,
        "episode": 9,
        "score": 7,
        "steps": 488,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep009_seed9.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 10,
        "episode": 10,
        "score": 9,
        "steps": 516,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep010_seed10.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 11,
        "episode": 11,
        "score": 8,
        "steps": 504,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep011_seed11.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 12,
        "episode": 12,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep012_seed12.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 13,
        "episode": 13,
        "score": 6,
        "steps": 465,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep013_seed13.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 14,
        "episode": 14,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep014_seed14.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 15,
        "episode": 15,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep015_seed15.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 16,
        "episode": 16,
        "score": 7,
        "steps": 477,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep016_seed16.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 17,
        "episode": 17,
        "score": 6,
        "steps": 468,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep017_seed17.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 18,
        "episode": 18,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep018_seed18.jsonl"
      },
      {
        "timestamp": "20260510T134809Z",
        "trial_index": 1,
        "policy_name": "breakout_policy_training_v1",
        "seed": 19,
        "episode": 19,
        "score": 7,
        "steps": 484,
        "done": true,
        "replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep019_seed19.jsonl"
      }
    ]
  },
  "atari": {
    "distribution": {
      "count": 20,
      "scores": [
        2.0,
        0.0,
        13.0,
        2.0,
        0.0,
        0.0,
        0.0,
        0.0,
        13.0,
        0.0,
        13.0,
        0.0,
        2.0,
        0.0,
        0.0,
        2.0,
        2.0,
        2.0,
        0.0,
        13.0
      ],
      "mean": 3.2,
      "median": 1.0,
      "min": 0.0,
      "max": 13.0,
      "stdev": 4.975942121849892,
      "low_score_count_lt3": 16
    },
    "rows": [
      {
        "episode": 0,
        "seed": 0,
        "score": 2.0,
        "steps": 1800
      },
      {
        "episode": 1,
        "seed": 1,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 2,
        "seed": 2,
        "score": 13.0,
        "steps": 1800
      },
      {
        "episode": 3,
        "seed": 3,
        "score": 2.0,
        "steps": 1800
      },
      {
        "episode": 4,
        "seed": 4,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 5,
        "seed": 5,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 6,
        "seed": 6,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 7,
        "seed": 7,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 8,
        "seed": 8,
        "score": 13.0,
        "steps": 1800
      },
      {
        "episode": 9,
        "seed": 9,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 10,
        "seed": 10,
        "score": 13.0,
        "steps": 1800
      },
      {
        "episode": 11,
        "seed": 11,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 12,
        "seed": 12,
        "score": 2.0,
        "steps": 1800
      },
      {
        "episode": 13,
        "seed": 13,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 14,
        "seed": 14,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 15,
        "seed": 15,
        "score": 2.0,
        "steps": 1800
      },
      {
        "episode": 16,
        "seed": 16,
        "score": 2.0,
        "steps": 1800
      },
      {
        "episode": 17,
        "seed": 17,
        "score": 2.0,
        "steps": 1800
      },
      {
        "episode": 18,
        "seed": 18,
        "score": 0.0,
        "steps": 1800
      },
      {
        "episode": 19,
        "seed": 19,
        "score": 13.0,
        "steps": 1800
      }
    ]
  },
  "baseline_atari_20_seed": {
    "count": 20,
    "scores": [
      2.0,
      13.0,
      13.0,
      13.0,
      0.0,
      13.0,
      2.0,
      13.0,
      2.0,
      0.0,
      0.0,
      0.0,
      13.0,
      2.0,
      0.0,
      13.0,
      2.0,
      2.0,
      0.0,
      0.0
    ],
    "mean": 5.15,
    "median": 2.0,
    "min": 0.0,
    "max": 13.0,
    "stdev": 5.816141332533109,
    "low_score_threshold": 3.0,
    "low_score_count": 13
  },
  "atari_mean_delta_vs_baseline": -1.9500000000000002
}

decision

{
  "new_policy_not_lower_than_previous_best": false,
  "verdict": "training_failed",
  "failure_reason": "The candidate selected by the Go shadow-env ranking did not transfer to the Atari 20-seed benchmark (Atari mean 3.2 vs. baseline mean 5.15, delta -1.95); this is consistent with the weak Go/Atari rank correlation found in search_harness_v1 and the environment-gap diagnosis in env_gap_audit_v1.",
  "next_recommendation": "Use Atari-validated objectives inside the training loop, or close the action and life-reset gaps between the Go shadow env and Atari, before relying on Go-only selection."
}

rerun_commands

[
  "curl -fsSL https://file.aimusic.win/gemma/breakout/training_loop_mvp_v1_local_artifacts.tgz -o /tmp/training_loop_mvp_v1_local_artifacts.tgz && tar -xzf /tmp/training_loop_mvp_v1_local_artifacts.tgz",
  "python breakout/scripts/train_policy_mvp.py --go-candidates breakout/calibration/reports/policy_search_v2_go_candidates.json",
  "go run ./breakout/cmd/breakout-eval --episodes 20 --seed 0 --max-steps 1800 --policy breakout_policy_training_v1 --config breakout/calibration/training_loop_mvp_v1_policy_config.json --out-dir breakout/runs-training-loop-mvp-v1 --report breakout/calibration/reports/training_loop_mvp_v1_go_eval.html --notes training-loop-mvp-v1-go",
  "python breakout/scripts/trace_breakout_env.py --episodes 20 --seed 0 --max-steps 1800 --policy breakout_policy_training_v1 --output breakout/calibration/traces/training_loop_mvp_v1_atari.jsonl --no-features",
  "python breakout/scripts/trace_breakout_env.py --episodes 1 --seed 0 --max-steps 1800 --policy breakout_policy_training_v1 --output breakout/calibration/traces/training_loop_mvp_v1_atari_seed0_video.jsonl --video-path breakout/calibration/reports/training_loop_mvp_v1_atari_seed0.mp4 --video-every 8 --r2-upload --r2-prefix gemma/breakout",
  "python breakout/scripts/make_training_loop_report.py --policy-manifest breakout/calibration/reports/training_loop_mvp_v1_policy.json --baseline-report breakout/calibration/reports/robust_policy_benchmark_v1_report.json --go-episodes breakout/runs-training-loop-mvp-v1/episodes.jsonl --atari-trace breakout/calibration/traces/training_loop_mvp_v1_atari.jsonl --r2-video-url https://file.aimusic.win/gemma/breakout/training_loop_mvp_v1_atari_seed0.mp4 --execution-environment wsl --json-out breakout/calibration/reports/training_loop_mvp_v1_report.json --html-out breakout/calibration/reports/training_loop_mvp_v1_report.html",
  "bash breakout/scripts/run_training_loop_mvp_v1_wsl.sh"
]

evidence

{
  "r2_video_url": "https://file.aimusic.win/gemma/breakout/training_loop_mvp_v1_atari_seed0.mp4"
}