Breakout Training Loop MVP v1
objective
{
"name": "Breakout training loop MVP v1",
"new_policy": "breakout_policy_training_v1",
"training_location": "Go shadow env",
"training_entry": "breakout/scripts/train_policy_mvp.py",
"fixed_config": "breakout/calibration/training_loop_mvp_v1_policy_config.json",
"execution_environment": "local_fallback_pending_wsl"
}
training
{
"policy_name": "breakout_policy_training_v1",
"training_source": "Go shadow env parameter search",
"selection_rule": "low_score_count asc, score_min desc, score_mean desc",
"searched_candidates": 216,
"selected_candidate": {
"candidate_id": "c182",
"params": {
"policy_far_cap": 10.0,
"policy_near_cap": 18.0,
"policy_base_deadzone": 2.0,
"policy_near_y": 165,
"policy_far_lead": 2.0,
"policy_near_lead": 3.0,
"policy_far_deadzone": 2.0,
"policy_near_deadzone": 1.0,
"policy_panic_y": 175
},
"scores": [
9.0,
7.0,
6.0,
8.0,
7.0,
9.0
],
"score_mean": 7.666666666666667,
"score_min": 6.0,
"score_max": 9.0,
"low_score_count": 0
},
"config_path": "breakout/calibration/training_loop_mvp_v1_policy_config.json",
"rerun_policy_args": {
"policy": "breakout_policy_training_v1",
"config": "breakout/calibration/training_loop_mvp_v1_policy_config.json"
}
}
evaluation
{
"go": {
"distribution": {
"count": 20,
"scores": [
9.0,
7.0,
6.0,
8.0,
7.0,
9.0,
8.0,
9.0,
8.0,
7.0,
9.0,
8.0,
7.0,
6.0,
7.0,
7.0,
7.0,
6.0,
7.0,
7.0
],
"mean": 7.45,
"median": 7.0,
"min": 6.0,
"max": 9.0,
"stdev": 0.9733961166965892,
"low_score_count_lt3": 0
},
"rows": [
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 0,
"episode": 0,
"score": 9,
"steps": 524,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep000_seed0.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 1,
"episode": 1,
"score": 7,
"steps": 492,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep001_seed1.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 2,
"episode": 2,
"score": 6,
"steps": 467,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep002_seed2.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 3,
"episode": 3,
"score": 8,
"steps": 504,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep003_seed3.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 4,
"episode": 4,
"score": 7,
"steps": 482,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep004_seed4.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 5,
"episode": 5,
"score": 9,
"steps": 524,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep005_seed5.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 6,
"episode": 6,
"score": 8,
"steps": 510,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep006_seed6.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 7,
"episode": 7,
"score": 9,
"steps": 516,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep007_seed7.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 8,
"episode": 8,
"score": 8,
"steps": 499,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep008_seed8.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 9,
"episode": 9,
"score": 7,
"steps": 488,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep009_seed9.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 10,
"episode": 10,
"score": 9,
"steps": 516,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep010_seed10.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 11,
"episode": 11,
"score": 8,
"steps": 504,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep011_seed11.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 12,
"episode": 12,
"score": 7,
"steps": 484,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep012_seed12.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 13,
"episode": 13,
"score": 6,
"steps": 465,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep013_seed13.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 14,
"episode": 14,
"score": 7,
"steps": 484,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep014_seed14.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 15,
"episode": 15,
"score": 7,
"steps": 484,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep015_seed15.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 16,
"episode": 16,
"score": 7,
"steps": 477,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep016_seed16.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 17,
"episode": 17,
"score": 6,
"steps": 468,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep017_seed17.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 18,
"episode": 18,
"score": 7,
"steps": 484,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep018_seed18.jsonl"
},
{
"timestamp": "20260510T134809Z",
"trial_index": 1,
"policy_name": "breakout_policy_training_v1",
"seed": 19,
"episode": 19,
"score": 7,
"steps": 484,
"done": true,
"replay_path": "/tmp/training-loop-local/replays/20260510T134809Z_ep019_seed19.jsonl"
}
]
},
"atari": {
"distribution": {
"count": 20,
"scores": [
2.0,
0.0,
13.0,
2.0,
0.0,
0.0,
0.0,
0.0,
13.0,
0.0,
13.0,
0.0,
2.0,
0.0,
0.0,
2.0,
2.0,
2.0,
0.0,
13.0
],
"mean": 3.2,
"median": 1.0,
"min": 0.0,
"max": 13.0,
"stdev": 4.975942121849892,
"low_score_count_lt3": 16
},
"rows": [
{
"episode": 0,
"seed": 0,
"score": 2.0,
"steps": 1800
},
{
"episode": 1,
"seed": 1,
"score": 0.0,
"steps": 1800
},
{
"episode": 2,
"seed": 2,
"score": 13.0,
"steps": 1800
},
{
"episode": 3,
"seed": 3,
"score": 2.0,
"steps": 1800
},
{
"episode": 4,
"seed": 4,
"score": 0.0,
"steps": 1800
},
{
"episode": 5,
"seed": 5,
"score": 0.0,
"steps": 1800
},
{
"episode": 6,
"seed": 6,
"score": 0.0,
"steps": 1800
},
{
"episode": 7,
"seed": 7,
"score": 0.0,
"steps": 1800
},
{
"episode": 8,
"seed": 8,
"score": 13.0,
"steps": 1800
},
{
"episode": 9,
"seed": 9,
"score": 0.0,
"steps": 1800
},
{
"episode": 10,
"seed": 10,
"score": 13.0,
"steps": 1800
},
{
"episode": 11,
"seed": 11,
"score": 0.0,
"steps": 1800
},
{
"episode": 12,
"seed": 12,
"score": 2.0,
"steps": 1800
},
{
"episode": 13,
"seed": 13,
"score": 0.0,
"steps": 1800
},
{
"episode": 14,
"seed": 14,
"score": 0.0,
"steps": 1800
},
{
"episode": 15,
"seed": 15,
"score": 2.0,
"steps": 1800
},
{
"episode": 16,
"seed": 16,
"score": 2.0,
"steps": 1800
},
{
"episode": 17,
"seed": 17,
"score": 2.0,
"steps": 1800
},
{
"episode": 18,
"seed": 18,
"score": 0.0,
"steps": 1800
},
{
"episode": 19,
"seed": 19,
"score": 13.0,
"steps": 1800
}
]
},
"baseline_atari_20_seed": {
"count": 20,
"scores": [
2.0,
13.0,
13.0,
13.0,
0.0,
13.0,
2.0,
13.0,
2.0,
0.0,
0.0,
0.0,
13.0,
2.0,
0.0,
13.0,
2.0,
2.0,
0.0,
0.0
],
"mean": 5.15,
"median": 2.0,
"min": 0.0,
"max": 13.0,
"stdev": 5.816141332533109,
"low_score_threshold": 3.0,
"low_score_count": 13
},
"atari_mean_delta_vs_baseline": -1.9500000000000002
}
decision
{
"new_policy_not_lower_than_previous_best": false,
"verdict": "training_failed",
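"failure_reason": "The Go-selected candidate topped the shadow-env ranking (Go 20-seed mean 7.45, no scores below 3) but did not transfer to the Atari 20-seed benchmark (mean 3.2 vs. baseline 5.15, delta -1.95; 16 of 20 scores below 3 vs. 13 for the baseline); this matches search_harness_v1's weak Go/Atari rank correlation and env_gap_audit_v1's env_gap diagnosis.",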
"next_recommendation": "Use Atari-validated objectives inside the training loop or address action/life-reset env gaps before relying on Go-only selection."
}
rerun_commands
[
"curl -fsSL https://file.aimusic.win/gemma/breakout/training_loop_mvp_v1_local_artifacts.tgz -o /tmp/training_loop_mvp_v1_local_artifacts.tgz && tar -xzf /tmp/training_loop_mvp_v1_local_artifacts.tgz",
"python breakout/scripts/train_policy_mvp.py --go-candidates breakout/calibration/reports/policy_search_v2_go_candidates.json",
"go run ./breakout/cmd/breakout-eval --episodes 20 --seed 0 --max-steps 1800 --policy breakout_policy_training_v1 --config breakout/calibration/training_loop_mvp_v1_policy_config.json --out-dir breakout/runs-training-loop-mvp-v1 --report breakout/calibration/reports/training_loop_mvp_v1_go_eval.html --notes training-loop-mvp-v1-go",
"python breakout/scripts/trace_breakout_env.py --episodes 20 --seed 0 --max-steps 1800 --policy breakout_policy_training_v1 --output breakout/calibration/traces/training_loop_mvp_v1_atari.jsonl --no-features",
"python breakout/scripts/trace_breakout_env.py --episodes 1 --seed 0 --max-steps 1800 --policy breakout_policy_training_v1 --output breakout/calibration/traces/training_loop_mvp_v1_atari_seed0_video.jsonl --video-path breakout/calibration/reports/training_loop_mvp_v1_atari_seed0.mp4 --video-every 8 --r2-upload --r2-prefix gemma/breakout",
"python breakout/scripts/make_training_loop_report.py --policy-manifest breakout/calibration/reports/training_loop_mvp_v1_policy.json --baseline-report breakout/calibration/reports/robust_policy_benchmark_v1_report.json --go-episodes breakout/runs-training-loop-mvp-v1/episodes.jsonl --atari-trace breakout/calibration/traces/training_loop_mvp_v1_atari.jsonl --r2-video-url https://file.aimusic.win/gemma/breakout/training_loop_mvp_v1_atari_seed0.mp4 --execution-environment wsl --json-out breakout/calibration/reports/training_loop_mvp_v1_report.json --html-out breakout/calibration/reports/training_loop_mvp_v1_report.html",
"bash breakout/scripts/run_training_loop_mvp_v1_wsl.sh"
]
evidence
{
"r2_video_url": "https://file.aimusic.win/gemma/breakout/training_loop_mvp_v1_atari_seed0.mp4"
}