{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1282051282051282, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8692.0, "completions/max_terminated_length": 8692.0, "completions/mean_length": 5263.90625, "completions/mean_terminated_length": 5263.90625, "completions/min_length": 2046.0, "completions/min_terminated_length": 2046.0, "epoch": 0.002136752136752137, "grad_norm": 0.13524121910390705, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0183, "num_tokens": 321111.0, "reward": 13.089506149291992, "reward_std": 5.828127861022949, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8307911157608032, "rewards/successful_no_deal_reward_func": 0.453125, "rewards/successful_parsing_reward_func": 0.8847842216491699, "rewards/utility_reward_func": 0.398827463388443, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 8858.0, "completions/max_terminated_length": 8858.0, "completions/mean_length": 5393.53125, "completions/mean_terminated_length": 5567.5161290322585, "completions/min_length": 0.0, "completions/min_terminated_length": 2933.0, "epoch": 0.004273504273504274, "grad_norm": 0.14125326998143348, "kl": 0.0, "learning_rate": 4.000000000000001e-06, "loss": 0.0127, "num_tokens": 681435.0, "reward": 10.29290771484375, "reward_std": 9.241214752197266, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.7181423306465149, "rewards/successful_no_deal_reward_func": 0.34375, "rewards/successful_parsing_reward_func": 0.7481849789619446, "rewards/utility_reward_func": 0.31008416414260864, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10108.0, "completions/max_terminated_length": 10108.0, "completions/mean_length": 5457.53125, "completions/mean_terminated_length": 5633.580645161291, "completions/min_length": 0.0, "completions/min_terminated_length": 2403.0, "epoch": 0.00641025641025641, "grad_norm": 0.12193342337851935, "kl": 0.003643035888671875, "learning_rate": 8.000000000000001e-06, "loss": -0.0081, "num_tokens": 1012660.0, "reward": 13.357504844665527, "reward_std": 5.522988796234131, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8510912656784058, "rewards/successful_no_deal_reward_func": 0.421875, "rewards/successful_parsing_reward_func": 0.9299927949905396, "rewards/utility_reward_func": 0.40858402848243713, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10325.0, "completions/max_terminated_length": 10325.0, "completions/mean_length": 5183.53125, "completions/mean_terminated_length": 5350.741935483871, "completions/min_length": 0.0, "completions/min_terminated_length": 2541.0, "epoch": 0.008547008547008548, "grad_norm": 0.09992499424145265, "kl": 0.0036802291870117188, "learning_rate": 1.2e-05, "loss": -0.0117, "num_tokens": 1352107.0, "reward": 11.811685562133789, "reward_std": 5.946799278259277, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8307291865348816, "rewards/successful_no_deal_reward_func": 0.390625, "rewards/successful_parsing_reward_func": 0.8160351514816284, "rewards/utility_reward_func": 0.35854610800743103, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10002.0, "completions/max_terminated_length": 10002.0, "completions/mean_length": 6005.25, "completions/mean_terminated_length": 6198.967741935484, "completions/min_length": 0.0, "completions/min_terminated_length": 2885.0, "epoch": 0.010683760683760684, "grad_norm": 0.09868345069469578, "kl": 0.0058193206787109375, "learning_rate": 1.6000000000000003e-05, "loss": 0.0168, "num_tokens": 1706933.0, "reward": 6.7459492683410645, "reward_std": 3.5795390605926514, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8775534629821777, "rewards/successful_no_deal_reward_func": 0.578125, "rewards/successful_parsing_reward_func": 0.9091060161590576, "rewards/utility_reward_func": 0.18297193944454193, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10113.0, "completions/max_terminated_length": 10113.0, "completions/mean_length": 5148.71875, "completions/mean_terminated_length": 5314.806451612903, "completions/min_length": 0.0, "completions/min_terminated_length": 2663.0, "epoch": 0.01282051282051282, "grad_norm": 0.07418858759257596, "kl": 0.014812469482421875, "learning_rate": 2e-05, "loss": 0.019, "num_tokens": 2024143.0, "reward": 7.305600643157959, "reward_std": 2.82883358001709, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8985738754272461, "rewards/successful_no_deal_reward_func": 0.5625, "rewards/successful_parsing_reward_func": 0.8988806009292603, "rewards/utility_reward_func": 0.2021118402481079, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 9059.0, "completions/max_terminated_length": 9059.0, "completions/mean_length": 5224.0625, "completions/mean_terminated_length": 5572.333333333333, "completions/min_length": 0.0, "completions/min_terminated_length": 2033.0, "epoch": 0.014957264957264958, "grad_norm": 0.0770740966039767, "kl": 0.023563385009765625, "learning_rate": 2e-05, "loss": -0.0037, "num_tokens": 2368266.0, "reward": 7.440418243408203, "reward_std": 3.8883907794952393, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.7994791865348816, "rewards/successful_no_deal_reward_func": 0.53125, "rewards/successful_parsing_reward_func": 0.8668774366378784, "rewards/utility_reward_func": 0.20834481716156006, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10161.0, "completions/max_terminated_length": 10161.0, "completions/mean_length": 6345.90625, "completions/mean_terminated_length": 6345.90625, "completions/min_length": 3192.0, "completions/min_terminated_length": 3192.0, "epoch": 0.017094017094017096, "grad_norm": 0.13080769358982583, "kl": 0.03223419189453125, "learning_rate": 2e-05, "loss": 0.0234, "num_tokens": 2727219.0, "reward": 6.599692344665527, "reward_std": 3.7256505489349365, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8149601221084595, "rewards/successful_no_deal_reward_func": 0.59375, "rewards/successful_parsing_reward_func": 0.8814631700515747, "rewards/utility_reward_func": 0.17787665128707886, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8574.0, "completions/max_terminated_length": 8574.0, "completions/mean_length": 5724.71875, "completions/mean_terminated_length": 5724.71875, "completions/min_length": 3321.0, "completions/min_terminated_length": 3321.0, "epoch": 0.019230769230769232, "grad_norm": 0.2718145594331439, "kl": 0.06011962890625, "learning_rate": 2e-05, "loss": 0.0504, "num_tokens": 3052863.0, "reward": 2.8698644638061523, "reward_std": 1.5825773477554321, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9517256021499634, "rewards/successful_no_deal_reward_func": 0.84375, "rewards/successful_parsing_reward_func": 0.946114182472229, "rewards/utility_reward_func": 0.04454435780644417, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9763.0, "completions/max_terminated_length": 9763.0, "completions/mean_length": 5663.1875, "completions/mean_terminated_length": 5663.1875, "completions/min_length": 2606.0, "completions/min_terminated_length": 2606.0, "epoch": 0.021367521367521368, "grad_norm": 0.1574477673918289, "kl": 0.047687530517578125, "learning_rate": 2e-05, "loss": -0.0117, "num_tokens": 3387072.0, "reward": 2.5430192947387695, "reward_std": 1.3082820177078247, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9754464626312256, "rewards/successful_no_deal_reward_func": 0.8125, "rewards/successful_parsing_reward_func": 0.9150803089141846, "rewards/utility_reward_func": 0.034715551882982254, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 9994.0, "completions/max_terminated_length": 9994.0, "completions/mean_length": 5518.375, "completions/mean_terminated_length": 5886.266666666666, "completions/min_length": 0.0, "completions/min_terminated_length": 3232.0, "epoch": 0.023504273504273504, "grad_norm": 0.14592223795177592, "kl": 0.050079345703125, "learning_rate": 2e-05, "loss": -0.0075, "num_tokens": 3740116.0, "reward": 2.410231828689575, "reward_std": 0.9183810949325562, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9459468126296997, "rewards/successful_no_deal_reward_func": 0.828125, "rewards/successful_parsing_reward_func": 0.8924851417541504, "rewards/utility_reward_func": 0.029942119494080544, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9545.0, "completions/max_terminated_length": 9545.0, "completions/mean_length": 6346.125, "completions/mean_terminated_length": 6346.125, "completions/min_length": 3742.0, "completions/min_terminated_length": 3742.0, "epoch": 0.02564102564102564, "grad_norm": 3.311026969817496, "kl": 0.34075927734375, "learning_rate": 2e-05, "loss": 0.0373, "num_tokens": 4094936.0, "reward": 2.6738317012786865, "reward_std": 0.92072594165802, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8866342306137085, "rewards/successful_no_deal_reward_func": 0.84375, "rewards/successful_parsing_reward_func": 0.9354434013366699, "rewards/utility_reward_func": 0.038262464106082916, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10170.0, "completions/max_terminated_length": 10170.0, "completions/mean_length": 5580.75, "completions/mean_terminated_length": 5580.75, "completions/min_length": 1848.0, "completions/min_terminated_length": 1848.0, "epoch": 0.027777777777777776, "grad_norm": 18.64484255853941, "kl": 0.9246597290039062, "learning_rate": 2e-05, "loss": 0.0331, "num_tokens": 4433282.0, "reward": 12.720104217529297, "reward_std": 8.21227741241455, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.7824472188949585, "rewards/successful_no_deal_reward_func": 0.40625, "rewards/successful_parsing_reward_func": 0.8727678060531616, "rewards/utility_reward_func": 0.38827773928642273, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9335.0, "completions/max_terminated_length": 9335.0, "completions/mean_length": 5595.25, "completions/mean_terminated_length": 5595.25, "completions/min_length": 2111.0, "completions/min_terminated_length": 2111.0, "epoch": 0.029914529914529916, "grad_norm": 0.07969555942261197, "kl": 0.0496368408203125, "learning_rate": 2e-05, "loss": 0.0091, "num_tokens": 4779367.0, "reward": 13.972558975219727, "reward_std": 6.738990306854248, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.8297809958457947, "rewards/successful_no_deal_reward_func": 0.421875, "rewards/successful_parsing_reward_func": 0.8079355955123901, "rewards/utility_reward_func": 0.4298241138458252, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 8997.0, "completions/max_terminated_length": 8997.0, "completions/mean_length": 5286.6875, "completions/mean_terminated_length": 5457.225806451613, "completions/min_length": 0.0, "completions/min_terminated_length": 2698.0, "epoch": 0.03205128205128205, "grad_norm": 0.06101731197561454, "kl": 0.05796051025390625, "learning_rate": 2e-05, "loss": 0.0022, "num_tokens": 5122315.0, "reward": 15.198302268981934, "reward_std": 7.127710342407227, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8440600633621216, "rewards/successful_no_deal_reward_func": 0.4375, "rewards/successful_parsing_reward_func": 0.8397558331489563, "rewards/utility_reward_func": 0.4697473347187042, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 9528.0, "completions/max_terminated_length": 9528.0, "completions/mean_length": 5658.65625, "completions/mean_terminated_length": 5841.193548387097, "completions/min_length": 0.0, "completions/min_terminated_length": 2804.0, "epoch": 0.03418803418803419, "grad_norm": 0.1035264599440814, "kl": 0.056243896484375, "learning_rate": 2e-05, "loss": 0.0203, "num_tokens": 5477111.0, "reward": 12.287153244018555, "reward_std": 6.766469955444336, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.7561259865760803, "rewards/successful_no_deal_reward_func": 0.390625, "rewards/successful_parsing_reward_func": 0.8038890957832336, "rewards/utility_reward_func": 0.37468421459198, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 9185.0, "completions/max_terminated_length": 9185.0, "completions/mean_length": 4713.1875, "completions/mean_terminated_length": 5027.4, "completions/min_length": 0.0, "completions/min_terminated_length": 2639.0, "epoch": 0.03632478632478633, "grad_norm": 0.0433494341400444, "kl": 0.070892333984375, "learning_rate": 2e-05, "loss": -0.0119, "num_tokens": 5795811.0, "reward": 8.16100788116455, "reward_std": 2.8902506828308105, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9079613089561462, "rewards/successful_no_deal_reward_func": 0.453125, "rewards/successful_parsing_reward_func": 0.8510779142379761, "rewards/utility_reward_func": 0.23439928889274597, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 9330.0, "completions/max_terminated_length": 9330.0, "completions/mean_length": 5696.59375, "completions/mean_terminated_length": 5880.354838709677, "completions/min_length": 0.0, "completions/min_terminated_length": 2147.0, "epoch": 0.038461538461538464, "grad_norm": 0.07053711116404891, "kl": 0.065277099609375, "learning_rate": 2e-05, "loss": 0.0277, "num_tokens": 6150385.0, "reward": 7.081500053405762, "reward_std": 4.265430927276611, "rewards/constraint_satisfaction_reward_func": 0.96875, "rewards/successful_code_execution_reward_func": 0.7555555701255798, "rewards/successful_no_deal_reward_func": 0.375, "rewards/successful_parsing_reward_func": 0.862622857093811, "rewards/utility_reward_func": 0.20201024413108826, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10173.0, "completions/max_terminated_length": 10173.0, "completions/mean_length": 6548.21875, "completions/mean_terminated_length": 6548.21875, "completions/min_length": 2994.0, "completions/min_terminated_length": 2994.0, "epoch": 0.0405982905982906, "grad_norm": 0.09672399682608723, "kl": 0.069854736328125, "learning_rate": 2e-05, "loss": 0.0237, "num_tokens": 6513232.0, "reward": 7.472269058227539, "reward_std": 3.3596060276031494, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8278397917747498, "rewards/successful_no_deal_reward_func": 0.4375, "rewards/successful_parsing_reward_func": 0.9014575481414795, "rewards/utility_reward_func": 0.2120613008737564, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10364.0, "completions/max_terminated_length": 10364.0, "completions/mean_length": 5375.28125, "completions/mean_terminated_length": 5548.677419354839, "completions/min_length": 0.0, "completions/min_terminated_length": 2140.0, "epoch": 0.042735042735042736, "grad_norm": 0.04520712883700852, "kl": 0.0749664306640625, "learning_rate": 2e-05, "loss": -0.0097, "num_tokens": 6855140.0, "reward": 9.597513198852539, "reward_std": 3.5805306434631348, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.8538690805435181, "rewards/successful_no_deal_reward_func": 0.453125, "rewards/successful_parsing_reward_func": 0.8160218596458435, "rewards/utility_reward_func": 0.28284043073654175, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 9343.0, "completions/max_terminated_length": 9343.0, "completions/mean_length": 5354.9375, "completions/mean_terminated_length": 5711.933333333333, "completions/min_length": 0.0, "completions/min_terminated_length": 2588.0, "epoch": 0.04487179487179487, "grad_norm": 0.13324734003119365, "kl": 0.07419586181640625, "learning_rate": 2e-05, "loss": 0.0278, "num_tokens": 7204532.0, "reward": 8.765981674194336, "reward_std": 5.346878528594971, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.7741690874099731, "rewards/successful_no_deal_reward_func": 0.515625, "rewards/successful_parsing_reward_func": 0.8718490600585938, "rewards/utility_reward_func": 0.25311893224716187, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8788.0, "completions/max_terminated_length": 8788.0, "completions/mean_length": 5734.65625, "completions/mean_terminated_length": 5734.65625, "completions/min_length": 3138.0, "completions/min_terminated_length": 3138.0, "epoch": 0.04700854700854701, "grad_norm": 0.04722558093607279, "kl": 0.0833892822265625, "learning_rate": 2e-05, "loss": 0.0062, "num_tokens": 7540679.0, "reward": 10.483989715576172, "reward_std": 2.554959297180176, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.8787944912910461, "rewards/successful_no_deal_reward_func": 0.546875, "rewards/successful_parsing_reward_func": 0.898411750793457, "rewards/utility_reward_func": 0.3089068531990051, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8693.0, "completions/max_terminated_length": 8693.0, "completions/mean_length": 5031.03125, "completions/mean_terminated_length": 5031.03125, "completions/min_length": 2639.0, "completions/min_terminated_length": 2639.0, "epoch": 0.049145299145299144, "grad_norm": 0.07640969114726487, "kl": 0.08179473876953125, "learning_rate": 2e-05, "loss": 0.0515, "num_tokens": 7855954.0, "reward": 10.10434341430664, "reward_std": 2.6132378578186035, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8531621694564819, "rewards/successful_no_deal_reward_func": 0.609375, "rewards/successful_parsing_reward_func": 0.864062488079071, "rewards/utility_reward_func": 0.29410821199417114, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9740.0, "completions/max_terminated_length": 9740.0, "completions/mean_length": 6291.0625, "completions/mean_terminated_length": 6291.0625, "completions/min_length": 2627.0, "completions/min_terminated_length": 2627.0, "epoch": 0.05128205128205128, "grad_norm": 0.0976001686813385, "kl": 0.0815277099609375, "learning_rate": 2e-05, "loss": -0.0036, "num_tokens": 8217282.0, "reward": 10.006271362304688, "reward_std": 4.209831237792969, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.779737114906311, "rewards/successful_no_deal_reward_func": 0.578125, "rewards/successful_parsing_reward_func": 0.8936668634414673, "rewards/utility_reward_func": 0.2920268774032593, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8830.0, "completions/max_terminated_length": 8830.0, "completions/mean_length": 6104.3125, "completions/mean_terminated_length": 6104.3125, "completions/min_length": 3625.0, "completions/min_terminated_length": 3625.0, "epoch": 0.053418803418803416, "grad_norm": 0.0874243589318564, "kl": 0.082855224609375, "learning_rate": 2e-05, "loss": 0.0239, "num_tokens": 8576582.0, "reward": 3.0176899433135986, "reward_std": 1.4519190788269043, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.7582464814186096, "rewards/successful_no_deal_reward_func": 0.625, "rewards/successful_parsing_reward_func": 0.893723726272583, "rewards/utility_reward_func": 0.05758309364318848, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 8970.0, "completions/max_terminated_length": 8970.0, "completions/mean_length": 5440.53125, "completions/mean_terminated_length": 6003.3448275862065, "completions/min_length": 0.0, "completions/min_terminated_length": 3231.0, "epoch": 0.05555555555555555, "grad_norm": 0.19827483452745304, "kl": 0.111328125, "learning_rate": 2e-05, "loss": -0.0174, "num_tokens": 8944229.0, "reward": 2.325728178024292, "reward_std": 1.4577386379241943, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.791290819644928, "rewards/successful_no_deal_reward_func": 0.59375, "rewards/successful_parsing_reward_func": 0.888812243938446, "rewards/utility_reward_func": 0.03546559810638428, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10391.0, "completions/max_terminated_length": 10391.0, "completions/mean_length": 5703.25, "completions/mean_terminated_length": 5703.25, "completions/min_length": 2291.0, "completions/min_terminated_length": 2291.0, "epoch": 0.057692307692307696, "grad_norm": 0.07807090906536783, "kl": 0.07916259765625, "learning_rate": 2e-05, "loss": -0.0036, "num_tokens": 9298581.0, "reward": 2.7880592346191406, "reward_std": 1.5752832889556885, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8456540703773499, "rewards/successful_no_deal_reward_func": 0.65625, "rewards/successful_parsing_reward_func": 0.8798115253448486, "rewards/utility_reward_func": 0.04864208400249481, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 9834.0, "completions/max_terminated_length": 9834.0, "completions/mean_length": 5653.6875, "completions/mean_terminated_length": 5836.064516129032, "completions/min_length": 0.0, "completions/min_terminated_length": 2561.0, "epoch": 0.05982905982905983, "grad_norm": 0.04981913139295697, "kl": 0.0860748291015625, "learning_rate": 2e-05, "loss": -0.0036, "num_tokens": 9645929.0, "reward": 2.931215763092041, "reward_std": 2.0780129432678223, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9096354246139526, "rewards/successful_no_deal_reward_func": 0.65625, "rewards/successful_parsing_reward_func": 0.9329229593276978, "rewards/utility_reward_func": 0.05302366986870766, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10930.0, "completions/max_terminated_length": 10930.0, "completions/mean_length": 5840.9375, "completions/mean_terminated_length": 5840.9375, "completions/min_length": 2371.0, "completions/min_terminated_length": 2371.0, "epoch": 0.06196581196581197, "grad_norm": 0.43603578566490725, "kl": 0.163726806640625, "learning_rate": 2e-05, "loss": 0.0195, "num_tokens": 10014133.0, "reward": 11.773115158081055, "reward_std": 6.266353607177734, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.724196195602417, "rewards/successful_no_deal_reward_func": 0.359375, "rewards/successful_parsing_reward_func": 0.7893494367599487, "rewards/utility_reward_func": 0.35874617099761963, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9489.0, "completions/max_terminated_length": 9489.0, "completions/mean_length": 5918.46875, "completions/mean_terminated_length": 5918.46875, "completions/min_length": 2629.0, "completions/min_terminated_length": 2629.0, "epoch": 0.0641025641025641, "grad_norm": 1.5094761839926756, "kl": 0.1993255615234375, "learning_rate": 2e-05, "loss": -0.0006, "num_tokens": 10389894.0, "reward": 10.936625480651855, "reward_std": 4.752138137817383, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.7032060623168945, "rewards/successful_no_deal_reward_func": 0.359375, "rewards/successful_parsing_reward_func": 0.7675862312316895, "rewards/utility_reward_func": 0.331005722284317, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 11193.0, "completions/max_terminated_length": 11193.0, "completions/mean_length": 5057.46875, "completions/mean_terminated_length": 5220.612903225807, "completions/min_length": 0.0, "completions/min_terminated_length": 1654.0, "epoch": 0.06623931623931624, "grad_norm": 0.045208953033285613, "kl": 0.082183837890625, "learning_rate": 2e-05, "loss": -0.0222, "num_tokens": 10722626.0, "reward": 12.012799263000488, "reward_std": 5.420868873596191, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9069940447807312, "rewards/successful_no_deal_reward_func": 0.4375, "rewards/successful_parsing_reward_func": 0.7996008992195129, "rewards/utility_reward_func": 0.36348801851272583, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10599.0, "completions/max_terminated_length": 10599.0, "completions/mean_length": 5630.34375, "completions/mean_terminated_length": 5811.967741935484, "completions/min_length": 0.0, "completions/min_terminated_length": 2850.0, "epoch": 0.06837606837606838, "grad_norm": 0.045623682683562025, "kl": 0.0789337158203125, "learning_rate": 2e-05, "loss": -0.0068, "num_tokens": 11085010.0, "reward": 10.26258659362793, "reward_std": 7.012184143066406, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8258308172225952, "rewards/successful_no_deal_reward_func": 0.375, "rewards/successful_parsing_reward_func": 0.8364831209182739, "rewards/utility_reward_func": 0.30737853050231934, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 8891.0, "completions/max_terminated_length": 8891.0, "completions/mean_length": 4892.84375, "completions/mean_terminated_length": 5050.677419354839, "completions/min_length": 0.0, "completions/min_terminated_length": 2852.0, "epoch": 0.07051282051282051, "grad_norm": 0.05892387943921885, "kl": 0.0875091552734375, "learning_rate": 2e-05, "loss": 0.0001, "num_tokens": 11423845.0, "reward": 9.62917423248291, "reward_std": 4.717038631439209, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.8024696111679077, "rewards/successful_no_deal_reward_func": 0.4375, "rewards/successful_parsing_reward_func": 0.809061586856842, "rewards/utility_reward_func": 0.2846111059188843, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9959.0, "completions/max_terminated_length": 9959.0, "completions/mean_length": 5024.46875, "completions/mean_terminated_length": 5024.46875, "completions/min_length": 2542.0, "completions/min_terminated_length": 2542.0, "epoch": 0.07264957264957266, "grad_norm": 0.04089145594446082, "kl": 0.084930419921875, "learning_rate": 2e-05, "loss": -0.0046, "num_tokens": 11749425.0, "reward": 9.058746337890625, "reward_std": 3.817065715789795, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.8897197246551514, "rewards/successful_no_deal_reward_func": 0.46875, "rewards/successful_parsing_reward_func": 0.8192257285118103, "rewards/utility_reward_func": 0.26423048973083496, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9616.0, "completions/max_terminated_length": 9616.0, "completions/mean_length": 5304.84375, "completions/mean_terminated_length": 5304.84375, "completions/min_length": 2674.0, "completions/min_terminated_length": 2674.0, "epoch": 0.07478632478632478, "grad_norm": 0.31033660874854463, "kl": 0.0949249267578125, "learning_rate": 2e-05, "loss": -0.006, "num_tokens": 12101010.0, "reward": 9.125823974609375, "reward_std": 3.129967212677002, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8177951574325562, "rewards/successful_no_deal_reward_func": 0.40625, "rewards/successful_parsing_reward_func": 0.7546502351760864, "rewards/utility_reward_func": 0.2687443196773529, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 9969.0, "completions/max_terminated_length": 9969.0, "completions/mean_length": 5238.09375, "completions/mean_terminated_length": 5587.3, "completions/min_length": 0.0, "completions/min_terminated_length": 3429.0, "epoch": 0.07692307692307693, "grad_norm": 0.03256188449171459, "kl": 0.0983734130859375, "learning_rate": 2e-05, "loss": 0.0206, "num_tokens": 12461361.0, "reward": 8.622406959533691, "reward_std": 3.5288777351379395, "rewards/constraint_satisfaction_reward_func": 0.953125, "rewards/successful_code_execution_reward_func": 0.9133804440498352, "rewards/successful_no_deal_reward_func": 0.421875, "rewards/successful_parsing_reward_func": 0.8293843865394592, "rewards/utility_reward_func": 0.2516564726829529, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 9716.0, "completions/max_terminated_length": 9716.0, "completions/mean_length": 5179.75, "completions/mean_terminated_length": 5525.066666666667, "completions/min_length": 0.0, "completions/min_terminated_length": 3120.0, "epoch": 0.07905982905982906, "grad_norm": 0.04889712846302911, "kl": 0.073455810546875, "learning_rate": 2e-05, "loss": 0.0066, "num_tokens": 12817439.0, "reward": 8.959070205688477, "reward_std": 6.533388614654541, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8700520992279053, "rewards/successful_no_deal_reward_func": 0.40625, "rewards/successful_parsing_reward_func": 0.8051215410232544, "rewards/utility_reward_func": 0.26284343004226685, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 9623.0, "completions/max_terminated_length": 9623.0, "completions/mean_length": 5002.8125, "completions/mean_terminated_length": 5520.3448275862065, "completions/min_length": 0.0, "completions/min_terminated_length": 1646.0, "epoch": 0.0811965811965812, "grad_norm": 0.04748836707714437, "kl": 0.07926177978515625, "learning_rate": 2e-05, "loss": 0.0052, "num_tokens": 13177566.0, "reward": 9.607915878295898, "reward_std": 6.371920585632324, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.7991071343421936, "rewards/successful_no_deal_reward_func": 0.34375, "rewards/successful_parsing_reward_func": 0.8020461201667786, "rewards/utility_reward_func": 0.28680169582366943, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 9332.0, "completions/max_terminated_length": 9332.0, "completions/mean_length": 5509.34375, "completions/mean_terminated_length": 5687.064516129032, "completions/min_length": 0.0, "completions/min_terminated_length": 3683.0, "epoch": 0.08333333333333333, "grad_norm": 0.08479434572785938, "kl": 0.08233642578125, "learning_rate": 2e-05, "loss": 0.0238, "num_tokens": 13541530.0, "reward": 8.508003234863281, "reward_std": 5.044344902038574, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.7092013359069824, "rewards/successful_no_deal_reward_func": 0.34375, "rewards/successful_parsing_reward_func": 0.7772454619407654, "rewards/utility_reward_func": 0.2505202889442444, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 10125.0, "completions/max_terminated_length": 10125.0, "completions/mean_length": 5366.65625, "completions/mean_terminated_length": 5921.827586206897, "completions/min_length": 0.0, "completions/min_terminated_length": 2721.0, "epoch": 0.08547008547008547, "grad_norm": 0.046629736962909236, "kl": 0.074920654296875, "learning_rate": 2e-05, "loss": -0.0057, "num_tokens": 13930554.0, "reward": 9.409976959228516, "reward_std": 6.964659690856934, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8676711320877075, "rewards/successful_no_deal_reward_func": 0.34375, "rewards/successful_parsing_reward_func": 0.7561174035072327, "rewards/utility_reward_func": 0.28012827038764954, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10197.0, "completions/max_terminated_length": 10197.0, "completions/mean_length": 5269.65625, "completions/mean_terminated_length": 5439.645161290323, "completions/min_length": 0.0, "completions/min_terminated_length": 2822.0, "epoch": 0.0876068376068376, "grad_norm": 0.07532493246421096, "kl": 0.0913238525390625, "learning_rate": 2e-05, "loss": 0.0518, "num_tokens": 14267395.0, "reward": 10.74399185180664, "reward_std": 7.217482566833496, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8700148463249207, "rewards/successful_no_deal_reward_func": 0.40625, "rewards/successful_parsing_reward_func": 0.872061014175415, "rewards/utility_reward_func": 0.32211780548095703, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 9754.0, "completions/max_terminated_length": 9754.0, "completions/mean_length": 5356.21875, "completions/mean_terminated_length": 5713.3, "completions/min_length": 0.0, "completions/min_terminated_length": 2306.0, "epoch": 0.08974358974358974, "grad_norm": 0.041994088388066786, "kl": 0.08880615234375, "learning_rate": 2e-05, "loss": -0.0166, "num_tokens": 14631225.0, "reward": 13.259390830993652, "reward_std": 4.738910675048828, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8736110925674438, "rewards/successful_no_deal_reward_func": 0.421875, "rewards/successful_parsing_reward_func": 0.789508581161499, "rewards/utility_reward_func": 0.40570682287216187, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 8233.0, "completions/max_terminated_length": 8233.0, "completions/mean_length": 4819.65625, "completions/mean_terminated_length": 5140.966666666666, "completions/min_length": 0.0, "completions/min_terminated_length": 1512.0, "epoch": 0.09188034188034189, "grad_norm": 0.03565747695965463, "kl": 0.084442138671875, "learning_rate": 2e-05, "loss": -0.015, "num_tokens": 14967668.0, "reward": 11.417985916137695, "reward_std": 5.845874786376953, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8795138597488403, "rewards/successful_no_deal_reward_func": 0.40625, "rewards/successful_parsing_reward_func": 0.7768093347549438, "rewards/utility_reward_func": 0.34487012028694153, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9785.0, "completions/max_terminated_length": 9785.0, "completions/mean_length": 6039.5, "completions/mean_terminated_length": 6039.5, "completions/min_length": 2790.0, "completions/min_terminated_length": 2790.0, "epoch": 0.09401709401709402, "grad_norm": 0.04878414442627355, "kl": 0.0800628662109375, "learning_rate": 2e-05, "loss": 0.0123, "num_tokens": 15322843.0, "reward": 10.717945098876953, "reward_std": 6.875738143920898, "rewards/constraint_satisfaction_reward_func": 0.96875, "rewards/successful_code_execution_reward_func": 0.8796254992485046, "rewards/successful_no_deal_reward_func": 0.390625, "rewards/successful_parsing_reward_func": 0.8528273701667786, "rewards/utility_reward_func": 0.3223233222961426, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 10441.0, "completions/max_terminated_length": 10441.0, "completions/mean_length": 5009.0, "completions/mean_terminated_length": 5527.172413793103, "completions/min_length": 0.0, "completions/min_terminated_length": 2133.0, "epoch": 0.09615384615384616, "grad_norm": 0.043346101484315204, "kl": 0.090057373046875, "learning_rate": 2e-05, "loss": -0.0255, "num_tokens": 15675494.0, "reward": 6.5747551918029785, "reward_std": 4.603181838989258, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.8890997171401978, "rewards/successful_no_deal_reward_func": 0.5625, "rewards/successful_parsing_reward_func": 0.8411458730697632, "rewards/utility_reward_func": 0.1782347708940506, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10244.0, "completions/max_terminated_length": 10244.0, "completions/mean_length": 5903.40625, "completions/mean_terminated_length": 5903.40625, "completions/min_length": 2392.0, "completions/min_terminated_length": 2392.0, "epoch": 0.09829059829059829, "grad_norm": 0.05335262654380793, "kl": 0.0872802734375, "learning_rate": 2e-05, "loss": 0.0124, "num_tokens": 16026738.0, "reward": 7.486398220062256, "reward_std": 4.8832197189331055, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8818700313568115, "rewards/successful_no_deal_reward_func": 0.5625, "rewards/successful_parsing_reward_func": 0.836823582649231, "rewards/utility_reward_func": 0.20840096473693848, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10041.0, "completions/max_terminated_length": 10041.0, "completions/mean_length": 6264.78125, "completions/mean_terminated_length": 6264.78125, "completions/min_length": 2898.0, "completions/min_terminated_length": 2898.0, "epoch": 0.10042735042735043, "grad_norm": 0.06631556073214734, "kl": 0.08795166015625, "learning_rate": 2e-05, "loss": -0.0024, "num_tokens": 16382497.0, "reward": 7.394876480102539, "reward_std": 5.015570640563965, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.7953249216079712, "rewards/successful_no_deal_reward_func": 0.5625, "rewards/successful_parsing_reward_func": 0.8512028455734253, "rewards/utility_reward_func": 0.20585119724273682, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10786.0, "completions/max_terminated_length": 10786.0, "completions/mean_length": 5614.28125, "completions/mean_terminated_length": 5795.387096774193, "completions/min_length": 0.0, "completions/min_terminated_length": 2599.0, "epoch": 0.10256410256410256, "grad_norm": 0.02442773109163022, "kl": 0.1008758544921875, "learning_rate": 2e-05, "loss": -0.0175, "num_tokens": 16727867.0, "reward": 9.46392822265625, "reward_std": 5.264253616333008, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9590773582458496, "rewards/successful_no_deal_reward_func": 0.5625, "rewards/successful_parsing_reward_func": 0.8479910492897034, "rewards/utility_reward_func": 0.2740240693092346, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 10011.0, "completions/max_terminated_length": 10011.0, "completions/mean_length": 6587.53125, "completions/mean_terminated_length": 6800.032258064516, "completions/min_length": 0.0, "completions/min_terminated_length": 4411.0, "epoch": 0.1047008547008547, "grad_norm": 0.04667915631702583, "kl": 0.084930419921875, "learning_rate": 2e-05, "loss": -0.0377, "num_tokens": 17136101.0, "reward": 2.485010862350464, "reward_std": 0.9829912781715393, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9278645515441895, "rewards/successful_no_deal_reward_func": 0.640625, "rewards/successful_parsing_reward_func": 0.7758680582046509, "rewards/utility_reward_func": 0.03913374990224838, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 10045.0, "completions/max_terminated_length": 10045.0, "completions/mean_length": 6914.71875, "completions/mean_terminated_length": 7630.0344827586205, "completions/min_length": 0.0, "completions/min_terminated_length": 3860.0, "epoch": 0.10683760683760683, "grad_norm": 0.05353420751511945, "kl": 0.088897705078125, "learning_rate": 2e-05, "loss": -0.0106, "num_tokens": 17544134.0, "reward": 2.416043758392334, "reward_std": 1.4705603122711182, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.8893229365348816, "rewards/successful_no_deal_reward_func": 0.625, "rewards/successful_parsing_reward_func": 0.8974916934967041, "rewards/utility_reward_func": 0.037078749388456345, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 10681.0, "completions/max_terminated_length": 10681.0, "completions/mean_length": 7199.34375, "completions/mean_terminated_length": 7679.3, "completions/min_length": 0.0, "completions/min_terminated_length": 3120.0, "epoch": 0.10897435897435898, "grad_norm": 0.03946040934713408, "kl": 0.0816192626953125, "learning_rate": 2e-05, "loss": -0.0015, "num_tokens": 17958479.0, "reward": 2.968362331390381, "reward_std": 1.3150081634521484, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9317336082458496, "rewards/successful_no_deal_reward_func": 0.6875, "rewards/successful_parsing_reward_func": 0.9003890752792358, "rewards/utility_reward_func": 0.053254999220371246, "step": 51 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 10872.0, "completions/max_terminated_length": 10872.0, "completions/mean_length": 6536.28125, "completions/mean_terminated_length": 7212.448275862069, "completions/min_length": 0.0, "completions/min_terminated_length": 4263.0, "epoch": 0.1111111111111111, "grad_norm": 0.03718996563528003, "kl": NaN, "learning_rate": 2e-05, "loss": -0.0024, "num_tokens": 18366031.0, "reward": 2.8633077144622803, "reward_std": 1.4587385654449463, "rewards/constraint_satisfaction_reward_func": 0.96875, "rewards/successful_code_execution_reward_func": 0.9080356955528259, "rewards/successful_no_deal_reward_func": 0.625, "rewards/successful_parsing_reward_func": 0.8871028423309326, "rewards/utility_reward_func": 0.05248062312602997, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 8489.0, "completions/max_terminated_length": 8489.0, "completions/mean_length": 5517.625, "completions/mean_terminated_length": 5885.466666666666, "completions/min_length": 0.0, "completions/min_terminated_length": 3481.0, "epoch": 0.11324786324786325, "grad_norm": 0.07000245223651237, "kl": 0.095703125, "learning_rate": 2e-05, "loss": -0.0067, "num_tokens": 18727862.0, "reward": 9.138480186462402, "reward_std": 5.334235668182373, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.8934895992279053, "rewards/successful_no_deal_reward_func": 0.5625, "rewards/successful_parsing_reward_func": 0.8407135009765625, "rewards/utility_reward_func": 0.2636790871620178, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 10454.0, "completions/max_terminated_length": 10454.0, "completions/mean_length": 5502.0625, "completions/mean_terminated_length": 6071.241379310345, "completions/min_length": 0.0, "completions/min_terminated_length": 3344.0, "epoch": 0.11538461538461539, "grad_norm": 0.15958852057504191, "kl": 0.130096435546875, "learning_rate": 2e-05, "loss": -0.029, "num_tokens": 19105506.0, "reward": 8.657028198242188, "reward_std": 5.453444480895996, "rewards/constraint_satisfaction_reward_func": 0.984375, "rewards/successful_code_execution_reward_func": 0.9398065805435181, "rewards/successful_no_deal_reward_func": 0.546875, "rewards/successful_parsing_reward_func": 0.7914034128189087, "rewards/utility_reward_func": 0.24816150963306427, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 10317.0, "completions/max_terminated_length": 10317.0, "completions/mean_length": 5991.5, "completions/mean_terminated_length": 6611.310344827586, "completions/min_length": 0.0, "completions/min_terminated_length": 3320.0, "epoch": 0.11752136752136752, "grad_norm": 0.06674299870442284, "kl": 0.09130859375, "learning_rate": 2e-05, "loss": -0.032, "num_tokens": 19481400.0, "reward": 10.987039566040039, "reward_std": 4.5664963722229, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9111378192901611, "rewards/successful_no_deal_reward_func": 0.5625, "rewards/successful_parsing_reward_func": 0.8698784708976746, "rewards/utility_reward_func": 0.32488125562667847, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 11647.0, "completions/max_terminated_length": 11647.0, "completions/mean_length": 5948.34375, "completions/mean_terminated_length": 6563.689655172414, "completions/min_length": 0.0, "completions/min_terminated_length": 3207.0, "epoch": 0.11965811965811966, "grad_norm": 0.06381001518888112, "kl": 0.09112548828125, "learning_rate": 2e-05, "loss": -0.006, "num_tokens": 19854767.0, "reward": 8.135985374450684, "reward_std": 4.6707611083984375, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9406249523162842, "rewards/successful_no_deal_reward_func": 0.515625, "rewards/successful_parsing_reward_func": 0.8900917768478394, "rewards/utility_reward_func": 0.23124295473098755, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9240.0, "completions/max_terminated_length": 9240.0, "completions/mean_length": 5740.6875, "completions/mean_terminated_length": 5740.6875, "completions/min_length": 2702.0, "completions/min_terminated_length": 2702.0, "epoch": 0.12179487179487179, "grad_norm": 0.04339549040283508, "kl": 0.100372314453125, "learning_rate": 2e-05, "loss": 0.008, "num_tokens": 20191894.0, "reward": 8.311476707458496, "reward_std": 4.060470104217529, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9272196888923645, "rewards/successful_no_deal_reward_func": 0.421875, "rewards/successful_parsing_reward_func": 0.8717137575149536, "rewards/utility_reward_func": 0.24032360315322876, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 11047.0, "completions/max_terminated_length": 11047.0, "completions/mean_length": 6486.1875, "completions/mean_terminated_length": 6918.6, "completions/min_length": 0.0, "completions/min_terminated_length": 2191.0, "epoch": 0.12393162393162394, "grad_norm": 0.047643632225391426, "kl": 0.0866241455078125, "learning_rate": 2e-05, "loss": -0.0208, "num_tokens": 20580540.0, "reward": 9.361209869384766, "reward_std": 4.704495906829834, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9267857074737549, "rewards/successful_no_deal_reward_func": 0.421875, "rewards/successful_parsing_reward_func": 0.8713778257369995, "rewards/utility_reward_func": 0.2753172516822815, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 9325.0, "completions/max_terminated_length": 9325.0, "completions/mean_length": 5433.3125, "completions/mean_terminated_length": 5795.533333333334, "completions/min_length": 0.0, "completions/min_terminated_length": 3548.0, "epoch": 0.12606837606837606, "grad_norm": 0.06527076242487305, "kl": 0.100128173828125, "learning_rate": 2e-05, "loss": -0.0595, "num_tokens": 20929760.0, "reward": 9.511280059814453, "reward_std": 2.9424219131469727, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9066296219825745, "rewards/successful_no_deal_reward_func": 0.453125, "rewards/successful_parsing_reward_func": 0.8526886701583862, "rewards/utility_reward_func": 0.27940744161605835, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 9701.0, "completions/max_terminated_length": 9701.0, "completions/mean_length": 5489.21875, "completions/mean_terminated_length": 6057.068965517241, "completions/min_length": 0.0, "completions/min_terminated_length": 3070.0, "epoch": 0.1282051282051282, "grad_norm": 0.06022558654926435, "kl": 0.087799072265625, "learning_rate": 2e-05, "loss": -0.0458, "num_tokens": 21305760.0, "reward": 8.836292266845703, "reward_std": 4.981121063232422, "rewards/constraint_satisfaction_reward_func": 1.0, "rewards/successful_code_execution_reward_func": 0.9498511552810669, "rewards/successful_no_deal_reward_func": 0.390625, "rewards/successful_parsing_reward_func": 0.8065916299819946, "rewards/utility_reward_func": 0.2590007781982422, "step": 60 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 21305760, "num_train_epochs": 1, "save_steps": 15, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }