{ "best_metric": 1.8457005023956299, "best_model_checkpoint": "/home/co-guru1/rds/hpc-work/LowLR-Qwen2.5-7B-1M-SFT-LongStory/checkpoint-70", "epoch": 9.892430278884461, "eval_steps": 1, "global_step": 70, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.035127639770508, "eval_mean_token_accuracy": 0.5458261826891958, "eval_runtime": 76.7545, "eval_samples_per_second": 2.111, "eval_steps_per_second": 1.055, "step": 0 }, { "epoch": 0.12749003984063745, "grad_norm": 0.07416678220033646, "learning_rate": 7.142857142857142e-08, "loss": 2.0719, "mean_token_accuracy": 0.5373116647824645, "step": 1 }, { "epoch": 0.12749003984063745, "eval_loss": 2.035127639770508, "eval_mean_token_accuracy": 0.5458261826891958, "eval_runtime": 74.1634, "eval_samples_per_second": 2.184, "eval_steps_per_second": 1.092, "step": 1 }, { "epoch": 0.2549800796812749, "grad_norm": 0.07404134422540665, "learning_rate": 1.4285714285714285e-07, "loss": 2.0533, "mean_token_accuracy": 0.5427800826728344, "step": 2 }, { "epoch": 0.2549800796812749, "eval_loss": 2.0350606441497803, "eval_mean_token_accuracy": 0.5461185964537255, "eval_runtime": 73.5204, "eval_samples_per_second": 2.203, "eval_steps_per_second": 1.102, "step": 2 }, { "epoch": 0.38247011952191234, "grad_norm": 0.07219956815242767, "learning_rate": 2.1428571428571426e-07, "loss": 2.0689, "mean_token_accuracy": 0.5374418077990413, "step": 3 }, { "epoch": 0.38247011952191234, "eval_loss": 2.0350587368011475, "eval_mean_token_accuracy": 0.5459356370531483, "eval_runtime": 73.6504, "eval_samples_per_second": 2.2, "eval_steps_per_second": 1.1, "step": 3 }, { "epoch": 0.5099601593625498, "grad_norm": 0.07335151731967926, "learning_rate": 2.857142857142857e-07, "loss": 2.033, "mean_token_accuracy": 0.5425024246796966, "step": 4 }, { "epoch": 0.5099601593625498, "eval_loss": 2.034891128540039, "eval_mean_token_accuracy": 0.5458230953893544, "eval_runtime": 73.5173, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 4 }, { "epoch": 0.6374501992031872, "grad_norm": 0.07420691847801208, "learning_rate": 3.5714285714285716e-07, "loss": 2.0834, "mean_token_accuracy": 0.5379714276641607, "step": 5 }, { "epoch": 0.6374501992031872, "eval_loss": 2.034177780151367, "eval_mean_token_accuracy": 0.545881042877833, "eval_runtime": 73.6853, "eval_samples_per_second": 2.199, "eval_steps_per_second": 1.099, "step": 5 }, { "epoch": 0.7649402390438247, "grad_norm": 0.0740814208984375, "learning_rate": 4.285714285714285e-07, "loss": 2.1092, "mean_token_accuracy": 0.5330054322257638, "step": 6 }, { "epoch": 0.7649402390438247, "eval_loss": 2.0311665534973145, "eval_mean_token_accuracy": 0.5464062517807807, "eval_runtime": 73.5144, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 6 }, { "epoch": 0.8924302788844621, "grad_norm": 0.07146468013525009, "learning_rate": 5e-07, "loss": 2.0672, "mean_token_accuracy": 0.5403047879226506, "step": 7 }, { "epoch": 0.8924302788844621, "eval_loss": 2.0302398204803467, "eval_mean_token_accuracy": 0.5466494666941372, "eval_runtime": 73.5075, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 7 }, { "epoch": 1.1274900398406373, "grad_norm": 0.09253820776939392, "learning_rate": 4.996892303047305e-07, "loss": 4.1859, "mean_token_accuracy": 0.5366367085112466, "step": 8 }, { "epoch": 1.1274900398406373, "eval_loss": 2.02187180519104, "eval_mean_token_accuracy": 0.5480462982330794, "eval_runtime": 73.4698, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 8 }, { "epoch": 1.254980079681275, "grad_norm": 0.06704003363847733, "learning_rate": 4.987576938413504e-07, "loss": 2.0616, "mean_token_accuracy": 0.5415953188203275, "step": 9 }, { "epoch": 1.254980079681275, "eval_loss": 2.018803119659424, "eval_mean_token_accuracy": 0.5481804793263659, "eval_runtime": 76.9062, "eval_samples_per_second": 2.106, "eval_steps_per_second": 1.053, "step": 9 }, { "epoch": 1.3824701195219125, "grad_norm": 0.06263462454080582, "learning_rate": 4.972077065562821e-07, "loss": 2.0686, "mean_token_accuracy": 0.5379139776341617, "step": 10 }, { "epoch": 1.3824701195219125, "eval_loss": 2.0164647102355957, "eval_mean_token_accuracy": 0.5484072577070307, "eval_runtime": 73.4726, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 10 }, { "epoch": 1.5099601593625498, "grad_norm": 0.06307435035705566, "learning_rate": 4.950431219621359e-07, "loss": 2.0461, "mean_token_accuracy": 0.5415830966085196, "step": 11 }, { "epoch": 1.5099601593625498, "eval_loss": 2.015308141708374, "eval_mean_token_accuracy": 0.5487934564366753, "eval_runtime": 73.593, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.101, "step": 11 }, { "epoch": 1.6374501992031871, "grad_norm": 0.05914280563592911, "learning_rate": 4.922693215572695e-07, "loss": 2.0466, "mean_token_accuracy": 0.5427031978033483, "step": 12 }, { "epoch": 1.6374501992031871, "eval_loss": 1.9927903413772583, "eval_mean_token_accuracy": 0.5514989881603806, "eval_runtime": 73.4718, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 12 }, { "epoch": 1.7649402390438247, "grad_norm": 0.048744626343250275, "learning_rate": 4.888932014465352e-07, "loss": 2.0442, "mean_token_accuracy": 0.5398744908161461, "step": 13 }, { "epoch": 1.7649402390438247, "eval_loss": 1.986499547958374, "eval_mean_token_accuracy": 0.5522575499834838, "eval_runtime": 73.5034, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 13 }, { "epoch": 1.8924302788844622, "grad_norm": 0.04861805588006973, "learning_rate": 4.849231551964771e-07, "loss": 2.0421, "mean_token_accuracy": 0.5407598949968815, "step": 14 }, { "epoch": 1.8924302788844622, "eval_loss": 1.984315037727356, "eval_mean_token_accuracy": 0.5522749854458703, "eval_runtime": 73.4806, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 14 }, { "epoch": 2.1274900398406373, "grad_norm": 0.05290145054459572, "learning_rate": 4.803690529676019e-07, "loss": 3.9856, "mean_token_accuracy": 0.5472545198820256, "step": 15 }, { "epoch": 2.1274900398406373, "eval_loss": 1.9803493022918701, "eval_mean_token_accuracy": 0.552716988472291, "eval_runtime": 73.8712, "eval_samples_per_second": 2.193, "eval_steps_per_second": 1.097, "step": 15 }, { "epoch": 2.2549800796812747, "grad_norm": 0.04447222873568535, "learning_rate": 4.752422169756047e-07, "loss": 2.0272, "mean_token_accuracy": 0.5439375299029052, "step": 16 }, { "epoch": 2.2549800796812747, "eval_loss": 1.9769712686538696, "eval_mean_token_accuracy": 0.5530401924510061, "eval_runtime": 73.4892, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 16 }, { "epoch": 2.3824701195219125, "grad_norm": 0.04153960198163986, "learning_rate": 4.695553933425571e-07, "loss": 2.0168, "mean_token_accuracy": 0.5457877600565553, "step": 17 }, { "epoch": 2.3824701195219125, "eval_loss": 1.9759660959243774, "eval_mean_token_accuracy": 0.553164764686867, "eval_runtime": 73.5981, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.101, "step": 17 }, { "epoch": 2.50996015936255, "grad_norm": 0.04174751043319702, "learning_rate": 4.6332272040803887e-07, "loss": 2.0161, "mean_token_accuracy": 0.5440750285051763, "step": 18 }, { "epoch": 2.50996015936255, "eval_loss": 1.9742294549942017, "eval_mean_token_accuracy": 0.553374731246336, "eval_runtime": 73.5155, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 18 }, { "epoch": 2.637450199203187, "grad_norm": 0.04268491640686989, "learning_rate": 4.565596935789987e-07, "loss": 2.0268, "mean_token_accuracy": 0.5423221732489765, "step": 19 }, { "epoch": 2.637450199203187, "eval_loss": 1.9633668661117554, "eval_mean_token_accuracy": 0.5546030968795588, "eval_runtime": 73.5997, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.101, "step": 19 }, { "epoch": 2.764940239043825, "grad_norm": 0.03843478113412857, "learning_rate": 4.492831268057306e-07, "loss": 2.005, "mean_token_accuracy": 0.5442297370173037, "step": 20 }, { "epoch": 2.764940239043825, "eval_loss": 1.9385652542114258, "eval_mean_token_accuracy": 0.5579991222899637, "eval_runtime": 73.4712, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 20 }, { "epoch": 2.8924302788844622, "grad_norm": 0.029193470254540443, "learning_rate": 4.415111107797445e-07, "loss": 1.9772, "mean_token_accuracy": 0.5513248736970127, "step": 21 }, { "epoch": 2.8924302788844622, "eval_loss": 1.9281940460205078, "eval_mean_token_accuracy": 0.5597583088609908, "eval_runtime": 73.4871, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 21 }, { "epoch": 3.1274900398406373, "grad_norm": 0.025040559470653534, "learning_rate": 4.332629679574565e-07, "loss": 3.9529, "mean_token_accuracy": 0.5513296248736205, "step": 22 }, { "epoch": 3.1274900398406373, "eval_loss": 1.923688292503357, "eval_mean_token_accuracy": 0.5601817553426013, "eval_runtime": 73.4846, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 22 }, { "epoch": 3.2549800796812747, "grad_norm": 0.02398320473730564, "learning_rate": 4.2455920452151814e-07, "loss": 1.9894, "mean_token_accuracy": 0.5493731172755361, "step": 23 }, { "epoch": 3.2549800796812747, "eval_loss": 1.922356128692627, "eval_mean_token_accuracy": 0.5604678300427802, "eval_runtime": 73.5182, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 23 }, { "epoch": 3.3824701195219125, "grad_norm": 0.023711789399385452, "learning_rate": 4.154214593992149e-07, "loss": 1.9463, "mean_token_accuracy": 0.5566007480956614, "step": 24 }, { "epoch": 3.3824701195219125, "eval_loss": 1.921160101890564, "eval_mean_token_accuracy": 0.5606214830904831, "eval_runtime": 73.5054, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 24 }, { "epoch": 3.50996015936255, "grad_norm": 0.023766979575157166, "learning_rate": 4.058724504646834e-07, "loss": 1.9467, "mean_token_accuracy": 0.5527740670368075, "step": 25 }, { "epoch": 3.50996015936255, "eval_loss": 1.9200737476348877, "eval_mean_token_accuracy": 0.5609120343938286, "eval_runtime": 73.4838, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 25 }, { "epoch": 3.637450199203187, "grad_norm": 0.02387434057891369, "learning_rate": 3.959359180586975e-07, "loss": 1.9272, "mean_token_accuracy": 0.5584028149023652, "step": 26 }, { "epoch": 3.637450199203187, "eval_loss": 1.918907642364502, "eval_mean_token_accuracy": 0.5609360878114347, "eval_runtime": 73.4895, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 26 }, { "epoch": 3.764940239043825, "grad_norm": 0.023042624816298485, "learning_rate": 3.8563656596643985e-07, "loss": 1.9747, "mean_token_accuracy": 0.5482216733507812, "step": 27 }, { "epoch": 3.764940239043825, "eval_loss": 1.9176579713821411, "eval_mean_token_accuracy": 0.5610209037492304, "eval_runtime": 73.6473, "eval_samples_per_second": 2.2, "eval_steps_per_second": 1.1, "step": 27 }, { "epoch": 3.8924302788844622, "grad_norm": 0.023782329633831978, "learning_rate": 3.75e-07, "loss": 1.9593, "mean_token_accuracy": 0.5541580212302506, "step": 28 }, { "epoch": 3.8924302788844622, "eval_loss": 1.9162002801895142, "eval_mean_token_accuracy": 0.5613347873275663, "eval_runtime": 73.4764, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 28 }, { "epoch": 4.127490039840637, "grad_norm": 0.02967219427227974, "learning_rate": 3.6405266433829073e-07, "loss": 3.9311, "mean_token_accuracy": 0.5519657228831891, "step": 29 }, { "epoch": 4.127490039840637, "eval_loss": 1.9146673679351807, "eval_mean_token_accuracy": 0.56137576294534, "eval_runtime": 73.5335, "eval_samples_per_second": 2.203, "eval_steps_per_second": 1.102, "step": 29 }, { "epoch": 4.254980079681275, "grad_norm": 0.02214796096086502, "learning_rate": 3.528217757826529e-07, "loss": 1.9819, "mean_token_accuracy": 0.5509966965764761, "step": 30 }, { "epoch": 4.254980079681275, "eval_loss": 1.9132657051086426, "eval_mean_token_accuracy": 0.5618137069690374, "eval_runtime": 73.5205, "eval_samples_per_second": 2.203, "eval_steps_per_second": 1.102, "step": 30 }, { "epoch": 4.382470119521912, "grad_norm": 0.02208121307194233, "learning_rate": 3.413352560915988e-07, "loss": 1.9336, "mean_token_accuracy": 0.5579158738255501, "step": 31 }, { "epoch": 4.382470119521912, "eval_loss": 1.9118901491165161, "eval_mean_token_accuracy": 0.561793370379342, "eval_runtime": 73.6018, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.101, "step": 31 }, { "epoch": 4.509960159362549, "grad_norm": 0.02247035503387451, "learning_rate": 3.296216625629211e-07, "loss": 1.9772, "mean_token_accuracy": 0.5505059747956693, "step": 32 }, { "epoch": 4.509960159362549, "eval_loss": 1.910597801208496, "eval_mean_token_accuracy": 0.562171799901091, "eval_runtime": 73.4852, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 32 }, { "epoch": 4.637450199203188, "grad_norm": 0.022233208641409874, "learning_rate": 3.177101170357513e-07, "loss": 1.909, "mean_token_accuracy": 0.5625110901892185, "step": 33 }, { "epoch": 4.637450199203188, "eval_loss": 1.9094303846359253, "eval_mean_token_accuracy": 0.5623352579128595, "eval_runtime": 73.5903, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.101, "step": 33 }, { "epoch": 4.764940239043825, "grad_norm": 0.02191471867263317, "learning_rate": 3.056302334890786e-07, "loss": 1.926, "mean_token_accuracy": 0.5563776316121221, "step": 34 }, { "epoch": 4.764940239043825, "eval_loss": 1.9082125425338745, "eval_mean_token_accuracy": 0.5626369957570676, "eval_runtime": 73.5119, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 34 }, { "epoch": 4.892430278884462, "grad_norm": 0.02139287069439888, "learning_rate": 2.934120444167326e-07, "loss": 1.9414, "mean_token_accuracy": 0.5555704743601382, "step": 35 }, { "epoch": 4.892430278884462, "eval_loss": 1.9068336486816406, "eval_mean_token_accuracy": 0.5629910878193232, "eval_runtime": 73.6059, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.1, "step": 35 }, { "epoch": 5.127490039840637, "grad_norm": 0.025543633848428726, "learning_rate": 2.810859261618713e-07, "loss": 3.8919, "mean_token_accuracy": 0.5556861454689944, "step": 36 }, { "epoch": 5.127490039840637, "eval_loss": 1.9028369188308716, "eval_mean_token_accuracy": 0.56360407486374, "eval_runtime": 73.4926, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 36 }, { "epoch": 5.254980079681275, "grad_norm": 0.020656954497098923, "learning_rate": 2.6868252339660607e-07, "loss": 1.9414, "mean_token_accuracy": 0.5566462711431086, "step": 37 }, { "epoch": 5.254980079681275, "eval_loss": 1.9000654220581055, "eval_mean_token_accuracy": 0.5638575491345958, "eval_runtime": 73.6267, "eval_samples_per_second": 2.2, "eval_steps_per_second": 1.1, "step": 37 }, { "epoch": 5.382470119521912, "grad_norm": 0.020553426817059517, "learning_rate": 2.5623267293451823e-07, "loss": 1.9282, "mean_token_accuracy": 0.5593453152105212, "step": 38 }, { "epoch": 5.382470119521912, "eval_loss": 1.8970355987548828, "eval_mean_token_accuracy": 0.5647362948935709, "eval_runtime": 73.4901, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 38 }, { "epoch": 5.509960159362549, "grad_norm": 0.019633032381534576, "learning_rate": 2.437673270654818e-07, "loss": 1.9395, "mean_token_accuracy": 0.557382661383599, "step": 39 }, { "epoch": 5.509960159362549, "eval_loss": 1.8939744234085083, "eval_mean_token_accuracy": 0.5657094058431225, "eval_runtime": 75.4531, "eval_samples_per_second": 2.147, "eval_steps_per_second": 1.074, "step": 39 }, { "epoch": 5.637450199203188, "grad_norm": 0.01931442692875862, "learning_rate": 2.3131747660339394e-07, "loss": 1.9598, "mean_token_accuracy": 0.5539739523082972, "step": 40 }, { "epoch": 5.637450199203188, "eval_loss": 1.8909591436386108, "eval_mean_token_accuracy": 0.565919687718521, "eval_runtime": 73.502, "eval_samples_per_second": 2.204, "eval_steps_per_second": 1.102, "step": 40 }, { "epoch": 5.764940239043825, "grad_norm": 0.01860933192074299, "learning_rate": 2.1891407383812878e-07, "loss": 1.9376, "mean_token_accuracy": 0.5563419912941754, "step": 41 }, { "epoch": 5.764940239043825, "eval_loss": 1.8880671262741089, "eval_mean_token_accuracy": 0.5663069490297341, "eval_runtime": 73.4713, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 41 }, { "epoch": 5.892430278884462, "grad_norm": 0.018627656623721123, "learning_rate": 2.065879555832674e-07, "loss": 1.9507, "mean_token_accuracy": 0.5557947768829763, "step": 42 }, { "epoch": 5.892430278884462, "eval_loss": 1.8855071067810059, "eval_mean_token_accuracy": 0.5669161636888245, "eval_runtime": 73.6398, "eval_samples_per_second": 2.2, "eval_steps_per_second": 1.1, "step": 42 }, { "epoch": 6.127490039840637, "grad_norm": 0.018258944153785706, "learning_rate": 1.9436976651092142e-07, "loss": 3.7981, "mean_token_accuracy": 0.5612916554565783, "step": 43 }, { "epoch": 6.127490039840637, "eval_loss": 1.8808256387710571, "eval_mean_token_accuracy": 0.5676413597884001, "eval_runtime": 73.474, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 43 }, { "epoch": 6.254980079681275, "grad_norm": 0.017290310934185982, "learning_rate": 1.8228988296424875e-07, "loss": 1.9237, "mean_token_accuracy": 0.5610313005745411, "step": 44 }, { "epoch": 6.254980079681275, "eval_loss": 1.878740668296814, "eval_mean_token_accuracy": 0.568008154630661, "eval_runtime": 74.386, "eval_samples_per_second": 2.178, "eval_steps_per_second": 1.089, "step": 44 }, { "epoch": 6.382470119521912, "grad_norm": 0.016761859878897667, "learning_rate": 1.7037833743707892e-07, "loss": 1.9369, "mean_token_accuracy": 0.5580867524258792, "step": 45 }, { "epoch": 6.382470119521912, "eval_loss": 1.876879334449768, "eval_mean_token_accuracy": 0.5681438997939781, "eval_runtime": 73.6051, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.1, "step": 45 }, { "epoch": 6.509960159362549, "grad_norm": 0.016953120008111, "learning_rate": 1.5866474390840124e-07, "loss": 1.928, "mean_token_accuracy": 0.5594328213483095, "step": 46 }, { "epoch": 6.509960159362549, "eval_loss": 1.875379204750061, "eval_mean_token_accuracy": 0.568663157062766, "eval_runtime": 73.4833, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 46 }, { "epoch": 6.637450199203188, "grad_norm": 0.01671259105205536, "learning_rate": 1.4717822421734716e-07, "loss": 1.9103, "mean_token_accuracy": 0.5598705592565238, "step": 47 }, { "epoch": 6.637450199203188, "eval_loss": 1.8738114833831787, "eval_mean_token_accuracy": 0.5690526322082237, "eval_runtime": 73.6586, "eval_samples_per_second": 2.199, "eval_steps_per_second": 1.1, "step": 47 }, { "epoch": 6.764940239043825, "grad_norm": 0.016694391146302223, "learning_rate": 1.3594733566170925e-07, "loss": 1.9045, "mean_token_accuracy": 0.5642699301242828, "step": 48 }, { "epoch": 6.764940239043825, "eval_loss": 1.8724164962768555, "eval_mean_token_accuracy": 0.5691916114754147, "eval_runtime": 73.4747, "eval_samples_per_second": 2.205, "eval_steps_per_second": 1.102, "step": 48 }, { "epoch": 6.764940239043825, "eval_loss": 1.8724164962768555, "eval_mean_token_accuracy": 0.5691916114754147, "eval_runtime": 76.7914, "eval_samples_per_second": 2.11, "eval_steps_per_second": 1.055, "step": 48 }, { "epoch": 6.892430278884462, "grad_norm": 0.01694813370704651, "learning_rate": 1.2500000000000005e-07, "loss": 1.9085, "mean_token_accuracy": 0.5642137033864856, "step": 49 }, { "epoch": 6.892430278884462, "eval_loss": 1.8711612224578857, "eval_mean_token_accuracy": 0.5693035935178216, "eval_runtime": 74.6515, "eval_samples_per_second": 2.17, "eval_steps_per_second": 1.085, "step": 49 }, { "epoch": 7.127490039840637, "grad_norm": 0.019315047189593315, "learning_rate": 1.1436343403356016e-07, "loss": 3.8369, "mean_token_accuracy": 0.5600919233540357, "step": 50 }, { "epoch": 7.127490039840637, "eval_loss": 1.8696496486663818, "eval_mean_token_accuracy": 0.5695443061398872, "eval_runtime": 74.8597, "eval_samples_per_second": 2.164, "eval_steps_per_second": 1.082, "step": 50 }, { "epoch": 7.254980079681275, "grad_norm": 0.01633988693356514, "learning_rate": 1.0406408194130259e-07, "loss": 1.925, "mean_token_accuracy": 0.5624035471118987, "step": 51 }, { "epoch": 7.254980079681275, "eval_loss": 1.868504524230957, "eval_mean_token_accuracy": 0.5697018382725892, "eval_runtime": 73.6447, "eval_samples_per_second": 2.2, "eval_steps_per_second": 1.1, "step": 51 }, { "epoch": 7.382470119521912, "grad_norm": 0.016421355307102203, "learning_rate": 9.412754953531663e-08, "loss": 1.8986, "mean_token_accuracy": 0.5638773068785667, "step": 52 }, { "epoch": 7.382470119521912, "eval_loss": 1.8671343326568604, "eval_mean_token_accuracy": 0.569956491390864, "eval_runtime": 73.5624, "eval_samples_per_second": 2.202, "eval_steps_per_second": 1.101, "step": 52 }, { "epoch": 7.509960159362549, "grad_norm": 0.01661074347794056, "learning_rate": 8.45785406007852e-08, "loss": 1.9191, "mean_token_accuracy": 0.5608826824463904, "step": 53 }, { "epoch": 7.509960159362549, "eval_loss": 1.8656820058822632, "eval_mean_token_accuracy": 0.570398888838144, "eval_runtime": 73.6875, "eval_samples_per_second": 2.198, "eval_steps_per_second": 1.099, "step": 53 }, { "epoch": 7.637450199203188, "grad_norm": 0.015864234417676926, "learning_rate": 7.544079547848181e-08, "loss": 1.8794, "mean_token_accuracy": 0.5663872314617038, "step": 54 }, { "epoch": 7.637450199203188, "eval_loss": 1.864436149597168, "eval_mean_token_accuracy": 0.5704377036035797, "eval_runtime": 73.6017, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.101, "step": 54 }, { "epoch": 7.764940239043825, "grad_norm": 0.015328350476920605, "learning_rate": 6.673703204254347e-08, "loss": 1.8987, "mean_token_accuracy": 0.5637467112392187, "step": 55 }, { "epoch": 7.764940239043825, "eval_loss": 1.863165259361267, "eval_mean_token_accuracy": 0.5710033450597598, "eval_runtime": 73.6975, "eval_samples_per_second": 2.198, "eval_steps_per_second": 1.099, "step": 55 }, { "epoch": 7.892430278884462, "grad_norm": 0.015002720057964325, "learning_rate": 5.848888922025552e-08, "loss": 1.9076, "mean_token_accuracy": 0.5629196567460895, "step": 56 }, { "epoch": 7.892430278884462, "eval_loss": 1.8618277311325073, "eval_mean_token_accuracy": 0.5706999463799559, "eval_runtime": 73.5823, "eval_samples_per_second": 2.202, "eval_steps_per_second": 1.101, "step": 56 }, { "epoch": 8.127490039840637, "grad_norm": 0.018797775730490685, "learning_rate": 5.0716873194269454e-08, "loss": 3.7573, "mean_token_accuracy": 0.5670389268133376, "step": 57 }, { "epoch": 8.127490039840637, "eval_loss": 1.8594869375228882, "eval_mean_token_accuracy": 0.5713589662387047, "eval_runtime": 73.7445, "eval_samples_per_second": 2.197, "eval_steps_per_second": 1.098, "step": 57 }, { "epoch": 8.254980079681275, "grad_norm": 0.01507323607802391, "learning_rate": 4.3440306421001324e-08, "loss": 1.9055, "mean_token_accuracy": 0.5629012044519186, "step": 58 }, { "epoch": 8.254980079681275, "eval_loss": 1.8582857847213745, "eval_mean_token_accuracy": 0.5716390271245697, "eval_runtime": 73.5611, "eval_samples_per_second": 2.202, "eval_steps_per_second": 1.101, "step": 58 }, { "epoch": 8.382470119521912, "grad_norm": 0.01485736295580864, "learning_rate": 3.6677279591961096e-08, "loss": 1.9169, "mean_token_accuracy": 0.5630733277648687, "step": 59 }, { "epoch": 8.382470119521912, "eval_loss": 1.8571231365203857, "eval_mean_token_accuracy": 0.5717564843319081, "eval_runtime": 73.6663, "eval_samples_per_second": 2.199, "eval_steps_per_second": 1.1, "step": 59 }, { "epoch": 8.50996015936255, "grad_norm": 0.014736738055944443, "learning_rate": 3.044460665744283e-08, "loss": 1.9231, "mean_token_accuracy": 0.5624960637651384, "step": 60 }, { "epoch": 8.50996015936255, "eval_loss": 1.856133222579956, "eval_mean_token_accuracy": 0.5721180277106203, "eval_runtime": 73.5844, "eval_samples_per_second": 2.202, "eval_steps_per_second": 1.101, "step": 60 }, { "epoch": 8.637450199203187, "grad_norm": 0.014911642298102379, "learning_rate": 2.475778302439524e-08, "loss": 1.884, "mean_token_accuracy": 0.5678986022248864, "step": 61 }, { "epoch": 8.637450199203187, "eval_loss": 1.8549680709838867, "eval_mean_token_accuracy": 0.572380311695146, "eval_runtime": 73.7876, "eval_samples_per_second": 2.195, "eval_steps_per_second": 1.098, "step": 61 }, { "epoch": 8.764940239043824, "grad_norm": 0.014645237475633621, "learning_rate": 1.9630947032398066e-08, "loss": 1.8677, "mean_token_accuracy": 0.5674805343151093, "step": 62 }, { "epoch": 8.764940239043824, "eval_loss": 1.853942632675171, "eval_mean_token_accuracy": 0.5725214223802826, "eval_runtime": 73.6069, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.1, "step": 62 }, { "epoch": 8.892430278884461, "grad_norm": 0.014571174047887325, "learning_rate": 1.507684480352292e-08, "loss": 1.874, "mean_token_accuracy": 0.5680479519069195, "step": 63 }, { "epoch": 8.892430278884461, "eval_loss": 1.8529284000396729, "eval_mean_token_accuracy": 0.5726871490478516, "eval_runtime": 73.6953, "eval_samples_per_second": 2.198, "eval_steps_per_second": 1.099, "step": 63 }, { "epoch": 9.127490039840637, "grad_norm": 0.015872426331043243, "learning_rate": 1.1106798553464802e-08, "loss": 3.7625, "mean_token_accuracy": 0.5676577979767764, "step": 64 }, { "epoch": 9.127490039840637, "eval_loss": 1.8509719371795654, "eval_mean_token_accuracy": 0.5735157551588835, "eval_runtime": 74.4195, "eval_samples_per_second": 2.177, "eval_steps_per_second": 1.088, "step": 64 }, { "epoch": 9.254980079681275, "grad_norm": 0.014235267415642738, "learning_rate": 7.730678442730537e-09, "loss": 1.8657, "mean_token_accuracy": 0.5699905268847942, "step": 65 }, { "epoch": 9.254980079681275, "eval_loss": 1.8500498533248901, "eval_mean_token_accuracy": 0.5735325290832991, "eval_runtime": 73.6643, "eval_samples_per_second": 2.199, "eval_steps_per_second": 1.1, "step": 65 }, { "epoch": 9.382470119521912, "grad_norm": 0.013952240347862244, "learning_rate": 4.956878037864043e-09, "loss": 1.8762, "mean_token_accuracy": 0.5678728865459561, "step": 66 }, { "epoch": 9.382470119521912, "eval_loss": 1.8491297960281372, "eval_mean_token_accuracy": 0.5740439523885279, "eval_runtime": 73.7795, "eval_samples_per_second": 2.196, "eval_steps_per_second": 1.098, "step": 66 }, { "epoch": 9.50996015936255, "grad_norm": 0.014303016476333141, "learning_rate": 2.7922934437178692e-09, "loss": 1.8849, "mean_token_accuracy": 0.568179058842361, "step": 67 }, { "epoch": 9.50996015936255, "eval_loss": 1.8481377363204956, "eval_mean_token_accuracy": 0.5739904796635663, "eval_runtime": 73.6978, "eval_samples_per_second": 2.198, "eval_steps_per_second": 1.099, "step": 67 }, { "epoch": 9.637450199203187, "grad_norm": 0.013575814664363861, "learning_rate": 1.2423061586496476e-09, "loss": 1.8894, "mean_token_accuracy": 0.5682620890438557, "step": 68 }, { "epoch": 9.637450199203187, "eval_loss": 1.8472508192062378, "eval_mean_token_accuracy": 0.5742849891568408, "eval_runtime": 73.587, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.101, "step": 68 }, { "epoch": 9.764940239043824, "grad_norm": 0.013703697361052036, "learning_rate": 3.107696952694139e-10, "loss": 1.8847, "mean_token_accuracy": 0.5685957716777921, "step": 69 }, { "epoch": 9.764940239043824, "eval_loss": 1.8465715646743774, "eval_mean_token_accuracy": 0.5745357466332707, "eval_runtime": 73.8221, "eval_samples_per_second": 2.194, "eval_steps_per_second": 1.097, "step": 69 }, { "epoch": 9.892430278884461, "grad_norm": 0.013962024822831154, "learning_rate": 0.0, "loss": 1.9447, "mean_token_accuracy": 0.5595274027436972, "step": 70 }, { "epoch": 9.892430278884461, "eval_loss": 1.8457005023956299, "eval_mean_token_accuracy": 0.574767052391429, "eval_runtime": 73.5878, "eval_samples_per_second": 2.201, "eval_steps_per_second": 1.101, "step": 70 }, { "epoch": 9.892430278884461, "eval_loss": 1.8457005023956299, "eval_mean_token_accuracy": 0.574767052391429, "eval_runtime": 76.109, "eval_samples_per_second": 2.129, "eval_steps_per_second": 1.064, "step": 70 }, { "epoch": 9.892430278884461, "step": 70, "total_flos": 3.296975316278313e+18, "train_loss": 0.0, "train_runtime": 210.1362, "train_samples_per_second": 47.779, "train_steps_per_second": 0.333 } ], "logging_steps": 1, "max_steps": 70, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.296975316278313e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }