{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05864242779651078, "eval_steps": 400, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00014660606949127694, "grad_norm": 3.455883741378784, "learning_rate": 7.5e-05, "loss": 3.9321, "step": 1 }, { "epoch": 0.00029321213898255387, "grad_norm": 2.6298787593841553, "learning_rate": 7.5e-05, "loss": 3.4287, "step": 2 }, { "epoch": 0.00043981820847383083, "grad_norm": 1.4404480457305908, "learning_rate": 7.5e-05, "loss": 3.2257, "step": 3 }, { "epoch": 0.0005864242779651077, "grad_norm": 2.2678844928741455, "learning_rate": 7.5e-05, "loss": 3.3636, "step": 4 }, { "epoch": 0.0007330303474563847, "grad_norm": 2.5889241695404053, "learning_rate": 7.5e-05, "loss": 2.9323, "step": 5 }, { "epoch": 0.0008796364169476617, "grad_norm": 1.2149893045425415, "learning_rate": 7.5e-05, "loss": 2.8705, "step": 6 }, { "epoch": 0.0010262424864389385, "grad_norm": 0.9146172404289246, "learning_rate": 7.5e-05, "loss": 2.6756, "step": 7 }, { "epoch": 0.0011728485559302155, "grad_norm": 1.3598023653030396, "learning_rate": 7.5e-05, "loss": 2.8735, "step": 8 }, { "epoch": 0.0013194546254214924, "grad_norm": 1.3088024854660034, "learning_rate": 7.5e-05, "loss": 2.7283, "step": 9 }, { "epoch": 0.0014660606949127694, "grad_norm": 19.006887435913086, "learning_rate": 7.5e-05, "loss": 2.6417, "step": 10 }, { "epoch": 0.0016126667644040464, "grad_norm": 1.5915838479995728, "learning_rate": 7.5e-05, "loss": 2.7931, "step": 11 }, { "epoch": 0.0017592728338953233, "grad_norm": 4.185395240783691, "learning_rate": 7.5e-05, "loss": 2.563, "step": 12 }, { "epoch": 0.0019058789033866003, "grad_norm": 0.9938499927520752, "learning_rate": 7.5e-05, "loss": 2.7372, "step": 13 }, { "epoch": 0.002052484972877877, "grad_norm": 0.8031460046768188, "learning_rate": 7.5e-05, "loss": 2.6044, "step": 14 }, { "epoch": 0.002199091042369154, "grad_norm": 0.7620795965194702, "learning_rate": 7.5e-05, "loss": 2.5687, "step": 15 }, { "epoch": 0.002345697111860431, "grad_norm": 0.9465439915657043, "learning_rate": 7.5e-05, "loss": 2.7366, "step": 16 }, { "epoch": 0.002492303181351708, "grad_norm": 0.8416175246238708, "learning_rate": 7.5e-05, "loss": 2.741, "step": 17 }, { "epoch": 0.002638909250842985, "grad_norm": 0.7236262559890747, "learning_rate": 7.5e-05, "loss": 2.4769, "step": 18 }, { "epoch": 0.002785515320334262, "grad_norm": 0.7475934624671936, "learning_rate": 7.5e-05, "loss": 2.5892, "step": 19 }, { "epoch": 0.002932121389825539, "grad_norm": 0.700516939163208, "learning_rate": 7.5e-05, "loss": 2.63, "step": 20 }, { "epoch": 0.0030787274593168158, "grad_norm": 0.693880021572113, "learning_rate": 7.5e-05, "loss": 2.4919, "step": 21 }, { "epoch": 0.0032253335288080927, "grad_norm": 0.7560032606124878, "learning_rate": 7.5e-05, "loss": 2.5155, "step": 22 }, { "epoch": 0.0033719395982993697, "grad_norm": 0.7018181085586548, "learning_rate": 7.5e-05, "loss": 2.5791, "step": 23 }, { "epoch": 0.0035185456677906467, "grad_norm": 0.8165842890739441, "learning_rate": 7.5e-05, "loss": 2.5228, "step": 24 }, { "epoch": 0.0036651517372819236, "grad_norm": 1.1841286420822144, "learning_rate": 7.5e-05, "loss": 2.3439, "step": 25 }, { "epoch": 0.0038117578067732006, "grad_norm": 0.7815418243408203, "learning_rate": 7.5e-05, "loss": 2.3244, "step": 26 }, { "epoch": 0.003958363876264477, "grad_norm": Infinity, "learning_rate": 7.5e-05, "loss": 2.3578, "step": 27 }, { "epoch": 0.004104969945755754, "grad_norm": 0.7782788276672363, "learning_rate": 7.5e-05, "loss": 2.2824, "step": 28 }, { "epoch": 0.004251576015247031, "grad_norm": 0.8368614315986633, "learning_rate": 7.5e-05, "loss": 2.5229, "step": 29 }, { "epoch": 0.004398182084738308, "grad_norm": 0.6284268498420715, "learning_rate": 7.5e-05, "loss": 2.5119, "step": 30 }, { "epoch": 0.004544788154229585, "grad_norm": 0.6348438858985901, "learning_rate": 7.5e-05, "loss": 2.3244, "step": 31 }, { "epoch": 0.004691394223720862, "grad_norm": 0.5359944105148315, "learning_rate": 7.5e-05, "loss": 2.3711, "step": 32 }, { "epoch": 0.004838000293212139, "grad_norm": 0.7504050135612488, "learning_rate": 7.5e-05, "loss": 2.2577, "step": 33 }, { "epoch": 0.004984606362703416, "grad_norm": 0.6479385495185852, "learning_rate": 7.5e-05, "loss": 2.3205, "step": 34 }, { "epoch": 0.005131212432194693, "grad_norm": 0.5695769786834717, "learning_rate": 7.5e-05, "loss": 2.3126, "step": 35 }, { "epoch": 0.00527781850168597, "grad_norm": 0.4831556975841522, "learning_rate": 7.5e-05, "loss": 2.2259, "step": 36 }, { "epoch": 0.005424424571177247, "grad_norm": 0.48921290040016174, "learning_rate": 7.5e-05, "loss": 2.3142, "step": 37 }, { "epoch": 0.005571030640668524, "grad_norm": 0.4701555669307709, "learning_rate": 7.5e-05, "loss": 2.3066, "step": 38 }, { "epoch": 0.005717636710159801, "grad_norm": 0.5667604207992554, "learning_rate": 7.5e-05, "loss": 2.2539, "step": 39 }, { "epoch": 0.005864242779651078, "grad_norm": 0.5447884798049927, "learning_rate": 7.5e-05, "loss": 2.3533, "step": 40 }, { "epoch": 0.006010848849142355, "grad_norm": 0.5081149935722351, "learning_rate": 7.5e-05, "loss": 2.3421, "step": 41 }, { "epoch": 0.0061574549186336315, "grad_norm": 0.47933027148246765, "learning_rate": 7.5e-05, "loss": 2.3791, "step": 42 }, { "epoch": 0.0063040609881249085, "grad_norm": 0.46539342403411865, "learning_rate": 7.5e-05, "loss": 2.2347, "step": 43 }, { "epoch": 0.0064506670576161855, "grad_norm": 0.6082663536071777, "learning_rate": 7.5e-05, "loss": 2.173, "step": 44 }, { "epoch": 0.006597273127107462, "grad_norm": 0.576062023639679, "learning_rate": 7.5e-05, "loss": 2.2916, "step": 45 }, { "epoch": 0.006743879196598739, "grad_norm": 0.4593515992164612, "learning_rate": 7.5e-05, "loss": 2.3169, "step": 46 }, { "epoch": 0.006890485266090016, "grad_norm": 0.4574475586414337, "learning_rate": 7.5e-05, "loss": 2.2583, "step": 47 }, { "epoch": 0.007037091335581293, "grad_norm": 0.5164448022842407, "learning_rate": 7.5e-05, "loss": 2.3349, "step": 48 }, { "epoch": 0.00718369740507257, "grad_norm": 0.4695877134799957, "learning_rate": 7.5e-05, "loss": 2.255, "step": 49 }, { "epoch": 0.007330303474563847, "grad_norm": 0.4581631124019623, "learning_rate": 7.5e-05, "loss": 2.1964, "step": 50 }, { "epoch": 0.007476909544055124, "grad_norm": 0.45299574732780457, "learning_rate": 7.5e-05, "loss": 2.4647, "step": 51 }, { "epoch": 0.007623515613546401, "grad_norm": 0.48451122641563416, "learning_rate": 7.5e-05, "loss": 2.2066, "step": 52 }, { "epoch": 0.007770121683037678, "grad_norm": 0.5913203358650208, "learning_rate": 7.5e-05, "loss": 2.0961, "step": 53 }, { "epoch": 0.007916727752528954, "grad_norm": 0.42326223850250244, "learning_rate": 7.5e-05, "loss": 2.2117, "step": 54 }, { "epoch": 0.008063333822020232, "grad_norm": 0.4510177969932556, "learning_rate": 7.5e-05, "loss": 2.3412, "step": 55 }, { "epoch": 0.008209939891511508, "grad_norm": 0.44723281264305115, "learning_rate": 7.5e-05, "loss": 2.2027, "step": 56 }, { "epoch": 0.008356545961002786, "grad_norm": 0.5632525086402893, "learning_rate": 7.5e-05, "loss": 2.2667, "step": 57 }, { "epoch": 0.008503152030494062, "grad_norm": 0.4612903594970703, "learning_rate": 7.5e-05, "loss": 2.2747, "step": 58 }, { "epoch": 0.00864975809998534, "grad_norm": 0.4696878492832184, "learning_rate": 7.5e-05, "loss": 2.135, "step": 59 }, { "epoch": 0.008796364169476616, "grad_norm": 0.4575818181037903, "learning_rate": 7.5e-05, "loss": 2.2351, "step": 60 }, { "epoch": 0.008942970238967894, "grad_norm": 0.4939045011997223, "learning_rate": 7.5e-05, "loss": 2.1285, "step": 61 }, { "epoch": 0.00908957630845917, "grad_norm": 0.46583932638168335, "learning_rate": 7.5e-05, "loss": 2.2665, "step": 62 }, { "epoch": 0.009236182377950448, "grad_norm": 0.4681698679924011, "learning_rate": 7.5e-05, "loss": 2.1546, "step": 63 }, { "epoch": 0.009382788447441724, "grad_norm": 0.48843592405319214, "learning_rate": 7.5e-05, "loss": 2.0642, "step": 64 }, { "epoch": 0.009529394516933002, "grad_norm": 0.4695863425731659, "learning_rate": 7.5e-05, "loss": 2.2058, "step": 65 }, { "epoch": 0.009676000586424278, "grad_norm": 0.44356605410575867, "learning_rate": 7.5e-05, "loss": 2.0865, "step": 66 }, { "epoch": 0.009822606655915556, "grad_norm": 0.5469496250152588, "learning_rate": 7.5e-05, "loss": 2.2392, "step": 67 }, { "epoch": 0.009969212725406832, "grad_norm": 0.5499363541603088, "learning_rate": 7.5e-05, "loss": 2.1105, "step": 68 }, { "epoch": 0.01011581879489811, "grad_norm": 0.46040916442871094, "learning_rate": 7.5e-05, "loss": 2.1991, "step": 69 }, { "epoch": 0.010262424864389386, "grad_norm": 0.4418318271636963, "learning_rate": 7.5e-05, "loss": 2.1375, "step": 70 }, { "epoch": 0.010409030933880663, "grad_norm": 0.5090643763542175, "learning_rate": 7.5e-05, "loss": 2.0043, "step": 71 }, { "epoch": 0.01055563700337194, "grad_norm": 0.4520326256752014, "learning_rate": 7.5e-05, "loss": 2.1635, "step": 72 }, { "epoch": 0.010702243072863217, "grad_norm": 0.4801967740058899, "learning_rate": 7.5e-05, "loss": 2.236, "step": 73 }, { "epoch": 0.010848849142354493, "grad_norm": 0.4344942271709442, "learning_rate": 7.5e-05, "loss": 2.3342, "step": 74 }, { "epoch": 0.01099545521184577, "grad_norm": 0.49545183777809143, "learning_rate": 7.5e-05, "loss": 2.2978, "step": 75 }, { "epoch": 0.011142061281337047, "grad_norm": 0.47338277101516724, "learning_rate": 7.5e-05, "loss": 2.1702, "step": 76 }, { "epoch": 0.011288667350828323, "grad_norm": 0.4275517463684082, "learning_rate": 7.5e-05, "loss": 2.113, "step": 77 }, { "epoch": 0.011435273420319601, "grad_norm": 0.43745824694633484, "learning_rate": 7.5e-05, "loss": 2.1104, "step": 78 }, { "epoch": 0.011581879489810877, "grad_norm": 0.45452436804771423, "learning_rate": 7.5e-05, "loss": 2.2726, "step": 79 }, { "epoch": 0.011728485559302155, "grad_norm": 0.45246171951293945, "learning_rate": 7.5e-05, "loss": 2.2536, "step": 80 }, { "epoch": 0.011875091628793431, "grad_norm": 0.4320465326309204, "learning_rate": 7.5e-05, "loss": 2.061, "step": 81 }, { "epoch": 0.01202169769828471, "grad_norm": 0.44274935126304626, "learning_rate": 7.5e-05, "loss": 2.1805, "step": 82 }, { "epoch": 0.012168303767775985, "grad_norm": 0.4519144296646118, "learning_rate": 7.5e-05, "loss": 2.203, "step": 83 }, { "epoch": 0.012314909837267263, "grad_norm": 0.42536455392837524, "learning_rate": 7.5e-05, "loss": 2.2029, "step": 84 }, { "epoch": 0.01246151590675854, "grad_norm": 0.5235106945037842, "learning_rate": 7.5e-05, "loss": 2.1265, "step": 85 }, { "epoch": 0.012608121976249817, "grad_norm": 0.4383241534233093, "learning_rate": 7.5e-05, "loss": 2.1051, "step": 86 }, { "epoch": 0.012754728045741093, "grad_norm": 0.503972589969635, "learning_rate": 7.5e-05, "loss": 1.9945, "step": 87 }, { "epoch": 0.012901334115232371, "grad_norm": 0.4551929533481598, "learning_rate": 7.5e-05, "loss": 2.3127, "step": 88 }, { "epoch": 0.013047940184723647, "grad_norm": 0.44864019751548767, "learning_rate": 7.5e-05, "loss": 2.1633, "step": 89 }, { "epoch": 0.013194546254214925, "grad_norm": 0.4248664975166321, "learning_rate": 7.5e-05, "loss": 2.0703, "step": 90 }, { "epoch": 0.013341152323706201, "grad_norm": 0.45344144105911255, "learning_rate": 7.5e-05, "loss": 2.0783, "step": 91 }, { "epoch": 0.013487758393197479, "grad_norm": 0.4539313018321991, "learning_rate": 7.5e-05, "loss": 2.145, "step": 92 }, { "epoch": 0.013634364462688755, "grad_norm": 0.44496864080429077, "learning_rate": 7.5e-05, "loss": 2.2827, "step": 93 }, { "epoch": 0.013780970532180033, "grad_norm": 0.4297681450843811, "learning_rate": 7.5e-05, "loss": 2.0141, "step": 94 }, { "epoch": 0.013927576601671309, "grad_norm": 0.44067010283470154, "learning_rate": 7.5e-05, "loss": 1.9025, "step": 95 }, { "epoch": 0.014074182671162587, "grad_norm": 0.4484771192073822, "learning_rate": 7.5e-05, "loss": 2.1828, "step": 96 }, { "epoch": 0.014220788740653863, "grad_norm": 0.5087531805038452, "learning_rate": 7.5e-05, "loss": 2.1164, "step": 97 }, { "epoch": 0.01436739481014514, "grad_norm": 0.4551761746406555, "learning_rate": 7.5e-05, "loss": 2.1558, "step": 98 }, { "epoch": 0.014514000879636417, "grad_norm": 0.4408041834831238, "learning_rate": 7.5e-05, "loss": 1.8609, "step": 99 }, { "epoch": 0.014660606949127694, "grad_norm": 0.42732521891593933, "learning_rate": 7.5e-05, "loss": 2.2808, "step": 100 }, { "epoch": 0.01480721301861897, "grad_norm": 0.4177221953868866, "learning_rate": 7.5e-05, "loss": 2.1713, "step": 101 }, { "epoch": 0.014953819088110248, "grad_norm": 0.4426022171974182, "learning_rate": 7.5e-05, "loss": 2.1225, "step": 102 }, { "epoch": 0.015100425157601524, "grad_norm": 0.43486642837524414, "learning_rate": 7.5e-05, "loss": 2.1107, "step": 103 }, { "epoch": 0.015247031227092802, "grad_norm": 0.45986655354499817, "learning_rate": 7.5e-05, "loss": 2.1518, "step": 104 }, { "epoch": 0.015393637296584078, "grad_norm": 0.4171789884567261, "learning_rate": 7.5e-05, "loss": 2.1841, "step": 105 }, { "epoch": 0.015540243366075356, "grad_norm": 0.5831061005592346, "learning_rate": 7.5e-05, "loss": 2.0889, "step": 106 }, { "epoch": 0.015686849435566634, "grad_norm": 0.4704948365688324, "learning_rate": 7.5e-05, "loss": 2.1551, "step": 107 }, { "epoch": 0.01583345550505791, "grad_norm": 0.4458884596824646, "learning_rate": 7.5e-05, "loss": 2.0577, "step": 108 }, { "epoch": 0.015980061574549186, "grad_norm": 0.4344092011451721, "learning_rate": 7.5e-05, "loss": 1.9942, "step": 109 }, { "epoch": 0.016126667644040464, "grad_norm": 0.4264085292816162, "learning_rate": 7.5e-05, "loss": 2.108, "step": 110 }, { "epoch": 0.016273273713531742, "grad_norm": 0.4329453408718109, "learning_rate": 7.5e-05, "loss": 2.1457, "step": 111 }, { "epoch": 0.016419879783023016, "grad_norm": 0.4105164408683777, "learning_rate": 7.5e-05, "loss": 2.1422, "step": 112 }, { "epoch": 0.016566485852514294, "grad_norm": 0.4907954931259155, "learning_rate": 7.5e-05, "loss": 2.0935, "step": 113 }, { "epoch": 0.016713091922005572, "grad_norm": 0.4624481797218323, "learning_rate": 7.5e-05, "loss": 2.0195, "step": 114 }, { "epoch": 0.016859697991496846, "grad_norm": 0.4198748469352722, "learning_rate": 7.5e-05, "loss": 1.8611, "step": 115 }, { "epoch": 0.017006304060988124, "grad_norm": 0.40714356303215027, "learning_rate": 7.5e-05, "loss": 2.0722, "step": 116 }, { "epoch": 0.017152910130479402, "grad_norm": 0.6474002003669739, "learning_rate": 7.5e-05, "loss": 2.2016, "step": 117 }, { "epoch": 0.01729951619997068, "grad_norm": 0.4311653971672058, "learning_rate": 7.5e-05, "loss": 2.2106, "step": 118 }, { "epoch": 0.017446122269461954, "grad_norm": 0.4558602571487427, "learning_rate": 7.5e-05, "loss": 2.0271, "step": 119 }, { "epoch": 0.017592728338953232, "grad_norm": 0.451534241437912, "learning_rate": 7.5e-05, "loss": 2.1961, "step": 120 }, { "epoch": 0.01773933440844451, "grad_norm": 0.4211559593677521, "learning_rate": 7.5e-05, "loss": 2.0957, "step": 121 }, { "epoch": 0.017885940477935788, "grad_norm": 0.4452389180660248, "learning_rate": 7.5e-05, "loss": 2.0852, "step": 122 }, { "epoch": 0.018032546547427062, "grad_norm": 0.45122867822647095, "learning_rate": 7.5e-05, "loss": 2.1426, "step": 123 }, { "epoch": 0.01817915261691834, "grad_norm": 0.45043325424194336, "learning_rate": 7.5e-05, "loss": 2.0483, "step": 124 }, { "epoch": 0.018325758686409618, "grad_norm": 0.47755369544029236, "learning_rate": 7.5e-05, "loss": 1.8722, "step": 125 }, { "epoch": 0.018472364755900895, "grad_norm": 0.425269216299057, "learning_rate": 7.5e-05, "loss": 1.9746, "step": 126 }, { "epoch": 0.01861897082539217, "grad_norm": 0.4519413709640503, "learning_rate": 7.5e-05, "loss": 2.072, "step": 127 }, { "epoch": 0.018765576894883448, "grad_norm": 0.41904163360595703, "learning_rate": 7.5e-05, "loss": 1.9591, "step": 128 }, { "epoch": 0.018912182964374726, "grad_norm": 0.4255903959274292, "learning_rate": 7.5e-05, "loss": 2.0391, "step": 129 }, { "epoch": 0.019058789033866003, "grad_norm": 0.4608707129955292, "learning_rate": 7.5e-05, "loss": 2.0232, "step": 130 }, { "epoch": 0.019205395103357278, "grad_norm": 0.45330360531806946, "learning_rate": 7.5e-05, "loss": 1.9756, "step": 131 }, { "epoch": 0.019352001172848556, "grad_norm": 0.4397503137588501, "learning_rate": 7.5e-05, "loss": 2.1631, "step": 132 }, { "epoch": 0.019498607242339833, "grad_norm": 0.43476343154907227, "learning_rate": 7.5e-05, "loss": 2.1718, "step": 133 }, { "epoch": 0.01964521331183111, "grad_norm": 0.49096521735191345, "learning_rate": 7.5e-05, "loss": 2.0615, "step": 134 }, { "epoch": 0.019791819381322386, "grad_norm": 0.4564846456050873, "learning_rate": 7.5e-05, "loss": 2.2379, "step": 135 }, { "epoch": 0.019938425450813663, "grad_norm": 0.4313472807407379, "learning_rate": 7.5e-05, "loss": 2.0977, "step": 136 }, { "epoch": 0.02008503152030494, "grad_norm": 0.5502983331680298, "learning_rate": 7.5e-05, "loss": 2.1296, "step": 137 }, { "epoch": 0.02023163758979622, "grad_norm": 0.45383551716804504, "learning_rate": 7.5e-05, "loss": 1.9683, "step": 138 }, { "epoch": 0.020378243659287493, "grad_norm": 0.43010184168815613, "learning_rate": 7.5e-05, "loss": 1.9183, "step": 139 }, { "epoch": 0.02052484972877877, "grad_norm": 0.42490145564079285, "learning_rate": 7.5e-05, "loss": 1.9961, "step": 140 }, { "epoch": 0.02067145579827005, "grad_norm": 0.4795985519886017, "learning_rate": 7.5e-05, "loss": 2.0988, "step": 141 }, { "epoch": 0.020818061867761327, "grad_norm": 0.43736857175827026, "learning_rate": 7.5e-05, "loss": 1.983, "step": 142 }, { "epoch": 0.0209646679372526, "grad_norm": 0.4420798718929291, "learning_rate": 7.5e-05, "loss": 2.0671, "step": 143 }, { "epoch": 0.02111127400674388, "grad_norm": 0.42915552854537964, "learning_rate": 7.5e-05, "loss": 2.0031, "step": 144 }, { "epoch": 0.021257880076235157, "grad_norm": 0.4240019619464874, "learning_rate": 7.5e-05, "loss": 2.1657, "step": 145 }, { "epoch": 0.021404486145726435, "grad_norm": 0.46670493483543396, "learning_rate": 7.5e-05, "loss": 1.9859, "step": 146 }, { "epoch": 0.02155109221521771, "grad_norm": 0.40190228819847107, "learning_rate": 7.5e-05, "loss": 2.0518, "step": 147 }, { "epoch": 0.021697698284708987, "grad_norm": 0.40282100439071655, "learning_rate": 7.5e-05, "loss": 2.0045, "step": 148 }, { "epoch": 0.021844304354200265, "grad_norm": 0.41822972893714905, "learning_rate": 7.5e-05, "loss": 2.1193, "step": 149 }, { "epoch": 0.02199091042369154, "grad_norm": 0.47076621651649475, "learning_rate": 7.5e-05, "loss": 2.0137, "step": 150 }, { "epoch": 0.022137516493182817, "grad_norm": 0.42049044370651245, "learning_rate": 7.5e-05, "loss": 2.1477, "step": 151 }, { "epoch": 0.022284122562674095, "grad_norm": 0.42053845524787903, "learning_rate": 7.5e-05, "loss": 1.8343, "step": 152 }, { "epoch": 0.022430728632165373, "grad_norm": 0.4190051555633545, "learning_rate": 7.5e-05, "loss": 2.087, "step": 153 }, { "epoch": 0.022577334701656647, "grad_norm": 0.4545474946498871, "learning_rate": 7.5e-05, "loss": 2.2673, "step": 154 }, { "epoch": 0.022723940771147925, "grad_norm": 0.416460245847702, "learning_rate": 7.5e-05, "loss": 2.2203, "step": 155 }, { "epoch": 0.022870546840639203, "grad_norm": 0.4471845328807831, "learning_rate": 7.5e-05, "loss": 2.0032, "step": 156 }, { "epoch": 0.02301715291013048, "grad_norm": 0.3981029987335205, "learning_rate": 7.5e-05, "loss": 1.9783, "step": 157 }, { "epoch": 0.023163758979621755, "grad_norm": 0.435047447681427, "learning_rate": 7.5e-05, "loss": 2.2654, "step": 158 }, { "epoch": 0.023310365049113033, "grad_norm": 0.4003314673900604, "learning_rate": 7.5e-05, "loss": 2.001, "step": 159 }, { "epoch": 0.02345697111860431, "grad_norm": 0.5133220553398132, "learning_rate": 7.5e-05, "loss": 2.1075, "step": 160 }, { "epoch": 0.02360357718809559, "grad_norm": 0.4022431969642639, "learning_rate": 7.5e-05, "loss": 2.0632, "step": 161 }, { "epoch": 0.023750183257586863, "grad_norm": 0.39590954780578613, "learning_rate": 7.5e-05, "loss": 1.8188, "step": 162 }, { "epoch": 0.02389678932707814, "grad_norm": 0.5702373385429382, "learning_rate": 7.5e-05, "loss": 2.1217, "step": 163 }, { "epoch": 0.02404339539656942, "grad_norm": 0.41336292028427124, "learning_rate": 7.5e-05, "loss": 2.1454, "step": 164 }, { "epoch": 0.024190001466060696, "grad_norm": 0.43733078241348267, "learning_rate": 7.5e-05, "loss": 2.081, "step": 165 }, { "epoch": 0.02433660753555197, "grad_norm": 0.42934396862983704, "learning_rate": 7.5e-05, "loss": 2.1135, "step": 166 }, { "epoch": 0.02448321360504325, "grad_norm": 0.45562654733657837, "learning_rate": 7.5e-05, "loss": 2.0341, "step": 167 }, { "epoch": 0.024629819674534526, "grad_norm": 0.4046000838279724, "learning_rate": 7.5e-05, "loss": 2.0308, "step": 168 }, { "epoch": 0.024776425744025804, "grad_norm": 0.44996851682662964, "learning_rate": 7.5e-05, "loss": 1.946, "step": 169 }, { "epoch": 0.02492303181351708, "grad_norm": 0.413261741399765, "learning_rate": 7.5e-05, "loss": 1.8912, "step": 170 }, { "epoch": 0.025069637883008356, "grad_norm": 0.48396384716033936, "learning_rate": 7.5e-05, "loss": 2.1345, "step": 171 }, { "epoch": 0.025216243952499634, "grad_norm": 0.39883503317832947, "learning_rate": 7.5e-05, "loss": 1.9857, "step": 172 }, { "epoch": 0.025362850021990912, "grad_norm": 0.4413810670375824, "learning_rate": 7.5e-05, "loss": 2.2985, "step": 173 }, { "epoch": 0.025509456091482186, "grad_norm": 0.4332878589630127, "learning_rate": 7.5e-05, "loss": 2.0616, "step": 174 }, { "epoch": 0.025656062160973464, "grad_norm": 0.3980844020843506, "learning_rate": 7.5e-05, "loss": 2.0837, "step": 175 }, { "epoch": 0.025802668230464742, "grad_norm": 0.4197993576526642, "learning_rate": 7.5e-05, "loss": 1.9485, "step": 176 }, { "epoch": 0.02594927429995602, "grad_norm": 0.45530280470848083, "learning_rate": 7.5e-05, "loss": 2.0481, "step": 177 }, { "epoch": 0.026095880369447294, "grad_norm": 0.423282653093338, "learning_rate": 7.5e-05, "loss": 2.0739, "step": 178 }, { "epoch": 0.026242486438938572, "grad_norm": 0.43365371227264404, "learning_rate": 7.5e-05, "loss": 1.9752, "step": 179 }, { "epoch": 0.02638909250842985, "grad_norm": 0.43113839626312256, "learning_rate": 7.5e-05, "loss": 1.9936, "step": 180 }, { "epoch": 0.026535698577921128, "grad_norm": 0.4402036666870117, "learning_rate": 7.5e-05, "loss": 2.1459, "step": 181 }, { "epoch": 0.026682304647412402, "grad_norm": 0.39267897605895996, "learning_rate": 7.5e-05, "loss": 1.8207, "step": 182 }, { "epoch": 0.02682891071690368, "grad_norm": 0.423287034034729, "learning_rate": 7.5e-05, "loss": 1.8318, "step": 183 }, { "epoch": 0.026975516786394958, "grad_norm": 0.44994425773620605, "learning_rate": 7.5e-05, "loss": 2.214, "step": 184 }, { "epoch": 0.027122122855886235, "grad_norm": 0.43958044052124023, "learning_rate": 7.5e-05, "loss": 2.0237, "step": 185 }, { "epoch": 0.02726872892537751, "grad_norm": 0.39696070551872253, "learning_rate": 7.5e-05, "loss": 1.9279, "step": 186 }, { "epoch": 0.027415334994868788, "grad_norm": 0.4161533713340759, "learning_rate": 7.5e-05, "loss": 2.1235, "step": 187 }, { "epoch": 0.027561941064360065, "grad_norm": 0.4510859251022339, "learning_rate": 7.5e-05, "loss": 2.0662, "step": 188 }, { "epoch": 0.02770854713385134, "grad_norm": 0.4535347521305084, "learning_rate": 7.5e-05, "loss": 2.076, "step": 189 }, { "epoch": 0.027855153203342618, "grad_norm": 0.4292769432067871, "learning_rate": 7.5e-05, "loss": 2.0147, "step": 190 }, { "epoch": 0.028001759272833895, "grad_norm": 0.42307668924331665, "learning_rate": 7.5e-05, "loss": 1.9912, "step": 191 }, { "epoch": 0.028148365342325173, "grad_norm": 0.3924538791179657, "learning_rate": 7.5e-05, "loss": 2.0087, "step": 192 }, { "epoch": 0.028294971411816448, "grad_norm": 0.43547332286834717, "learning_rate": 7.5e-05, "loss": 1.9228, "step": 193 }, { "epoch": 0.028441577481307725, "grad_norm": 0.423493891954422, "learning_rate": 7.5e-05, "loss": 1.9774, "step": 194 }, { "epoch": 0.028588183550799003, "grad_norm": 0.43073371052742004, "learning_rate": 7.5e-05, "loss": 2.181, "step": 195 }, { "epoch": 0.02873478962029028, "grad_norm": 0.4409768581390381, "learning_rate": 7.5e-05, "loss": 2.0971, "step": 196 }, { "epoch": 0.028881395689781555, "grad_norm": 0.4158893823623657, "learning_rate": 7.5e-05, "loss": 2.1297, "step": 197 }, { "epoch": 0.029028001759272833, "grad_norm": 0.4301662743091583, "learning_rate": 7.5e-05, "loss": 2.2093, "step": 198 }, { "epoch": 0.02917460782876411, "grad_norm": 0.4396555721759796, "learning_rate": 7.5e-05, "loss": 2.0813, "step": 199 }, { "epoch": 0.02932121389825539, "grad_norm": 0.4347635805606842, "learning_rate": 7.5e-05, "loss": 1.9147, "step": 200 }, { "epoch": 0.029467819967746663, "grad_norm": 0.4362698793411255, "learning_rate": 7.5e-05, "loss": 1.9568, "step": 201 }, { "epoch": 0.02961442603723794, "grad_norm": 0.4269654452800751, "learning_rate": 7.5e-05, "loss": 2.0418, "step": 202 }, { "epoch": 0.02976103210672922, "grad_norm": 0.45960190892219543, "learning_rate": 7.5e-05, "loss": 1.9958, "step": 203 }, { "epoch": 0.029907638176220497, "grad_norm": 0.5343586206436157, "learning_rate": 7.5e-05, "loss": 2.0697, "step": 204 }, { "epoch": 0.03005424424571177, "grad_norm": 0.4052686393260956, "learning_rate": 7.5e-05, "loss": 2.0666, "step": 205 }, { "epoch": 0.03020085031520305, "grad_norm": 0.4730953574180603, "learning_rate": 7.5e-05, "loss": 1.9611, "step": 206 }, { "epoch": 0.030347456384694327, "grad_norm": 0.431918203830719, "learning_rate": 7.5e-05, "loss": 1.8659, "step": 207 }, { "epoch": 0.030494062454185605, "grad_norm": 0.4060167372226715, "learning_rate": 7.5e-05, "loss": 2.0872, "step": 208 }, { "epoch": 0.03064066852367688, "grad_norm": 0.4186355471611023, "learning_rate": 7.5e-05, "loss": 2.0558, "step": 209 }, { "epoch": 0.030787274593168157, "grad_norm": 0.3945868909358978, "learning_rate": 7.5e-05, "loss": 1.8467, "step": 210 }, { "epoch": 0.030933880662659435, "grad_norm": 0.4758727550506592, "learning_rate": 7.5e-05, "loss": 2.0432, "step": 211 }, { "epoch": 0.031080486732150713, "grad_norm": 0.47905564308166504, "learning_rate": 7.5e-05, "loss": 2.1271, "step": 212 }, { "epoch": 0.031227092801641987, "grad_norm": 0.4165220856666565, "learning_rate": 7.5e-05, "loss": 2.0517, "step": 213 }, { "epoch": 0.03137369887113327, "grad_norm": 0.44275492429733276, "learning_rate": 7.5e-05, "loss": 2.1021, "step": 214 }, { "epoch": 0.03152030494062454, "grad_norm": 0.42875248193740845, "learning_rate": 7.5e-05, "loss": 2.0156, "step": 215 }, { "epoch": 0.03166691101011582, "grad_norm": 0.4269632399082184, "learning_rate": 7.5e-05, "loss": 2.1472, "step": 216 }, { "epoch": 0.0318135170796071, "grad_norm": 0.4364849328994751, "learning_rate": 7.5e-05, "loss": 2.0312, "step": 217 }, { "epoch": 0.03196012314909837, "grad_norm": 0.43865081667900085, "learning_rate": 7.5e-05, "loss": 1.9423, "step": 218 }, { "epoch": 0.03210672921858965, "grad_norm": 0.40435782074928284, "learning_rate": 7.5e-05, "loss": 1.973, "step": 219 }, { "epoch": 0.03225333528808093, "grad_norm": 0.41253232955932617, "learning_rate": 7.5e-05, "loss": 2.1178, "step": 220 }, { "epoch": 0.0323999413575722, "grad_norm": 0.4282158613204956, "learning_rate": 7.5e-05, "loss": 2.0244, "step": 221 }, { "epoch": 0.032546547427063484, "grad_norm": 0.42097708582878113, "learning_rate": 7.5e-05, "loss": 1.9321, "step": 222 }, { "epoch": 0.03269315349655476, "grad_norm": 0.4702489674091339, "learning_rate": 7.5e-05, "loss": 1.9824, "step": 223 }, { "epoch": 0.03283975956604603, "grad_norm": 0.45837345719337463, "learning_rate": 7.5e-05, "loss": 2.0477, "step": 224 }, { "epoch": 0.032986365635537314, "grad_norm": 0.4391036331653595, "learning_rate": 7.5e-05, "loss": 2.0382, "step": 225 }, { "epoch": 0.03313297170502859, "grad_norm": 0.4039275050163269, "learning_rate": 7.5e-05, "loss": 2.0819, "step": 226 }, { "epoch": 0.03327957777451986, "grad_norm": 0.4229048192501068, "learning_rate": 7.5e-05, "loss": 2.0042, "step": 227 }, { "epoch": 0.033426183844011144, "grad_norm": 0.4151551127433777, "learning_rate": 7.5e-05, "loss": 1.9566, "step": 228 }, { "epoch": 0.03357278991350242, "grad_norm": 0.4261118173599243, "learning_rate": 7.5e-05, "loss": 1.868, "step": 229 }, { "epoch": 0.03371939598299369, "grad_norm": 0.5024142861366272, "learning_rate": 7.5e-05, "loss": 2.1045, "step": 230 }, { "epoch": 0.033866002052484974, "grad_norm": 0.3946031630039215, "learning_rate": 7.5e-05, "loss": 1.9538, "step": 231 }, { "epoch": 0.03401260812197625, "grad_norm": 0.43463990092277527, "learning_rate": 7.5e-05, "loss": 1.8945, "step": 232 }, { "epoch": 0.03415921419146753, "grad_norm": 0.4228929579257965, "learning_rate": 7.5e-05, "loss": 2.0934, "step": 233 }, { "epoch": 0.034305820260958804, "grad_norm": 0.41771408915519714, "learning_rate": 7.5e-05, "loss": 2.0817, "step": 234 }, { "epoch": 0.03445242633045008, "grad_norm": 0.44674211740493774, "learning_rate": 7.5e-05, "loss": 2.1362, "step": 235 }, { "epoch": 0.03459903239994136, "grad_norm": 0.425772488117218, "learning_rate": 7.5e-05, "loss": 1.985, "step": 236 }, { "epoch": 0.034745638469432634, "grad_norm": 0.4421280324459076, "learning_rate": 7.5e-05, "loss": 2.0723, "step": 237 }, { "epoch": 0.03489224453892391, "grad_norm": 0.44242528080940247, "learning_rate": 7.5e-05, "loss": 1.9576, "step": 238 }, { "epoch": 0.03503885060841519, "grad_norm": 0.4266713261604309, "learning_rate": 7.5e-05, "loss": 1.9268, "step": 239 }, { "epoch": 0.035185456677906464, "grad_norm": 0.4214801490306854, "learning_rate": 7.5e-05, "loss": 1.8983, "step": 240 }, { "epoch": 0.035332062747397745, "grad_norm": 0.4246981143951416, "learning_rate": 7.5e-05, "loss": 2.024, "step": 241 }, { "epoch": 0.03547866881688902, "grad_norm": 0.44089213013648987, "learning_rate": 7.5e-05, "loss": 2.0077, "step": 242 }, { "epoch": 0.035625274886380294, "grad_norm": 0.3933759331703186, "learning_rate": 7.5e-05, "loss": 1.9355, "step": 243 }, { "epoch": 0.035771880955871575, "grad_norm": 0.4246898889541626, "learning_rate": 7.5e-05, "loss": 2.1393, "step": 244 }, { "epoch": 0.03591848702536285, "grad_norm": 0.39876532554626465, "learning_rate": 7.5e-05, "loss": 1.8837, "step": 245 }, { "epoch": 0.036065093094854124, "grad_norm": 0.41370299458503723, "learning_rate": 7.5e-05, "loss": 2.0528, "step": 246 }, { "epoch": 0.036211699164345405, "grad_norm": 0.41514405608177185, "learning_rate": 7.5e-05, "loss": 2.0402, "step": 247 }, { "epoch": 0.03635830523383668, "grad_norm": 0.42188283801078796, "learning_rate": 7.5e-05, "loss": 2.1093, "step": 248 }, { "epoch": 0.03650491130332796, "grad_norm": 0.42034000158309937, "learning_rate": 7.5e-05, "loss": 2.0502, "step": 249 }, { "epoch": 0.036651517372819235, "grad_norm": 0.4230118989944458, "learning_rate": 7.5e-05, "loss": 2.0837, "step": 250 }, { "epoch": 0.03679812344231051, "grad_norm": 0.4217623770236969, "learning_rate": 7.5e-05, "loss": 2.0339, "step": 251 }, { "epoch": 0.03694472951180179, "grad_norm": 0.5823216438293457, "learning_rate": 7.5e-05, "loss": 1.956, "step": 252 }, { "epoch": 0.037091335581293065, "grad_norm": 0.3720916211605072, "learning_rate": 7.5e-05, "loss": 1.8557, "step": 253 }, { "epoch": 0.03723794165078434, "grad_norm": 0.42141130566596985, "learning_rate": 7.5e-05, "loss": 2.1068, "step": 254 }, { "epoch": 0.03738454772027562, "grad_norm": 0.4116886854171753, "learning_rate": 7.5e-05, "loss": 1.9371, "step": 255 }, { "epoch": 0.037531153789766895, "grad_norm": 0.4347614645957947, "learning_rate": 7.5e-05, "loss": 1.9046, "step": 256 }, { "epoch": 0.03767775985925818, "grad_norm": 0.4594493806362152, "learning_rate": 7.5e-05, "loss": 1.9013, "step": 257 }, { "epoch": 0.03782436592874945, "grad_norm": 0.42262759804725647, "learning_rate": 7.5e-05, "loss": 2.0771, "step": 258 }, { "epoch": 0.037970971998240725, "grad_norm": 0.40743646025657654, "learning_rate": 7.5e-05, "loss": 1.9495, "step": 259 }, { "epoch": 0.03811757806773201, "grad_norm": 0.43572139739990234, "learning_rate": 7.5e-05, "loss": 2.1077, "step": 260 }, { "epoch": 0.03826418413722328, "grad_norm": 0.49967458844184875, "learning_rate": 7.5e-05, "loss": 1.9283, "step": 261 }, { "epoch": 0.038410790206714555, "grad_norm": 0.3929535746574402, "learning_rate": 7.5e-05, "loss": 1.944, "step": 262 }, { "epoch": 0.03855739627620584, "grad_norm": 0.4288821220397949, "learning_rate": 7.5e-05, "loss": 2.0917, "step": 263 }, { "epoch": 0.03870400234569711, "grad_norm": 0.4342331886291504, "learning_rate": 7.5e-05, "loss": 2.007, "step": 264 }, { "epoch": 0.038850608415188385, "grad_norm": 0.4641132354736328, "learning_rate": 7.5e-05, "loss": 1.982, "step": 265 }, { "epoch": 0.03899721448467967, "grad_norm": 0.4135822653770447, "learning_rate": 7.5e-05, "loss": 1.9517, "step": 266 }, { "epoch": 0.03914382055417094, "grad_norm": 0.42108359932899475, "learning_rate": 7.5e-05, "loss": 1.8839, "step": 267 }, { "epoch": 0.03929042662366222, "grad_norm": 0.40623417496681213, "learning_rate": 7.5e-05, "loss": 1.9456, "step": 268 }, { "epoch": 0.0394370326931535, "grad_norm": 0.4070262908935547, "learning_rate": 7.5e-05, "loss": 1.8654, "step": 269 }, { "epoch": 0.03958363876264477, "grad_norm": 0.600872278213501, "learning_rate": 7.5e-05, "loss": 1.9814, "step": 270 }, { "epoch": 0.03973024483213605, "grad_norm": 0.4230038523674011, "learning_rate": 7.5e-05, "loss": 1.6925, "step": 271 }, { "epoch": 0.03987685090162733, "grad_norm": 0.4146491587162018, "learning_rate": 7.5e-05, "loss": 2.0999, "step": 272 }, { "epoch": 0.0400234569711186, "grad_norm": 0.41269031167030334, "learning_rate": 7.5e-05, "loss": 2.049, "step": 273 }, { "epoch": 0.04017006304060988, "grad_norm": 0.40278056263923645, "learning_rate": 7.5e-05, "loss": 2.1052, "step": 274 }, { "epoch": 0.04031666911010116, "grad_norm": 0.684227705001831, "learning_rate": 7.5e-05, "loss": 2.0206, "step": 275 }, { "epoch": 0.04046327517959244, "grad_norm": 0.4093787968158722, "learning_rate": 7.5e-05, "loss": 2.1087, "step": 276 }, { "epoch": 0.04060988124908371, "grad_norm": 0.4116148352622986, "learning_rate": 7.5e-05, "loss": 1.9281, "step": 277 }, { "epoch": 0.04075648731857499, "grad_norm": 0.3918842375278473, "learning_rate": 7.5e-05, "loss": 1.8629, "step": 278 }, { "epoch": 0.04090309338806627, "grad_norm": 0.4154493510723114, "learning_rate": 7.5e-05, "loss": 1.8598, "step": 279 }, { "epoch": 0.04104969945755754, "grad_norm": 0.4297260046005249, "learning_rate": 7.5e-05, "loss": 1.9969, "step": 280 }, { "epoch": 0.04119630552704882, "grad_norm": 0.4116227924823761, "learning_rate": 7.5e-05, "loss": 2.0225, "step": 281 }, { "epoch": 0.0413429115965401, "grad_norm": 0.4207918643951416, "learning_rate": 7.5e-05, "loss": 1.9757, "step": 282 }, { "epoch": 0.04148951766603137, "grad_norm": 0.4006454646587372, "learning_rate": 7.5e-05, "loss": 1.8507, "step": 283 }, { "epoch": 0.041636123735522654, "grad_norm": 0.4324852228164673, "learning_rate": 7.5e-05, "loss": 2.0767, "step": 284 }, { "epoch": 0.04178272980501393, "grad_norm": 0.42393946647644043, "learning_rate": 7.5e-05, "loss": 1.8079, "step": 285 }, { "epoch": 0.0419293358745052, "grad_norm": 0.4340204894542694, "learning_rate": 7.5e-05, "loss": 1.7858, "step": 286 }, { "epoch": 0.042075941943996484, "grad_norm": 0.43480196595191956, "learning_rate": 7.5e-05, "loss": 1.9619, "step": 287 }, { "epoch": 0.04222254801348776, "grad_norm": 0.4417128264904022, "learning_rate": 7.5e-05, "loss": 1.9286, "step": 288 }, { "epoch": 0.04236915408297903, "grad_norm": 0.43333950638771057, "learning_rate": 7.5e-05, "loss": 2.0122, "step": 289 }, { "epoch": 0.042515760152470314, "grad_norm": 0.4021371006965637, "learning_rate": 7.5e-05, "loss": 1.9364, "step": 290 }, { "epoch": 0.04266236622196159, "grad_norm": 0.44192543625831604, "learning_rate": 7.5e-05, "loss": 1.8584, "step": 291 }, { "epoch": 0.04280897229145287, "grad_norm": 0.3991456627845764, "learning_rate": 7.5e-05, "loss": 1.922, "step": 292 }, { "epoch": 0.042955578360944144, "grad_norm": 0.4471362233161926, "learning_rate": 7.5e-05, "loss": 1.9382, "step": 293 }, { "epoch": 0.04310218443043542, "grad_norm": 0.48168301582336426, "learning_rate": 7.5e-05, "loss": 1.8899, "step": 294 }, { "epoch": 0.0432487904999267, "grad_norm": 0.44566604495048523, "learning_rate": 7.5e-05, "loss": 2.0752, "step": 295 }, { "epoch": 0.043395396569417974, "grad_norm": 0.43417659401893616, "learning_rate": 7.5e-05, "loss": 1.9763, "step": 296 }, { "epoch": 0.04354200263890925, "grad_norm": 0.42004889249801636, "learning_rate": 7.5e-05, "loss": 2.0579, "step": 297 }, { "epoch": 0.04368860870840053, "grad_norm": 0.40574243664741516, "learning_rate": 7.5e-05, "loss": 1.9657, "step": 298 }, { "epoch": 0.043835214777891804, "grad_norm": 0.4325792193412781, "learning_rate": 7.5e-05, "loss": 1.8716, "step": 299 }, { "epoch": 0.04398182084738308, "grad_norm": 0.4111328423023224, "learning_rate": 7.5e-05, "loss": 1.6846, "step": 300 }, { "epoch": 0.04412842691687436, "grad_norm": 0.4152776300907135, "learning_rate": 7.5e-05, "loss": 1.9749, "step": 301 }, { "epoch": 0.044275032986365634, "grad_norm": 0.43930619955062866, "learning_rate": 7.5e-05, "loss": 2.0843, "step": 302 }, { "epoch": 0.044421639055856915, "grad_norm": 0.4187251329421997, "learning_rate": 7.5e-05, "loss": 1.9617, "step": 303 }, { "epoch": 0.04456824512534819, "grad_norm": 0.4271850287914276, "learning_rate": 7.5e-05, "loss": 1.9084, "step": 304 }, { "epoch": 0.044714851194839464, "grad_norm": 0.40085408091545105, "learning_rate": 7.5e-05, "loss": 1.9012, "step": 305 }, { "epoch": 0.044861457264330745, "grad_norm": 0.42684438824653625, "learning_rate": 7.5e-05, "loss": 1.8549, "step": 306 }, { "epoch": 0.04500806333382202, "grad_norm": 0.4060433804988861, "learning_rate": 7.5e-05, "loss": 2.0108, "step": 307 }, { "epoch": 0.045154669403313294, "grad_norm": 0.44846153259277344, "learning_rate": 7.5e-05, "loss": 1.8809, "step": 308 }, { "epoch": 0.045301275472804575, "grad_norm": 0.4075014889240265, "learning_rate": 7.5e-05, "loss": 1.9013, "step": 309 }, { "epoch": 0.04544788154229585, "grad_norm": 0.4034115970134735, "learning_rate": 7.5e-05, "loss": 1.8061, "step": 310 }, { "epoch": 0.04559448761178713, "grad_norm": 0.4186389744281769, "learning_rate": 7.5e-05, "loss": 1.9598, "step": 311 }, { "epoch": 0.045741093681278405, "grad_norm": 0.43958598375320435, "learning_rate": 7.5e-05, "loss": 2.0047, "step": 312 }, { "epoch": 0.04588769975076968, "grad_norm": 0.43313688039779663, "learning_rate": 7.5e-05, "loss": 2.0501, "step": 313 }, { "epoch": 0.04603430582026096, "grad_norm": 0.42946475744247437, "learning_rate": 7.5e-05, "loss": 1.941, "step": 314 }, { "epoch": 0.046180911889752235, "grad_norm": 0.39245185256004333, "learning_rate": 7.5e-05, "loss": 1.924, "step": 315 }, { "epoch": 0.04632751795924351, "grad_norm": 0.3905038833618164, "learning_rate": 7.5e-05, "loss": 1.8175, "step": 316 }, { "epoch": 0.04647412402873479, "grad_norm": 0.431443452835083, "learning_rate": 7.5e-05, "loss": 1.8965, "step": 317 }, { "epoch": 0.046620730098226065, "grad_norm": 0.3863249719142914, "learning_rate": 7.5e-05, "loss": 1.9611, "step": 318 }, { "epoch": 0.04676733616771735, "grad_norm": 0.403775155544281, "learning_rate": 7.5e-05, "loss": 1.8096, "step": 319 }, { "epoch": 0.04691394223720862, "grad_norm": 0.3993397355079651, "learning_rate": 7.5e-05, "loss": 2.1315, "step": 320 }, { "epoch": 0.047060548306699895, "grad_norm": 0.4370652735233307, "learning_rate": 7.5e-05, "loss": 1.8833, "step": 321 }, { "epoch": 0.04720715437619118, "grad_norm": 0.4299817681312561, "learning_rate": 7.5e-05, "loss": 1.8662, "step": 322 }, { "epoch": 0.04735376044568245, "grad_norm": 0.3996736407279968, "learning_rate": 7.5e-05, "loss": 1.7324, "step": 323 }, { "epoch": 0.047500366515173725, "grad_norm": Infinity, "learning_rate": 7.5e-05, "loss": 2.0917, "step": 324 }, { "epoch": 0.04764697258466501, "grad_norm": 0.39298534393310547, "learning_rate": 7.5e-05, "loss": 2.0445, "step": 325 }, { "epoch": 0.04779357865415628, "grad_norm": 0.4032658040523529, "learning_rate": 7.5e-05, "loss": 2.0522, "step": 326 }, { "epoch": 0.04794018472364756, "grad_norm": 0.42470312118530273, "learning_rate": 7.5e-05, "loss": 2.0347, "step": 327 }, { "epoch": 0.04808679079313884, "grad_norm": 0.4265693128108978, "learning_rate": 7.5e-05, "loss": 2.0271, "step": 328 }, { "epoch": 0.04823339686263011, "grad_norm": 0.4272522032260895, "learning_rate": 7.5e-05, "loss": 1.9173, "step": 329 }, { "epoch": 0.04838000293212139, "grad_norm": 0.4321386218070984, "learning_rate": 7.5e-05, "loss": 1.9021, "step": 330 }, { "epoch": 0.04852660900161267, "grad_norm": 0.40172508358955383, "learning_rate": 7.5e-05, "loss": 1.7721, "step": 331 }, { "epoch": 0.04867321507110394, "grad_norm": 0.4164409637451172, "learning_rate": 7.5e-05, "loss": 1.8143, "step": 332 }, { "epoch": 0.04881982114059522, "grad_norm": 0.47102516889572144, "learning_rate": 7.5e-05, "loss": 1.8947, "step": 333 }, { "epoch": 0.0489664272100865, "grad_norm": 0.41888296604156494, "learning_rate": 7.5e-05, "loss": 1.7843, "step": 334 }, { "epoch": 0.04911303327957778, "grad_norm": 0.3950113356113434, "learning_rate": 7.5e-05, "loss": 1.7458, "step": 335 }, { "epoch": 0.04925963934906905, "grad_norm": 0.41272109746932983, "learning_rate": 7.5e-05, "loss": 2.0543, "step": 336 }, { "epoch": 0.04940624541856033, "grad_norm": 0.43312135338783264, "learning_rate": 7.5e-05, "loss": 1.8529, "step": 337 }, { "epoch": 0.04955285148805161, "grad_norm": 0.423692911863327, "learning_rate": 7.5e-05, "loss": 2.0786, "step": 338 }, { "epoch": 0.04969945755754288, "grad_norm": 0.4291779696941376, "learning_rate": 7.5e-05, "loss": 1.8429, "step": 339 }, { "epoch": 0.04984606362703416, "grad_norm": 0.3997940719127655, "learning_rate": 7.5e-05, "loss": 2.0066, "step": 340 }, { "epoch": 0.04999266969652544, "grad_norm": 0.42505770921707153, "learning_rate": 7.5e-05, "loss": 1.8996, "step": 341 }, { "epoch": 0.05013927576601671, "grad_norm": 0.7379485964775085, "learning_rate": 7.5e-05, "loss": 1.8577, "step": 342 }, { "epoch": 0.05028588183550799, "grad_norm": 0.43901753425598145, "learning_rate": 7.5e-05, "loss": 1.9743, "step": 343 }, { "epoch": 0.05043248790499927, "grad_norm": 0.4152974784374237, "learning_rate": 7.5e-05, "loss": 1.8597, "step": 344 }, { "epoch": 0.05057909397449054, "grad_norm": 0.42442166805267334, "learning_rate": 7.5e-05, "loss": 1.9777, "step": 345 }, { "epoch": 0.050725700043981824, "grad_norm": 0.4314291775226593, "learning_rate": 7.5e-05, "loss": 1.9094, "step": 346 }, { "epoch": 0.0508723061134731, "grad_norm": 0.42771726846694946, "learning_rate": 7.5e-05, "loss": 2.079, "step": 347 }, { "epoch": 0.05101891218296437, "grad_norm": 0.46686893701553345, "learning_rate": 7.5e-05, "loss": 2.0029, "step": 348 }, { "epoch": 0.051165518252455654, "grad_norm": 0.4192800223827362, "learning_rate": 7.5e-05, "loss": 1.8492, "step": 349 }, { "epoch": 0.05131212432194693, "grad_norm": 0.4271966218948364, "learning_rate": 7.5e-05, "loss": 1.9028, "step": 350 }, { "epoch": 0.0514587303914382, "grad_norm": 0.3974759578704834, "learning_rate": 7.5e-05, "loss": 2.0359, "step": 351 }, { "epoch": 0.051605336460929484, "grad_norm": 0.4625302255153656, "learning_rate": 7.5e-05, "loss": 1.9221, "step": 352 }, { "epoch": 0.05175194253042076, "grad_norm": 0.46180111169815063, "learning_rate": 7.5e-05, "loss": 1.9657, "step": 353 }, { "epoch": 0.05189854859991204, "grad_norm": 0.3923051655292511, "learning_rate": 7.5e-05, "loss": 2.0206, "step": 354 }, { "epoch": 0.052045154669403314, "grad_norm": 0.38152098655700684, "learning_rate": 7.5e-05, "loss": 1.9095, "step": 355 }, { "epoch": 0.05219176073889459, "grad_norm": 0.3919423222541809, "learning_rate": 7.5e-05, "loss": 1.9029, "step": 356 }, { "epoch": 0.05233836680838587, "grad_norm": 0.42000991106033325, "learning_rate": 7.5e-05, "loss": 1.8228, "step": 357 }, { "epoch": 0.052484972877877144, "grad_norm": 0.422067254781723, "learning_rate": 7.5e-05, "loss": 1.8618, "step": 358 }, { "epoch": 0.05263157894736842, "grad_norm": 0.4018467962741852, "learning_rate": 7.5e-05, "loss": 1.7979, "step": 359 }, { "epoch": 0.0527781850168597, "grad_norm": 0.3940626084804535, "learning_rate": 7.5e-05, "loss": 1.9341, "step": 360 }, { "epoch": 0.052924791086350974, "grad_norm": 0.4218617081642151, "learning_rate": 7.5e-05, "loss": 1.931, "step": 361 }, { "epoch": 0.053071397155842255, "grad_norm": 0.39889469742774963, "learning_rate": 7.5e-05, "loss": 1.9289, "step": 362 }, { "epoch": 0.05321800322533353, "grad_norm": 0.3963461220264435, "learning_rate": 7.5e-05, "loss": 1.9849, "step": 363 }, { "epoch": 0.053364609294824804, "grad_norm": 0.40507304668426514, "learning_rate": 7.5e-05, "loss": 1.6929, "step": 364 }, { "epoch": 0.053511215364316085, "grad_norm": 0.38420289754867554, "learning_rate": 7.5e-05, "loss": 1.9159, "step": 365 }, { "epoch": 0.05365782143380736, "grad_norm": 0.46198320388793945, "learning_rate": 7.5e-05, "loss": 2.0082, "step": 366 }, { "epoch": 0.053804427503298634, "grad_norm": 0.38492777943611145, "learning_rate": 7.5e-05, "loss": 1.9033, "step": 367 }, { "epoch": 0.053951033572789915, "grad_norm": 0.4116688370704651, "learning_rate": 7.5e-05, "loss": 1.9682, "step": 368 }, { "epoch": 0.05409763964228119, "grad_norm": 0.44960904121398926, "learning_rate": 7.5e-05, "loss": 1.9626, "step": 369 }, { "epoch": 0.05424424571177247, "grad_norm": 0.39794275164604187, "learning_rate": 7.5e-05, "loss": 1.8855, "step": 370 }, { "epoch": 0.054390851781263745, "grad_norm": 0.4340926706790924, "learning_rate": 7.5e-05, "loss": 2.0028, "step": 371 }, { "epoch": 0.05453745785075502, "grad_norm": 0.39723294973373413, "learning_rate": 7.5e-05, "loss": 1.9301, "step": 372 }, { "epoch": 0.0546840639202463, "grad_norm": 0.45371681451797485, "learning_rate": 7.5e-05, "loss": 1.7457, "step": 373 }, { "epoch": 0.054830669989737575, "grad_norm": 0.5021353363990784, "learning_rate": 7.5e-05, "loss": 1.855, "step": 374 }, { "epoch": 0.05497727605922885, "grad_norm": 0.37673622369766235, "learning_rate": 7.5e-05, "loss": 1.889, "step": 375 }, { "epoch": 0.05512388212872013, "grad_norm": 0.4026258587837219, "learning_rate": 7.5e-05, "loss": 2.0549, "step": 376 }, { "epoch": 0.055270488198211405, "grad_norm": 0.41268256306648254, "learning_rate": 7.5e-05, "loss": 1.8186, "step": 377 }, { "epoch": 0.05541709426770268, "grad_norm": 0.40261054039001465, "learning_rate": 7.5e-05, "loss": 2.0186, "step": 378 }, { "epoch": 0.05556370033719396, "grad_norm": 0.412936270236969, "learning_rate": 7.5e-05, "loss": 1.9663, "step": 379 }, { "epoch": 0.055710306406685235, "grad_norm": 0.4268585741519928, "learning_rate": 7.5e-05, "loss": 1.8892, "step": 380 }, { "epoch": 0.055856912476176517, "grad_norm": 0.40223076939582825, "learning_rate": 7.5e-05, "loss": 2.0457, "step": 381 }, { "epoch": 0.05600351854566779, "grad_norm": 0.3755423128604889, "learning_rate": 7.5e-05, "loss": 1.8299, "step": 382 }, { "epoch": 0.056150124615159065, "grad_norm": 0.3995567858219147, "learning_rate": 7.5e-05, "loss": 1.9749, "step": 383 }, { "epoch": 0.056296730684650347, "grad_norm": 0.3993961215019226, "learning_rate": 7.5e-05, "loss": 2.0254, "step": 384 }, { "epoch": 0.05644333675414162, "grad_norm": 0.4198998212814331, "learning_rate": 7.5e-05, "loss": 1.9508, "step": 385 }, { "epoch": 0.056589942823632895, "grad_norm": 0.39024171233177185, "learning_rate": 7.5e-05, "loss": 1.9071, "step": 386 }, { "epoch": 0.05673654889312418, "grad_norm": 0.4029495418071747, "learning_rate": 7.5e-05, "loss": 1.8866, "step": 387 }, { "epoch": 0.05688315496261545, "grad_norm": 0.4082452356815338, "learning_rate": 7.5e-05, "loss": 2.0262, "step": 388 }, { "epoch": 0.05702976103210673, "grad_norm": 0.39697325229644775, "learning_rate": 7.5e-05, "loss": 1.8344, "step": 389 }, { "epoch": 0.05717636710159801, "grad_norm": 0.4114176332950592, "learning_rate": 7.5e-05, "loss": 2.0149, "step": 390 }, { "epoch": 0.05732297317108928, "grad_norm": 0.4068869352340698, "learning_rate": 7.5e-05, "loss": 1.9518, "step": 391 }, { "epoch": 0.05746957924058056, "grad_norm": 0.38898399472236633, "learning_rate": 7.5e-05, "loss": 1.7891, "step": 392 }, { "epoch": 0.05761618531007184, "grad_norm": 0.39212045073509216, "learning_rate": 7.5e-05, "loss": 1.9879, "step": 393 }, { "epoch": 0.05776279137956311, "grad_norm": 0.3848034143447876, "learning_rate": 7.5e-05, "loss": 1.803, "step": 394 }, { "epoch": 0.05790939744905439, "grad_norm": 0.40813377499580383, "learning_rate": 7.5e-05, "loss": 1.8522, "step": 395 }, { "epoch": 0.05805600351854567, "grad_norm": 0.41719958186149597, "learning_rate": 7.5e-05, "loss": 2.0428, "step": 396 }, { "epoch": 0.05820260958803695, "grad_norm": 0.395730197429657, "learning_rate": 7.5e-05, "loss": 1.9202, "step": 397 }, { "epoch": 0.05834921565752822, "grad_norm": 0.40946465730667114, "learning_rate": 7.5e-05, "loss": 1.752, "step": 398 }, { "epoch": 0.0584958217270195, "grad_norm": 0.40470823645591736, "learning_rate": 7.5e-05, "loss": 1.8941, "step": 399 }, { "epoch": 0.05864242779651078, "grad_norm": 0.4607287347316742, "learning_rate": 7.5e-05, "loss": 1.8073, "step": 400 }, { "epoch": 0.05864242779651078, "eval_loss": 1.9215489625930786, "eval_runtime": 41.3249, "eval_samples_per_second": 13.285, "eval_steps_per_second": 6.655, "step": 400 } ], "logging_steps": 1.0, "max_steps": 6821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.188399573827584e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }