{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11728485559302156, "eval_steps": 400, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00014660606949127694, "grad_norm": 3.455883741378784, "learning_rate": 7.5e-05, "loss": 3.9321, "step": 1 }, { "epoch": 0.00029321213898255387, "grad_norm": 2.6298787593841553, "learning_rate": 7.5e-05, "loss": 3.4287, "step": 2 }, { "epoch": 0.00043981820847383083, "grad_norm": 1.4404480457305908, "learning_rate": 7.5e-05, "loss": 3.2257, "step": 3 }, { "epoch": 0.0005864242779651077, "grad_norm": 2.2678844928741455, "learning_rate": 7.5e-05, "loss": 3.3636, "step": 4 }, { "epoch": 0.0007330303474563847, "grad_norm": 2.5889241695404053, "learning_rate": 7.5e-05, "loss": 2.9323, "step": 5 }, { "epoch": 0.0008796364169476617, "grad_norm": 1.2149893045425415, "learning_rate": 7.5e-05, "loss": 2.8705, "step": 6 }, { "epoch": 0.0010262424864389385, "grad_norm": 0.9146172404289246, "learning_rate": 7.5e-05, "loss": 2.6756, "step": 7 }, { "epoch": 0.0011728485559302155, "grad_norm": 1.3598023653030396, "learning_rate": 7.5e-05, "loss": 2.8735, "step": 8 }, { "epoch": 0.0013194546254214924, "grad_norm": 1.3088024854660034, "learning_rate": 7.5e-05, "loss": 2.7283, "step": 9 }, { "epoch": 0.0014660606949127694, "grad_norm": 19.006887435913086, "learning_rate": 7.5e-05, "loss": 2.6417, "step": 10 }, { "epoch": 0.0016126667644040464, "grad_norm": 1.5915838479995728, "learning_rate": 7.5e-05, "loss": 2.7931, "step": 11 }, { "epoch": 0.0017592728338953233, "grad_norm": 4.185395240783691, "learning_rate": 7.5e-05, "loss": 2.563, "step": 12 }, { "epoch": 0.0019058789033866003, "grad_norm": 0.9938499927520752, "learning_rate": 7.5e-05, "loss": 2.7372, "step": 13 }, { "epoch": 0.002052484972877877, "grad_norm": 0.8031460046768188, "learning_rate": 7.5e-05, "loss": 2.6044, "step": 14 }, { "epoch": 
0.002199091042369154, "grad_norm": 0.7620795965194702, "learning_rate": 7.5e-05, "loss": 2.5687, "step": 15 }, { "epoch": 0.002345697111860431, "grad_norm": 0.9465439915657043, "learning_rate": 7.5e-05, "loss": 2.7366, "step": 16 }, { "epoch": 0.002492303181351708, "grad_norm": 0.8416175246238708, "learning_rate": 7.5e-05, "loss": 2.741, "step": 17 }, { "epoch": 0.002638909250842985, "grad_norm": 0.7236262559890747, "learning_rate": 7.5e-05, "loss": 2.4769, "step": 18 }, { "epoch": 0.002785515320334262, "grad_norm": 0.7475934624671936, "learning_rate": 7.5e-05, "loss": 2.5892, "step": 19 }, { "epoch": 0.002932121389825539, "grad_norm": 0.700516939163208, "learning_rate": 7.5e-05, "loss": 2.63, "step": 20 }, { "epoch": 0.0030787274593168158, "grad_norm": 0.693880021572113, "learning_rate": 7.5e-05, "loss": 2.4919, "step": 21 }, { "epoch": 0.0032253335288080927, "grad_norm": 0.7560032606124878, "learning_rate": 7.5e-05, "loss": 2.5155, "step": 22 }, { "epoch": 0.0033719395982993697, "grad_norm": 0.7018181085586548, "learning_rate": 7.5e-05, "loss": 2.5791, "step": 23 }, { "epoch": 0.0035185456677906467, "grad_norm": 0.8165842890739441, "learning_rate": 7.5e-05, "loss": 2.5228, "step": 24 }, { "epoch": 0.0036651517372819236, "grad_norm": 1.1841286420822144, "learning_rate": 7.5e-05, "loss": 2.3439, "step": 25 }, { "epoch": 0.0038117578067732006, "grad_norm": 0.7815418243408203, "learning_rate": 7.5e-05, "loss": 2.3244, "step": 26 }, { "epoch": 0.003958363876264477, "grad_norm": null, "learning_rate": 7.5e-05, "loss": 2.3578, "step": 27 }, { "epoch": 0.004104969945755754, "grad_norm": 0.7782788276672363, "learning_rate": 7.5e-05, "loss": 2.2824, "step": 28 }, { "epoch": 0.004251576015247031, "grad_norm": 0.8368614315986633, "learning_rate": 7.5e-05, "loss": 2.5229, "step": 29 }, { "epoch": 0.004398182084738308, "grad_norm": 0.6284268498420715, "learning_rate": 7.5e-05, "loss": 2.5119, "step": 30 }, { "epoch": 0.004544788154229585, "grad_norm": 0.6348438858985901, 
"learning_rate": 7.5e-05, "loss": 2.3244, "step": 31 }, { "epoch": 0.004691394223720862, "grad_norm": 0.5359944105148315, "learning_rate": 7.5e-05, "loss": 2.3711, "step": 32 }, { "epoch": 0.004838000293212139, "grad_norm": 0.7504050135612488, "learning_rate": 7.5e-05, "loss": 2.2577, "step": 33 }, { "epoch": 0.004984606362703416, "grad_norm": 0.6479385495185852, "learning_rate": 7.5e-05, "loss": 2.3205, "step": 34 }, { "epoch": 0.005131212432194693, "grad_norm": 0.5695769786834717, "learning_rate": 7.5e-05, "loss": 2.3126, "step": 35 }, { "epoch": 0.00527781850168597, "grad_norm": 0.4831556975841522, "learning_rate": 7.5e-05, "loss": 2.2259, "step": 36 }, { "epoch": 0.005424424571177247, "grad_norm": 0.48921290040016174, "learning_rate": 7.5e-05, "loss": 2.3142, "step": 37 }, { "epoch": 0.005571030640668524, "grad_norm": 0.4701555669307709, "learning_rate": 7.5e-05, "loss": 2.3066, "step": 38 }, { "epoch": 0.005717636710159801, "grad_norm": 0.5667604207992554, "learning_rate": 7.5e-05, "loss": 2.2539, "step": 39 }, { "epoch": 0.005864242779651078, "grad_norm": 0.5447884798049927, "learning_rate": 7.5e-05, "loss": 2.3533, "step": 40 }, { "epoch": 0.006010848849142355, "grad_norm": 0.5081149935722351, "learning_rate": 7.5e-05, "loss": 2.3421, "step": 41 }, { "epoch": 0.0061574549186336315, "grad_norm": 0.47933027148246765, "learning_rate": 7.5e-05, "loss": 2.3791, "step": 42 }, { "epoch": 0.0063040609881249085, "grad_norm": 0.46539342403411865, "learning_rate": 7.5e-05, "loss": 2.2347, "step": 43 }, { "epoch": 0.0064506670576161855, "grad_norm": 0.6082663536071777, "learning_rate": 7.5e-05, "loss": 2.173, "step": 44 }, { "epoch": 0.006597273127107462, "grad_norm": 0.576062023639679, "learning_rate": 7.5e-05, "loss": 2.2916, "step": 45 }, { "epoch": 0.006743879196598739, "grad_norm": 0.4593515992164612, "learning_rate": 7.5e-05, "loss": 2.3169, "step": 46 }, { "epoch": 0.006890485266090016, "grad_norm": 0.4574475586414337, "learning_rate": 7.5e-05, "loss": 2.2583, 
"step": 47 }, { "epoch": 0.007037091335581293, "grad_norm": 0.5164448022842407, "learning_rate": 7.5e-05, "loss": 2.3349, "step": 48 }, { "epoch": 0.00718369740507257, "grad_norm": 0.4695877134799957, "learning_rate": 7.5e-05, "loss": 2.255, "step": 49 }, { "epoch": 0.007330303474563847, "grad_norm": 0.4581631124019623, "learning_rate": 7.5e-05, "loss": 2.1964, "step": 50 }, { "epoch": 0.007476909544055124, "grad_norm": 0.45299574732780457, "learning_rate": 7.5e-05, "loss": 2.4647, "step": 51 }, { "epoch": 0.007623515613546401, "grad_norm": 0.48451122641563416, "learning_rate": 7.5e-05, "loss": 2.2066, "step": 52 }, { "epoch": 0.007770121683037678, "grad_norm": 0.5913203358650208, "learning_rate": 7.5e-05, "loss": 2.0961, "step": 53 }, { "epoch": 0.007916727752528954, "grad_norm": 0.42326223850250244, "learning_rate": 7.5e-05, "loss": 2.2117, "step": 54 }, { "epoch": 0.008063333822020232, "grad_norm": 0.4510177969932556, "learning_rate": 7.5e-05, "loss": 2.3412, "step": 55 }, { "epoch": 0.008209939891511508, "grad_norm": 0.44723281264305115, "learning_rate": 7.5e-05, "loss": 2.2027, "step": 56 }, { "epoch": 0.008356545961002786, "grad_norm": 0.5632525086402893, "learning_rate": 7.5e-05, "loss": 2.2667, "step": 57 }, { "epoch": 0.008503152030494062, "grad_norm": 0.4612903594970703, "learning_rate": 7.5e-05, "loss": 2.2747, "step": 58 }, { "epoch": 0.00864975809998534, "grad_norm": 0.4696878492832184, "learning_rate": 7.5e-05, "loss": 2.135, "step": 59 }, { "epoch": 0.008796364169476616, "grad_norm": 0.4575818181037903, "learning_rate": 7.5e-05, "loss": 2.2351, "step": 60 }, { "epoch": 0.008942970238967894, "grad_norm": 0.4939045011997223, "learning_rate": 7.5e-05, "loss": 2.1285, "step": 61 }, { "epoch": 0.00908957630845917, "grad_norm": 0.46583932638168335, "learning_rate": 7.5e-05, "loss": 2.2665, "step": 62 }, { "epoch": 0.009236182377950448, "grad_norm": 0.4681698679924011, "learning_rate": 7.5e-05, "loss": 2.1546, "step": 63 }, { "epoch": 0.009382788447441724, 
"grad_norm": 0.48843592405319214, "learning_rate": 7.5e-05, "loss": 2.0642, "step": 64 }, { "epoch": 0.009529394516933002, "grad_norm": 0.4695863425731659, "learning_rate": 7.5e-05, "loss": 2.2058, "step": 65 }, { "epoch": 0.009676000586424278, "grad_norm": 0.44356605410575867, "learning_rate": 7.5e-05, "loss": 2.0865, "step": 66 }, { "epoch": 0.009822606655915556, "grad_norm": 0.5469496250152588, "learning_rate": 7.5e-05, "loss": 2.2392, "step": 67 }, { "epoch": 0.009969212725406832, "grad_norm": 0.5499363541603088, "learning_rate": 7.5e-05, "loss": 2.1105, "step": 68 }, { "epoch": 0.01011581879489811, "grad_norm": 0.46040916442871094, "learning_rate": 7.5e-05, "loss": 2.1991, "step": 69 }, { "epoch": 0.010262424864389386, "grad_norm": 0.4418318271636963, "learning_rate": 7.5e-05, "loss": 2.1375, "step": 70 }, { "epoch": 0.010409030933880663, "grad_norm": 0.5090643763542175, "learning_rate": 7.5e-05, "loss": 2.0043, "step": 71 }, { "epoch": 0.01055563700337194, "grad_norm": 0.4520326256752014, "learning_rate": 7.5e-05, "loss": 2.1635, "step": 72 }, { "epoch": 0.010702243072863217, "grad_norm": 0.4801967740058899, "learning_rate": 7.5e-05, "loss": 2.236, "step": 73 }, { "epoch": 0.010848849142354493, "grad_norm": 0.4344942271709442, "learning_rate": 7.5e-05, "loss": 2.3342, "step": 74 }, { "epoch": 0.01099545521184577, "grad_norm": 0.49545183777809143, "learning_rate": 7.5e-05, "loss": 2.2978, "step": 75 }, { "epoch": 0.011142061281337047, "grad_norm": 0.47338277101516724, "learning_rate": 7.5e-05, "loss": 2.1702, "step": 76 }, { "epoch": 0.011288667350828323, "grad_norm": 0.4275517463684082, "learning_rate": 7.5e-05, "loss": 2.113, "step": 77 }, { "epoch": 0.011435273420319601, "grad_norm": 0.43745824694633484, "learning_rate": 7.5e-05, "loss": 2.1104, "step": 78 }, { "epoch": 0.011581879489810877, "grad_norm": 0.45452436804771423, "learning_rate": 7.5e-05, "loss": 2.2726, "step": 79 }, { "epoch": 0.011728485559302155, "grad_norm": 0.45246171951293945, 
"learning_rate": 7.5e-05, "loss": 2.2536, "step": 80 }, { "epoch": 0.011875091628793431, "grad_norm": 0.4320465326309204, "learning_rate": 7.5e-05, "loss": 2.061, "step": 81 }, { "epoch": 0.01202169769828471, "grad_norm": 0.44274935126304626, "learning_rate": 7.5e-05, "loss": 2.1805, "step": 82 }, { "epoch": 0.012168303767775985, "grad_norm": 0.4519144296646118, "learning_rate": 7.5e-05, "loss": 2.203, "step": 83 }, { "epoch": 0.012314909837267263, "grad_norm": 0.42536455392837524, "learning_rate": 7.5e-05, "loss": 2.2029, "step": 84 }, { "epoch": 0.01246151590675854, "grad_norm": 0.5235106945037842, "learning_rate": 7.5e-05, "loss": 2.1265, "step": 85 }, { "epoch": 0.012608121976249817, "grad_norm": 0.4383241534233093, "learning_rate": 7.5e-05, "loss": 2.1051, "step": 86 }, { "epoch": 0.012754728045741093, "grad_norm": 0.503972589969635, "learning_rate": 7.5e-05, "loss": 1.9945, "step": 87 }, { "epoch": 0.012901334115232371, "grad_norm": 0.4551929533481598, "learning_rate": 7.5e-05, "loss": 2.3127, "step": 88 }, { "epoch": 0.013047940184723647, "grad_norm": 0.44864019751548767, "learning_rate": 7.5e-05, "loss": 2.1633, "step": 89 }, { "epoch": 0.013194546254214925, "grad_norm": 0.4248664975166321, "learning_rate": 7.5e-05, "loss": 2.0703, "step": 90 }, { "epoch": 0.013341152323706201, "grad_norm": 0.45344144105911255, "learning_rate": 7.5e-05, "loss": 2.0783, "step": 91 }, { "epoch": 0.013487758393197479, "grad_norm": 0.4539313018321991, "learning_rate": 7.5e-05, "loss": 2.145, "step": 92 }, { "epoch": 0.013634364462688755, "grad_norm": 0.44496864080429077, "learning_rate": 7.5e-05, "loss": 2.2827, "step": 93 }, { "epoch": 0.013780970532180033, "grad_norm": 0.4297681450843811, "learning_rate": 7.5e-05, "loss": 2.0141, "step": 94 }, { "epoch": 0.013927576601671309, "grad_norm": 0.44067010283470154, "learning_rate": 7.5e-05, "loss": 1.9025, "step": 95 }, { "epoch": 0.014074182671162587, "grad_norm": 0.4484771192073822, "learning_rate": 7.5e-05, "loss": 2.1828, 
"step": 96 }, { "epoch": 0.014220788740653863, "grad_norm": 0.5087531805038452, "learning_rate": 7.5e-05, "loss": 2.1164, "step": 97 }, { "epoch": 0.01436739481014514, "grad_norm": 0.4551761746406555, "learning_rate": 7.5e-05, "loss": 2.1558, "step": 98 }, { "epoch": 0.014514000879636417, "grad_norm": 0.4408041834831238, "learning_rate": 7.5e-05, "loss": 1.8609, "step": 99 }, { "epoch": 0.014660606949127694, "grad_norm": 0.42732521891593933, "learning_rate": 7.5e-05, "loss": 2.2808, "step": 100 }, { "epoch": 0.01480721301861897, "grad_norm": 0.4177221953868866, "learning_rate": 7.5e-05, "loss": 2.1713, "step": 101 }, { "epoch": 0.014953819088110248, "grad_norm": 0.4426022171974182, "learning_rate": 7.5e-05, "loss": 2.1225, "step": 102 }, { "epoch": 0.015100425157601524, "grad_norm": 0.43486642837524414, "learning_rate": 7.5e-05, "loss": 2.1107, "step": 103 }, { "epoch": 0.015247031227092802, "grad_norm": 0.45986655354499817, "learning_rate": 7.5e-05, "loss": 2.1518, "step": 104 }, { "epoch": 0.015393637296584078, "grad_norm": 0.4171789884567261, "learning_rate": 7.5e-05, "loss": 2.1841, "step": 105 }, { "epoch": 0.015540243366075356, "grad_norm": 0.5831061005592346, "learning_rate": 7.5e-05, "loss": 2.0889, "step": 106 }, { "epoch": 0.015686849435566634, "grad_norm": 0.4704948365688324, "learning_rate": 7.5e-05, "loss": 2.1551, "step": 107 }, { "epoch": 0.01583345550505791, "grad_norm": 0.4458884596824646, "learning_rate": 7.5e-05, "loss": 2.0577, "step": 108 }, { "epoch": 0.015980061574549186, "grad_norm": 0.4344092011451721, "learning_rate": 7.5e-05, "loss": 1.9942, "step": 109 }, { "epoch": 0.016126667644040464, "grad_norm": 0.4264085292816162, "learning_rate": 7.5e-05, "loss": 2.108, "step": 110 }, { "epoch": 0.016273273713531742, "grad_norm": 0.4329453408718109, "learning_rate": 7.5e-05, "loss": 2.1457, "step": 111 }, { "epoch": 0.016419879783023016, "grad_norm": 0.4105164408683777, "learning_rate": 7.5e-05, "loss": 2.1422, "step": 112 }, { "epoch": 
0.016566485852514294, "grad_norm": 0.4907954931259155, "learning_rate": 7.5e-05, "loss": 2.0935, "step": 113 }, { "epoch": 0.016713091922005572, "grad_norm": 0.4624481797218323, "learning_rate": 7.5e-05, "loss": 2.0195, "step": 114 }, { "epoch": 0.016859697991496846, "grad_norm": 0.4198748469352722, "learning_rate": 7.5e-05, "loss": 1.8611, "step": 115 }, { "epoch": 0.017006304060988124, "grad_norm": 0.40714356303215027, "learning_rate": 7.5e-05, "loss": 2.0722, "step": 116 }, { "epoch": 0.017152910130479402, "grad_norm": 0.6474002003669739, "learning_rate": 7.5e-05, "loss": 2.2016, "step": 117 }, { "epoch": 0.01729951619997068, "grad_norm": 0.4311653971672058, "learning_rate": 7.5e-05, "loss": 2.2106, "step": 118 }, { "epoch": 0.017446122269461954, "grad_norm": 0.4558602571487427, "learning_rate": 7.5e-05, "loss": 2.0271, "step": 119 }, { "epoch": 0.017592728338953232, "grad_norm": 0.451534241437912, "learning_rate": 7.5e-05, "loss": 2.1961, "step": 120 }, { "epoch": 0.01773933440844451, "grad_norm": 0.4211559593677521, "learning_rate": 7.5e-05, "loss": 2.0957, "step": 121 }, { "epoch": 0.017885940477935788, "grad_norm": 0.4452389180660248, "learning_rate": 7.5e-05, "loss": 2.0852, "step": 122 }, { "epoch": 0.018032546547427062, "grad_norm": 0.45122867822647095, "learning_rate": 7.5e-05, "loss": 2.1426, "step": 123 }, { "epoch": 0.01817915261691834, "grad_norm": 0.45043325424194336, "learning_rate": 7.5e-05, "loss": 2.0483, "step": 124 }, { "epoch": 0.018325758686409618, "grad_norm": 0.47755369544029236, "learning_rate": 7.5e-05, "loss": 1.8722, "step": 125 }, { "epoch": 0.018472364755900895, "grad_norm": 0.425269216299057, "learning_rate": 7.5e-05, "loss": 1.9746, "step": 126 }, { "epoch": 0.01861897082539217, "grad_norm": 0.4519413709640503, "learning_rate": 7.5e-05, "loss": 2.072, "step": 127 }, { "epoch": 0.018765576894883448, "grad_norm": 0.41904163360595703, "learning_rate": 7.5e-05, "loss": 1.9591, "step": 128 }, { "epoch": 0.018912182964374726, 
"grad_norm": 0.4255903959274292, "learning_rate": 7.5e-05, "loss": 2.0391, "step": 129 }, { "epoch": 0.019058789033866003, "grad_norm": 0.4608707129955292, "learning_rate": 7.5e-05, "loss": 2.0232, "step": 130 }, { "epoch": 0.019205395103357278, "grad_norm": 0.45330360531806946, "learning_rate": 7.5e-05, "loss": 1.9756, "step": 131 }, { "epoch": 0.019352001172848556, "grad_norm": 0.4397503137588501, "learning_rate": 7.5e-05, "loss": 2.1631, "step": 132 }, { "epoch": 0.019498607242339833, "grad_norm": 0.43476343154907227, "learning_rate": 7.5e-05, "loss": 2.1718, "step": 133 }, { "epoch": 0.01964521331183111, "grad_norm": 0.49096521735191345, "learning_rate": 7.5e-05, "loss": 2.0615, "step": 134 }, { "epoch": 0.019791819381322386, "grad_norm": 0.4564846456050873, "learning_rate": 7.5e-05, "loss": 2.2379, "step": 135 }, { "epoch": 0.019938425450813663, "grad_norm": 0.4313472807407379, "learning_rate": 7.5e-05, "loss": 2.0977, "step": 136 }, { "epoch": 0.02008503152030494, "grad_norm": 0.5502983331680298, "learning_rate": 7.5e-05, "loss": 2.1296, "step": 137 }, { "epoch": 0.02023163758979622, "grad_norm": 0.45383551716804504, "learning_rate": 7.5e-05, "loss": 1.9683, "step": 138 }, { "epoch": 0.020378243659287493, "grad_norm": 0.43010184168815613, "learning_rate": 7.5e-05, "loss": 1.9183, "step": 139 }, { "epoch": 0.02052484972877877, "grad_norm": 0.42490145564079285, "learning_rate": 7.5e-05, "loss": 1.9961, "step": 140 }, { "epoch": 0.02067145579827005, "grad_norm": 0.4795985519886017, "learning_rate": 7.5e-05, "loss": 2.0988, "step": 141 }, { "epoch": 0.020818061867761327, "grad_norm": 0.43736857175827026, "learning_rate": 7.5e-05, "loss": 1.983, "step": 142 }, { "epoch": 0.0209646679372526, "grad_norm": 0.4420798718929291, "learning_rate": 7.5e-05, "loss": 2.0671, "step": 143 }, { "epoch": 0.02111127400674388, "grad_norm": 0.42915552854537964, "learning_rate": 7.5e-05, "loss": 2.0031, "step": 144 }, { "epoch": 0.021257880076235157, "grad_norm": 0.4240019619464874, 
"learning_rate": 7.5e-05, "loss": 2.1657, "step": 145 }, { "epoch": 0.021404486145726435, "grad_norm": 0.46670493483543396, "learning_rate": 7.5e-05, "loss": 1.9859, "step": 146 }, { "epoch": 0.02155109221521771, "grad_norm": 0.40190228819847107, "learning_rate": 7.5e-05, "loss": 2.0518, "step": 147 }, { "epoch": 0.021697698284708987, "grad_norm": 0.40282100439071655, "learning_rate": 7.5e-05, "loss": 2.0045, "step": 148 }, { "epoch": 0.021844304354200265, "grad_norm": 0.41822972893714905, "learning_rate": 7.5e-05, "loss": 2.1193, "step": 149 }, { "epoch": 0.02199091042369154, "grad_norm": 0.47076621651649475, "learning_rate": 7.5e-05, "loss": 2.0137, "step": 150 }, { "epoch": 0.022137516493182817, "grad_norm": 0.42049044370651245, "learning_rate": 7.5e-05, "loss": 2.1477, "step": 151 }, { "epoch": 0.022284122562674095, "grad_norm": 0.42053845524787903, "learning_rate": 7.5e-05, "loss": 1.8343, "step": 152 }, { "epoch": 0.022430728632165373, "grad_norm": 0.4190051555633545, "learning_rate": 7.5e-05, "loss": 2.087, "step": 153 }, { "epoch": 0.022577334701656647, "grad_norm": 0.4545474946498871, "learning_rate": 7.5e-05, "loss": 2.2673, "step": 154 }, { "epoch": 0.022723940771147925, "grad_norm": 0.416460245847702, "learning_rate": 7.5e-05, "loss": 2.2203, "step": 155 }, { "epoch": 0.022870546840639203, "grad_norm": 0.4471845328807831, "learning_rate": 7.5e-05, "loss": 2.0032, "step": 156 }, { "epoch": 0.02301715291013048, "grad_norm": 0.3981029987335205, "learning_rate": 7.5e-05, "loss": 1.9783, "step": 157 }, { "epoch": 0.023163758979621755, "grad_norm": 0.435047447681427, "learning_rate": 7.5e-05, "loss": 2.2654, "step": 158 }, { "epoch": 0.023310365049113033, "grad_norm": 0.4003314673900604, "learning_rate": 7.5e-05, "loss": 2.001, "step": 159 }, { "epoch": 0.02345697111860431, "grad_norm": 0.5133220553398132, "learning_rate": 7.5e-05, "loss": 2.1075, "step": 160 }, { "epoch": 0.02360357718809559, "grad_norm": 0.4022431969642639, "learning_rate": 7.5e-05, "loss": 
2.0632, "step": 161 }, { "epoch": 0.023750183257586863, "grad_norm": 0.39590954780578613, "learning_rate": 7.5e-05, "loss": 1.8188, "step": 162 }, { "epoch": 0.02389678932707814, "grad_norm": 0.5702373385429382, "learning_rate": 7.5e-05, "loss": 2.1217, "step": 163 }, { "epoch": 0.02404339539656942, "grad_norm": 0.41336292028427124, "learning_rate": 7.5e-05, "loss": 2.1454, "step": 164 }, { "epoch": 0.024190001466060696, "grad_norm": 0.43733078241348267, "learning_rate": 7.5e-05, "loss": 2.081, "step": 165 }, { "epoch": 0.02433660753555197, "grad_norm": 0.42934396862983704, "learning_rate": 7.5e-05, "loss": 2.1135, "step": 166 }, { "epoch": 0.02448321360504325, "grad_norm": 0.45562654733657837, "learning_rate": 7.5e-05, "loss": 2.0341, "step": 167 }, { "epoch": 0.024629819674534526, "grad_norm": 0.4046000838279724, "learning_rate": 7.5e-05, "loss": 2.0308, "step": 168 }, { "epoch": 0.024776425744025804, "grad_norm": 0.44996851682662964, "learning_rate": 7.5e-05, "loss": 1.946, "step": 169 }, { "epoch": 0.02492303181351708, "grad_norm": 0.413261741399765, "learning_rate": 7.5e-05, "loss": 1.8912, "step": 170 }, { "epoch": 0.025069637883008356, "grad_norm": 0.48396384716033936, "learning_rate": 7.5e-05, "loss": 2.1345, "step": 171 }, { "epoch": 0.025216243952499634, "grad_norm": 0.39883503317832947, "learning_rate": 7.5e-05, "loss": 1.9857, "step": 172 }, { "epoch": 0.025362850021990912, "grad_norm": 0.4413810670375824, "learning_rate": 7.5e-05, "loss": 2.2985, "step": 173 }, { "epoch": 0.025509456091482186, "grad_norm": 0.4332878589630127, "learning_rate": 7.5e-05, "loss": 2.0616, "step": 174 }, { "epoch": 0.025656062160973464, "grad_norm": 0.3980844020843506, "learning_rate": 7.5e-05, "loss": 2.0837, "step": 175 }, { "epoch": 0.025802668230464742, "grad_norm": 0.4197993576526642, "learning_rate": 7.5e-05, "loss": 1.9485, "step": 176 }, { "epoch": 0.02594927429995602, "grad_norm": 0.45530280470848083, "learning_rate": 7.5e-05, "loss": 2.0481, "step": 177 }, { 
"epoch": 0.026095880369447294, "grad_norm": 0.423282653093338, "learning_rate": 7.5e-05, "loss": 2.0739, "step": 178 }, { "epoch": 0.026242486438938572, "grad_norm": 0.43365371227264404, "learning_rate": 7.5e-05, "loss": 1.9752, "step": 179 }, { "epoch": 0.02638909250842985, "grad_norm": 0.43113839626312256, "learning_rate": 7.5e-05, "loss": 1.9936, "step": 180 }, { "epoch": 0.026535698577921128, "grad_norm": 0.4402036666870117, "learning_rate": 7.5e-05, "loss": 2.1459, "step": 181 }, { "epoch": 0.026682304647412402, "grad_norm": 0.39267897605895996, "learning_rate": 7.5e-05, "loss": 1.8207, "step": 182 }, { "epoch": 0.02682891071690368, "grad_norm": 0.423287034034729, "learning_rate": 7.5e-05, "loss": 1.8318, "step": 183 }, { "epoch": 0.026975516786394958, "grad_norm": 0.44994425773620605, "learning_rate": 7.5e-05, "loss": 2.214, "step": 184 }, { "epoch": 0.027122122855886235, "grad_norm": 0.43958044052124023, "learning_rate": 7.5e-05, "loss": 2.0237, "step": 185 }, { "epoch": 0.02726872892537751, "grad_norm": 0.39696070551872253, "learning_rate": 7.5e-05, "loss": 1.9279, "step": 186 }, { "epoch": 0.027415334994868788, "grad_norm": 0.4161533713340759, "learning_rate": 7.5e-05, "loss": 2.1235, "step": 187 }, { "epoch": 0.027561941064360065, "grad_norm": 0.4510859251022339, "learning_rate": 7.5e-05, "loss": 2.0662, "step": 188 }, { "epoch": 0.02770854713385134, "grad_norm": 0.4535347521305084, "learning_rate": 7.5e-05, "loss": 2.076, "step": 189 }, { "epoch": 0.027855153203342618, "grad_norm": 0.4292769432067871, "learning_rate": 7.5e-05, "loss": 2.0147, "step": 190 }, { "epoch": 0.028001759272833895, "grad_norm": 0.42307668924331665, "learning_rate": 7.5e-05, "loss": 1.9912, "step": 191 }, { "epoch": 0.028148365342325173, "grad_norm": 0.3924538791179657, "learning_rate": 7.5e-05, "loss": 2.0087, "step": 192 }, { "epoch": 0.028294971411816448, "grad_norm": 0.43547332286834717, "learning_rate": 7.5e-05, "loss": 1.9228, "step": 193 }, { "epoch": 0.028441577481307725, 
"grad_norm": 0.423493891954422, "learning_rate": 7.5e-05, "loss": 1.9774, "step": 194 }, { "epoch": 0.028588183550799003, "grad_norm": 0.43073371052742004, "learning_rate": 7.5e-05, "loss": 2.181, "step": 195 }, { "epoch": 0.02873478962029028, "grad_norm": 0.4409768581390381, "learning_rate": 7.5e-05, "loss": 2.0971, "step": 196 }, { "epoch": 0.028881395689781555, "grad_norm": 0.4158893823623657, "learning_rate": 7.5e-05, "loss": 2.1297, "step": 197 }, { "epoch": 0.029028001759272833, "grad_norm": 0.4301662743091583, "learning_rate": 7.5e-05, "loss": 2.2093, "step": 198 }, { "epoch": 0.02917460782876411, "grad_norm": 0.4396555721759796, "learning_rate": 7.5e-05, "loss": 2.0813, "step": 199 }, { "epoch": 0.02932121389825539, "grad_norm": 0.4347635805606842, "learning_rate": 7.5e-05, "loss": 1.9147, "step": 200 }, { "epoch": 0.029467819967746663, "grad_norm": 0.4362698793411255, "learning_rate": 7.5e-05, "loss": 1.9568, "step": 201 }, { "epoch": 0.02961442603723794, "grad_norm": 0.4269654452800751, "learning_rate": 7.5e-05, "loss": 2.0418, "step": 202 }, { "epoch": 0.02976103210672922, "grad_norm": 0.45960190892219543, "learning_rate": 7.5e-05, "loss": 1.9958, "step": 203 }, { "epoch": 0.029907638176220497, "grad_norm": 0.5343586206436157, "learning_rate": 7.5e-05, "loss": 2.0697, "step": 204 }, { "epoch": 0.03005424424571177, "grad_norm": 0.4052686393260956, "learning_rate": 7.5e-05, "loss": 2.0666, "step": 205 }, { "epoch": 0.03020085031520305, "grad_norm": 0.4730953574180603, "learning_rate": 7.5e-05, "loss": 1.9611, "step": 206 }, { "epoch": 0.030347456384694327, "grad_norm": 0.431918203830719, "learning_rate": 7.5e-05, "loss": 1.8659, "step": 207 }, { "epoch": 0.030494062454185605, "grad_norm": 0.4060167372226715, "learning_rate": 7.5e-05, "loss": 2.0872, "step": 208 }, { "epoch": 0.03064066852367688, "grad_norm": 0.4186355471611023, "learning_rate": 7.5e-05, "loss": 2.0558, "step": 209 }, { "epoch": 0.030787274593168157, "grad_norm": 0.3945868909358978, 
"learning_rate": 7.5e-05, "loss": 1.8467, "step": 210 }, { "epoch": 0.030933880662659435, "grad_norm": 0.4758727550506592, "learning_rate": 7.5e-05, "loss": 2.0432, "step": 211 }, { "epoch": 0.031080486732150713, "grad_norm": 0.47905564308166504, "learning_rate": 7.5e-05, "loss": 2.1271, "step": 212 }, { "epoch": 0.031227092801641987, "grad_norm": 0.4165220856666565, "learning_rate": 7.5e-05, "loss": 2.0517, "step": 213 }, { "epoch": 0.03137369887113327, "grad_norm": 0.44275492429733276, "learning_rate": 7.5e-05, "loss": 2.1021, "step": 214 }, { "epoch": 0.03152030494062454, "grad_norm": 0.42875248193740845, "learning_rate": 7.5e-05, "loss": 2.0156, "step": 215 }, { "epoch": 0.03166691101011582, "grad_norm": 0.4269632399082184, "learning_rate": 7.5e-05, "loss": 2.1472, "step": 216 }, { "epoch": 0.0318135170796071, "grad_norm": 0.4364849328994751, "learning_rate": 7.5e-05, "loss": 2.0312, "step": 217 }, { "epoch": 0.03196012314909837, "grad_norm": 0.43865081667900085, "learning_rate": 7.5e-05, "loss": 1.9423, "step": 218 }, { "epoch": 0.03210672921858965, "grad_norm": 0.40435782074928284, "learning_rate": 7.5e-05, "loss": 1.973, "step": 219 }, { "epoch": 0.03225333528808093, "grad_norm": 0.41253232955932617, "learning_rate": 7.5e-05, "loss": 2.1178, "step": 220 }, { "epoch": 0.0323999413575722, "grad_norm": 0.4282158613204956, "learning_rate": 7.5e-05, "loss": 2.0244, "step": 221 }, { "epoch": 0.032546547427063484, "grad_norm": 0.42097708582878113, "learning_rate": 7.5e-05, "loss": 1.9321, "step": 222 }, { "epoch": 0.03269315349655476, "grad_norm": 0.4702489674091339, "learning_rate": 7.5e-05, "loss": 1.9824, "step": 223 }, { "epoch": 0.03283975956604603, "grad_norm": 0.45837345719337463, "learning_rate": 7.5e-05, "loss": 2.0477, "step": 224 }, { "epoch": 0.032986365635537314, "grad_norm": 0.4391036331653595, "learning_rate": 7.5e-05, "loss": 2.0382, "step": 225 }, { "epoch": 0.03313297170502859, "grad_norm": 0.4039275050163269, "learning_rate": 7.5e-05, "loss": 
2.0819, "step": 226 }, { "epoch": 0.03327957777451986, "grad_norm": 0.4229048192501068, "learning_rate": 7.5e-05, "loss": 2.0042, "step": 227 }, { "epoch": 0.033426183844011144, "grad_norm": 0.4151551127433777, "learning_rate": 7.5e-05, "loss": 1.9566, "step": 228 }, { "epoch": 0.03357278991350242, "grad_norm": 0.4261118173599243, "learning_rate": 7.5e-05, "loss": 1.868, "step": 229 }, { "epoch": 0.03371939598299369, "grad_norm": 0.5024142861366272, "learning_rate": 7.5e-05, "loss": 2.1045, "step": 230 }, { "epoch": 0.033866002052484974, "grad_norm": 0.3946031630039215, "learning_rate": 7.5e-05, "loss": 1.9538, "step": 231 }, { "epoch": 0.03401260812197625, "grad_norm": 0.43463990092277527, "learning_rate": 7.5e-05, "loss": 1.8945, "step": 232 }, { "epoch": 0.03415921419146753, "grad_norm": 0.4228929579257965, "learning_rate": 7.5e-05, "loss": 2.0934, "step": 233 }, { "epoch": 0.034305820260958804, "grad_norm": 0.41771408915519714, "learning_rate": 7.5e-05, "loss": 2.0817, "step": 234 }, { "epoch": 0.03445242633045008, "grad_norm": 0.44674211740493774, "learning_rate": 7.5e-05, "loss": 2.1362, "step": 235 }, { "epoch": 0.03459903239994136, "grad_norm": 0.425772488117218, "learning_rate": 7.5e-05, "loss": 1.985, "step": 236 }, { "epoch": 0.034745638469432634, "grad_norm": 0.4421280324459076, "learning_rate": 7.5e-05, "loss": 2.0723, "step": 237 }, { "epoch": 0.03489224453892391, "grad_norm": 0.44242528080940247, "learning_rate": 7.5e-05, "loss": 1.9576, "step": 238 }, { "epoch": 0.03503885060841519, "grad_norm": 0.4266713261604309, "learning_rate": 7.5e-05, "loss": 1.9268, "step": 239 }, { "epoch": 0.035185456677906464, "grad_norm": 0.4214801490306854, "learning_rate": 7.5e-05, "loss": 1.8983, "step": 240 }, { "epoch": 0.035332062747397745, "grad_norm": 0.4246981143951416, "learning_rate": 7.5e-05, "loss": 2.024, "step": 241 }, { "epoch": 0.03547866881688902, "grad_norm": 0.44089213013648987, "learning_rate": 7.5e-05, "loss": 2.0077, "step": 242 }, { "epoch": 
0.035625274886380294, "grad_norm": 0.3933759331703186, "learning_rate": 7.5e-05, "loss": 1.9355, "step": 243 }, { "epoch": 0.035771880955871575, "grad_norm": 0.4246898889541626, "learning_rate": 7.5e-05, "loss": 2.1393, "step": 244 }, { "epoch": 0.03591848702536285, "grad_norm": 0.39876532554626465, "learning_rate": 7.5e-05, "loss": 1.8837, "step": 245 }, { "epoch": 0.036065093094854124, "grad_norm": 0.41370299458503723, "learning_rate": 7.5e-05, "loss": 2.0528, "step": 246 }, { "epoch": 0.036211699164345405, "grad_norm": 0.41514405608177185, "learning_rate": 7.5e-05, "loss": 2.0402, "step": 247 }, { "epoch": 0.03635830523383668, "grad_norm": 0.42188283801078796, "learning_rate": 7.5e-05, "loss": 2.1093, "step": 248 }, { "epoch": 0.03650491130332796, "grad_norm": 0.42034000158309937, "learning_rate": 7.5e-05, "loss": 2.0502, "step": 249 }, { "epoch": 0.036651517372819235, "grad_norm": 0.4230118989944458, "learning_rate": 7.5e-05, "loss": 2.0837, "step": 250 }, { "epoch": 0.03679812344231051, "grad_norm": 0.4217623770236969, "learning_rate": 7.5e-05, "loss": 2.0339, "step": 251 }, { "epoch": 0.03694472951180179, "grad_norm": 0.5823216438293457, "learning_rate": 7.5e-05, "loss": 1.956, "step": 252 }, { "epoch": 0.037091335581293065, "grad_norm": 0.3720916211605072, "learning_rate": 7.5e-05, "loss": 1.8557, "step": 253 }, { "epoch": 0.03723794165078434, "grad_norm": 0.42141130566596985, "learning_rate": 7.5e-05, "loss": 2.1068, "step": 254 }, { "epoch": 0.03738454772027562, "grad_norm": 0.4116886854171753, "learning_rate": 7.5e-05, "loss": 1.9371, "step": 255 }, { "epoch": 0.037531153789766895, "grad_norm": 0.4347614645957947, "learning_rate": 7.5e-05, "loss": 1.9046, "step": 256 }, { "epoch": 0.03767775985925818, "grad_norm": 0.4594493806362152, "learning_rate": 7.5e-05, "loss": 1.9013, "step": 257 }, { "epoch": 0.03782436592874945, "grad_norm": 0.42262759804725647, "learning_rate": 7.5e-05, "loss": 2.0771, "step": 258 }, { "epoch": 0.037970971998240725, "grad_norm": 
0.40743646025657654, "learning_rate": 7.5e-05, "loss": 1.9495, "step": 259 }, { "epoch": 0.03811757806773201, "grad_norm": 0.43572139739990234, "learning_rate": 7.5e-05, "loss": 2.1077, "step": 260 }, { "epoch": 0.03826418413722328, "grad_norm": 0.49967458844184875, "learning_rate": 7.5e-05, "loss": 1.9283, "step": 261 }, { "epoch": 0.038410790206714555, "grad_norm": 0.3929535746574402, "learning_rate": 7.5e-05, "loss": 1.944, "step": 262 }, { "epoch": 0.03855739627620584, "grad_norm": 0.4288821220397949, "learning_rate": 7.5e-05, "loss": 2.0917, "step": 263 }, { "epoch": 0.03870400234569711, "grad_norm": 0.4342331886291504, "learning_rate": 7.5e-05, "loss": 2.007, "step": 264 }, { "epoch": 0.038850608415188385, "grad_norm": 0.4641132354736328, "learning_rate": 7.5e-05, "loss": 1.982, "step": 265 }, { "epoch": 0.03899721448467967, "grad_norm": 0.4135822653770447, "learning_rate": 7.5e-05, "loss": 1.9517, "step": 266 }, { "epoch": 0.03914382055417094, "grad_norm": 0.42108359932899475, "learning_rate": 7.5e-05, "loss": 1.8839, "step": 267 }, { "epoch": 0.03929042662366222, "grad_norm": 0.40623417496681213, "learning_rate": 7.5e-05, "loss": 1.9456, "step": 268 }, { "epoch": 0.0394370326931535, "grad_norm": 0.4070262908935547, "learning_rate": 7.5e-05, "loss": 1.8654, "step": 269 }, { "epoch": 0.03958363876264477, "grad_norm": 0.600872278213501, "learning_rate": 7.5e-05, "loss": 1.9814, "step": 270 }, { "epoch": 0.03973024483213605, "grad_norm": 0.4230038523674011, "learning_rate": 7.5e-05, "loss": 1.6925, "step": 271 }, { "epoch": 0.03987685090162733, "grad_norm": 0.4146491587162018, "learning_rate": 7.5e-05, "loss": 2.0999, "step": 272 }, { "epoch": 0.0400234569711186, "grad_norm": 0.41269031167030334, "learning_rate": 7.5e-05, "loss": 2.049, "step": 273 }, { "epoch": 0.04017006304060988, "grad_norm": 0.40278056263923645, "learning_rate": 7.5e-05, "loss": 2.1052, "step": 274 }, { "epoch": 0.04031666911010116, "grad_norm": 0.684227705001831, "learning_rate": 7.5e-05, 
"loss": 2.0206, "step": 275 }, { "epoch": 0.04046327517959244, "grad_norm": 0.4093787968158722, "learning_rate": 7.5e-05, "loss": 2.1087, "step": 276 }, { "epoch": 0.04060988124908371, "grad_norm": 0.4116148352622986, "learning_rate": 7.5e-05, "loss": 1.9281, "step": 277 }, { "epoch": 0.04075648731857499, "grad_norm": 0.3918842375278473, "learning_rate": 7.5e-05, "loss": 1.8629, "step": 278 }, { "epoch": 0.04090309338806627, "grad_norm": 0.4154493510723114, "learning_rate": 7.5e-05, "loss": 1.8598, "step": 279 }, { "epoch": 0.04104969945755754, "grad_norm": 0.4297260046005249, "learning_rate": 7.5e-05, "loss": 1.9969, "step": 280 }, { "epoch": 0.04119630552704882, "grad_norm": 0.4116227924823761, "learning_rate": 7.5e-05, "loss": 2.0225, "step": 281 }, { "epoch": 0.0413429115965401, "grad_norm": 0.4207918643951416, "learning_rate": 7.5e-05, "loss": 1.9757, "step": 282 }, { "epoch": 0.04148951766603137, "grad_norm": 0.4006454646587372, "learning_rate": 7.5e-05, "loss": 1.8507, "step": 283 }, { "epoch": 0.041636123735522654, "grad_norm": 0.4324852228164673, "learning_rate": 7.5e-05, "loss": 2.0767, "step": 284 }, { "epoch": 0.04178272980501393, "grad_norm": 0.42393946647644043, "learning_rate": 7.5e-05, "loss": 1.8079, "step": 285 }, { "epoch": 0.0419293358745052, "grad_norm": 0.4340204894542694, "learning_rate": 7.5e-05, "loss": 1.7858, "step": 286 }, { "epoch": 0.042075941943996484, "grad_norm": 0.43480196595191956, "learning_rate": 7.5e-05, "loss": 1.9619, "step": 287 }, { "epoch": 0.04222254801348776, "grad_norm": 0.4417128264904022, "learning_rate": 7.5e-05, "loss": 1.9286, "step": 288 }, { "epoch": 0.04236915408297903, "grad_norm": 0.43333950638771057, "learning_rate": 7.5e-05, "loss": 2.0122, "step": 289 }, { "epoch": 0.042515760152470314, "grad_norm": 0.4021371006965637, "learning_rate": 7.5e-05, "loss": 1.9364, "step": 290 }, { "epoch": 0.04266236622196159, "grad_norm": 0.44192543625831604, "learning_rate": 7.5e-05, "loss": 1.8584, "step": 291 }, { "epoch": 
0.04280897229145287, "grad_norm": 0.3991456627845764, "learning_rate": 7.5e-05, "loss": 1.922, "step": 292 }, { "epoch": 0.042955578360944144, "grad_norm": 0.4471362233161926, "learning_rate": 7.5e-05, "loss": 1.9382, "step": 293 }, { "epoch": 0.04310218443043542, "grad_norm": 0.48168301582336426, "learning_rate": 7.5e-05, "loss": 1.8899, "step": 294 }, { "epoch": 0.0432487904999267, "grad_norm": 0.44566604495048523, "learning_rate": 7.5e-05, "loss": 2.0752, "step": 295 }, { "epoch": 0.043395396569417974, "grad_norm": 0.43417659401893616, "learning_rate": 7.5e-05, "loss": 1.9763, "step": 296 }, { "epoch": 0.04354200263890925, "grad_norm": 0.42004889249801636, "learning_rate": 7.5e-05, "loss": 2.0579, "step": 297 }, { "epoch": 0.04368860870840053, "grad_norm": 0.40574243664741516, "learning_rate": 7.5e-05, "loss": 1.9657, "step": 298 }, { "epoch": 0.043835214777891804, "grad_norm": 0.4325792193412781, "learning_rate": 7.5e-05, "loss": 1.8716, "step": 299 }, { "epoch": 0.04398182084738308, "grad_norm": 0.4111328423023224, "learning_rate": 7.5e-05, "loss": 1.6846, "step": 300 }, { "epoch": 0.04412842691687436, "grad_norm": 0.4152776300907135, "learning_rate": 7.5e-05, "loss": 1.9749, "step": 301 }, { "epoch": 0.044275032986365634, "grad_norm": 0.43930619955062866, "learning_rate": 7.5e-05, "loss": 2.0843, "step": 302 }, { "epoch": 0.044421639055856915, "grad_norm": 0.4187251329421997, "learning_rate": 7.5e-05, "loss": 1.9617, "step": 303 }, { "epoch": 0.04456824512534819, "grad_norm": 0.4271850287914276, "learning_rate": 7.5e-05, "loss": 1.9084, "step": 304 }, { "epoch": 0.044714851194839464, "grad_norm": 0.40085408091545105, "learning_rate": 7.5e-05, "loss": 1.9012, "step": 305 }, { "epoch": 0.044861457264330745, "grad_norm": 0.42684438824653625, "learning_rate": 7.5e-05, "loss": 1.8549, "step": 306 }, { "epoch": 0.04500806333382202, "grad_norm": 0.4060433804988861, "learning_rate": 7.5e-05, "loss": 2.0108, "step": 307 }, { "epoch": 0.045154669403313294, "grad_norm": 
0.44846153259277344, "learning_rate": 7.5e-05, "loss": 1.8809, "step": 308 }, { "epoch": 0.045301275472804575, "grad_norm": 0.4075014889240265, "learning_rate": 7.5e-05, "loss": 1.9013, "step": 309 }, { "epoch": 0.04544788154229585, "grad_norm": 0.4034115970134735, "learning_rate": 7.5e-05, "loss": 1.8061, "step": 310 }, { "epoch": 0.04559448761178713, "grad_norm": 0.4186389744281769, "learning_rate": 7.5e-05, "loss": 1.9598, "step": 311 }, { "epoch": 0.045741093681278405, "grad_norm": 0.43958598375320435, "learning_rate": 7.5e-05, "loss": 2.0047, "step": 312 }, { "epoch": 0.04588769975076968, "grad_norm": 0.43313688039779663, "learning_rate": 7.5e-05, "loss": 2.0501, "step": 313 }, { "epoch": 0.04603430582026096, "grad_norm": 0.42946475744247437, "learning_rate": 7.5e-05, "loss": 1.941, "step": 314 }, { "epoch": 0.046180911889752235, "grad_norm": 0.39245185256004333, "learning_rate": 7.5e-05, "loss": 1.924, "step": 315 }, { "epoch": 0.04632751795924351, "grad_norm": 0.3905038833618164, "learning_rate": 7.5e-05, "loss": 1.8175, "step": 316 }, { "epoch": 0.04647412402873479, "grad_norm": 0.431443452835083, "learning_rate": 7.5e-05, "loss": 1.8965, "step": 317 }, { "epoch": 0.046620730098226065, "grad_norm": 0.3863249719142914, "learning_rate": 7.5e-05, "loss": 1.9611, "step": 318 }, { "epoch": 0.04676733616771735, "grad_norm": 0.403775155544281, "learning_rate": 7.5e-05, "loss": 1.8096, "step": 319 }, { "epoch": 0.04691394223720862, "grad_norm": 0.3993397355079651, "learning_rate": 7.5e-05, "loss": 2.1315, "step": 320 }, { "epoch": 0.047060548306699895, "grad_norm": 0.4370652735233307, "learning_rate": 7.5e-05, "loss": 1.8833, "step": 321 }, { "epoch": 0.04720715437619118, "grad_norm": 0.4299817681312561, "learning_rate": 7.5e-05, "loss": 1.8662, "step": 322 }, { "epoch": 0.04735376044568245, "grad_norm": 0.3996736407279968, "learning_rate": 7.5e-05, "loss": 1.7324, "step": 323 }, { "epoch": 0.047500366515173725, "grad_norm": Infinity, "learning_rate": 7.5e-05, 
"loss": 2.0917, "step": 324 }, { "epoch": 0.04764697258466501, "grad_norm": 0.39298534393310547, "learning_rate": 7.5e-05, "loss": 2.0445, "step": 325 }, { "epoch": 0.04779357865415628, "grad_norm": 0.4032658040523529, "learning_rate": 7.5e-05, "loss": 2.0522, "step": 326 }, { "epoch": 0.04794018472364756, "grad_norm": 0.42470312118530273, "learning_rate": 7.5e-05, "loss": 2.0347, "step": 327 }, { "epoch": 0.04808679079313884, "grad_norm": 0.4265693128108978, "learning_rate": 7.5e-05, "loss": 2.0271, "step": 328 }, { "epoch": 0.04823339686263011, "grad_norm": 0.4272522032260895, "learning_rate": 7.5e-05, "loss": 1.9173, "step": 329 }, { "epoch": 0.04838000293212139, "grad_norm": 0.4321386218070984, "learning_rate": 7.5e-05, "loss": 1.9021, "step": 330 }, { "epoch": 0.04852660900161267, "grad_norm": 0.40172508358955383, "learning_rate": 7.5e-05, "loss": 1.7721, "step": 331 }, { "epoch": 0.04867321507110394, "grad_norm": 0.4164409637451172, "learning_rate": 7.5e-05, "loss": 1.8143, "step": 332 }, { "epoch": 0.04881982114059522, "grad_norm": 0.47102516889572144, "learning_rate": 7.5e-05, "loss": 1.8947, "step": 333 }, { "epoch": 0.0489664272100865, "grad_norm": 0.41888296604156494, "learning_rate": 7.5e-05, "loss": 1.7843, "step": 334 }, { "epoch": 0.04911303327957778, "grad_norm": 0.3950113356113434, "learning_rate": 7.5e-05, "loss": 1.7458, "step": 335 }, { "epoch": 0.04925963934906905, "grad_norm": 0.41272109746932983, "learning_rate": 7.5e-05, "loss": 2.0543, "step": 336 }, { "epoch": 0.04940624541856033, "grad_norm": 0.43312135338783264, "learning_rate": 7.5e-05, "loss": 1.8529, "step": 337 }, { "epoch": 0.04955285148805161, "grad_norm": 0.423692911863327, "learning_rate": 7.5e-05, "loss": 2.0786, "step": 338 }, { "epoch": 0.04969945755754288, "grad_norm": 0.4291779696941376, "learning_rate": 7.5e-05, "loss": 1.8429, "step": 339 }, { "epoch": 0.04984606362703416, "grad_norm": 0.3997940719127655, "learning_rate": 7.5e-05, "loss": 2.0066, "step": 340 }, { "epoch": 
0.04999266969652544, "grad_norm": 0.42505770921707153, "learning_rate": 7.5e-05, "loss": 1.8996, "step": 341 }, { "epoch": 0.05013927576601671, "grad_norm": 0.7379485964775085, "learning_rate": 7.5e-05, "loss": 1.8577, "step": 342 }, { "epoch": 0.05028588183550799, "grad_norm": 0.43901753425598145, "learning_rate": 7.5e-05, "loss": 1.9743, "step": 343 }, { "epoch": 0.05043248790499927, "grad_norm": 0.4152974784374237, "learning_rate": 7.5e-05, "loss": 1.8597, "step": 344 }, { "epoch": 0.05057909397449054, "grad_norm": 0.42442166805267334, "learning_rate": 7.5e-05, "loss": 1.9777, "step": 345 }, { "epoch": 0.050725700043981824, "grad_norm": 0.4314291775226593, "learning_rate": 7.5e-05, "loss": 1.9094, "step": 346 }, { "epoch": 0.0508723061134731, "grad_norm": 0.42771726846694946, "learning_rate": 7.5e-05, "loss": 2.079, "step": 347 }, { "epoch": 0.05101891218296437, "grad_norm": 0.46686893701553345, "learning_rate": 7.5e-05, "loss": 2.0029, "step": 348 }, { "epoch": 0.051165518252455654, "grad_norm": 0.4192800223827362, "learning_rate": 7.5e-05, "loss": 1.8492, "step": 349 }, { "epoch": 0.05131212432194693, "grad_norm": 0.4271966218948364, "learning_rate": 7.5e-05, "loss": 1.9028, "step": 350 }, { "epoch": 0.0514587303914382, "grad_norm": 0.3974759578704834, "learning_rate": 7.5e-05, "loss": 2.0359, "step": 351 }, { "epoch": 0.051605336460929484, "grad_norm": 0.4625302255153656, "learning_rate": 7.5e-05, "loss": 1.9221, "step": 352 }, { "epoch": 0.05175194253042076, "grad_norm": 0.46180111169815063, "learning_rate": 7.5e-05, "loss": 1.9657, "step": 353 }, { "epoch": 0.05189854859991204, "grad_norm": 0.3923051655292511, "learning_rate": 7.5e-05, "loss": 2.0206, "step": 354 }, { "epoch": 0.052045154669403314, "grad_norm": 0.38152098655700684, "learning_rate": 7.5e-05, "loss": 1.9095, "step": 355 }, { "epoch": 0.05219176073889459, "grad_norm": 0.3919423222541809, "learning_rate": 7.5e-05, "loss": 1.9029, "step": 356 }, { "epoch": 0.05233836680838587, "grad_norm": 
0.42000991106033325, "learning_rate": 7.5e-05, "loss": 1.8228, "step": 357 }, { "epoch": 0.052484972877877144, "grad_norm": 0.422067254781723, "learning_rate": 7.5e-05, "loss": 1.8618, "step": 358 }, { "epoch": 0.05263157894736842, "grad_norm": 0.4018467962741852, "learning_rate": 7.5e-05, "loss": 1.7979, "step": 359 }, { "epoch": 0.0527781850168597, "grad_norm": 0.3940626084804535, "learning_rate": 7.5e-05, "loss": 1.9341, "step": 360 }, { "epoch": 0.052924791086350974, "grad_norm": 0.4218617081642151, "learning_rate": 7.5e-05, "loss": 1.931, "step": 361 }, { "epoch": 0.053071397155842255, "grad_norm": 0.39889469742774963, "learning_rate": 7.5e-05, "loss": 1.9289, "step": 362 }, { "epoch": 0.05321800322533353, "grad_norm": 0.3963461220264435, "learning_rate": 7.5e-05, "loss": 1.9849, "step": 363 }, { "epoch": 0.053364609294824804, "grad_norm": 0.40507304668426514, "learning_rate": 7.5e-05, "loss": 1.6929, "step": 364 }, { "epoch": 0.053511215364316085, "grad_norm": 0.38420289754867554, "learning_rate": 7.5e-05, "loss": 1.9159, "step": 365 }, { "epoch": 0.05365782143380736, "grad_norm": 0.46198320388793945, "learning_rate": 7.5e-05, "loss": 2.0082, "step": 366 }, { "epoch": 0.053804427503298634, "grad_norm": 0.38492777943611145, "learning_rate": 7.5e-05, "loss": 1.9033, "step": 367 }, { "epoch": 0.053951033572789915, "grad_norm": 0.4116688370704651, "learning_rate": 7.5e-05, "loss": 1.9682, "step": 368 }, { "epoch": 0.05409763964228119, "grad_norm": 0.44960904121398926, "learning_rate": 7.5e-05, "loss": 1.9626, "step": 369 }, { "epoch": 0.05424424571177247, "grad_norm": 0.39794275164604187, "learning_rate": 7.5e-05, "loss": 1.8855, "step": 370 }, { "epoch": 0.054390851781263745, "grad_norm": 0.4340926706790924, "learning_rate": 7.5e-05, "loss": 2.0028, "step": 371 }, { "epoch": 0.05453745785075502, "grad_norm": 0.39723294973373413, "learning_rate": 7.5e-05, "loss": 1.9301, "step": 372 }, { "epoch": 0.0546840639202463, "grad_norm": 0.45371681451797485, 
"learning_rate": 7.5e-05, "loss": 1.7457, "step": 373 }, { "epoch": 0.054830669989737575, "grad_norm": 0.5021353363990784, "learning_rate": 7.5e-05, "loss": 1.855, "step": 374 }, { "epoch": 0.05497727605922885, "grad_norm": 0.37673622369766235, "learning_rate": 7.5e-05, "loss": 1.889, "step": 375 }, { "epoch": 0.05512388212872013, "grad_norm": 0.4026258587837219, "learning_rate": 7.5e-05, "loss": 2.0549, "step": 376 }, { "epoch": 0.055270488198211405, "grad_norm": 0.41268256306648254, "learning_rate": 7.5e-05, "loss": 1.8186, "step": 377 }, { "epoch": 0.05541709426770268, "grad_norm": 0.40261054039001465, "learning_rate": 7.5e-05, "loss": 2.0186, "step": 378 }, { "epoch": 0.05556370033719396, "grad_norm": 0.412936270236969, "learning_rate": 7.5e-05, "loss": 1.9663, "step": 379 }, { "epoch": 0.055710306406685235, "grad_norm": 0.4268585741519928, "learning_rate": 7.5e-05, "loss": 1.8892, "step": 380 }, { "epoch": 0.055856912476176517, "grad_norm": 0.40223076939582825, "learning_rate": 7.5e-05, "loss": 2.0457, "step": 381 }, { "epoch": 0.05600351854566779, "grad_norm": 0.3755423128604889, "learning_rate": 7.5e-05, "loss": 1.8299, "step": 382 }, { "epoch": 0.056150124615159065, "grad_norm": 0.3995567858219147, "learning_rate": 7.5e-05, "loss": 1.9749, "step": 383 }, { "epoch": 0.056296730684650347, "grad_norm": 0.3993961215019226, "learning_rate": 7.5e-05, "loss": 2.0254, "step": 384 }, { "epoch": 0.05644333675414162, "grad_norm": 0.4198998212814331, "learning_rate": 7.5e-05, "loss": 1.9508, "step": 385 }, { "epoch": 0.056589942823632895, "grad_norm": 0.39024171233177185, "learning_rate": 7.5e-05, "loss": 1.9071, "step": 386 }, { "epoch": 0.05673654889312418, "grad_norm": 0.4029495418071747, "learning_rate": 7.5e-05, "loss": 1.8866, "step": 387 }, { "epoch": 0.05688315496261545, "grad_norm": 0.4082452356815338, "learning_rate": 7.5e-05, "loss": 2.0262, "step": 388 }, { "epoch": 0.05702976103210673, "grad_norm": 0.39697325229644775, "learning_rate": 7.5e-05, "loss": 
1.8344, "step": 389 }, { "epoch": 0.05717636710159801, "grad_norm": 0.4114176332950592, "learning_rate": 7.5e-05, "loss": 2.0149, "step": 390 }, { "epoch": 0.05732297317108928, "grad_norm": 0.4068869352340698, "learning_rate": 7.5e-05, "loss": 1.9518, "step": 391 }, { "epoch": 0.05746957924058056, "grad_norm": 0.38898399472236633, "learning_rate": 7.5e-05, "loss": 1.7891, "step": 392 }, { "epoch": 0.05761618531007184, "grad_norm": 0.39212045073509216, "learning_rate": 7.5e-05, "loss": 1.9879, "step": 393 }, { "epoch": 0.05776279137956311, "grad_norm": 0.3848034143447876, "learning_rate": 7.5e-05, "loss": 1.803, "step": 394 }, { "epoch": 0.05790939744905439, "grad_norm": 0.40813377499580383, "learning_rate": 7.5e-05, "loss": 1.8522, "step": 395 }, { "epoch": 0.05805600351854567, "grad_norm": 0.41719958186149597, "learning_rate": 7.5e-05, "loss": 2.0428, "step": 396 }, { "epoch": 0.05820260958803695, "grad_norm": 0.395730197429657, "learning_rate": 7.5e-05, "loss": 1.9202, "step": 397 }, { "epoch": 0.05834921565752822, "grad_norm": 0.40946465730667114, "learning_rate": 7.5e-05, "loss": 1.752, "step": 398 }, { "epoch": 0.0584958217270195, "grad_norm": 0.40470823645591736, "learning_rate": 7.5e-05, "loss": 1.8941, "step": 399 }, { "epoch": 0.05864242779651078, "grad_norm": 0.4607287347316742, "learning_rate": 7.5e-05, "loss": 1.8073, "step": 400 }, { "epoch": 0.05864242779651078, "eval_loss": 1.9215489625930786, "eval_runtime": 41.3249, "eval_samples_per_second": 13.285, "eval_steps_per_second": 6.655, "step": 400 }, { "epoch": 0.05878903386600205, "grad_norm": 0.40069201588630676, "learning_rate": 7.5e-05, "loss": 1.9312, "step": 401 }, { "epoch": 0.05893563993549333, "grad_norm": 0.3964191675186157, "learning_rate": 7.5e-05, "loss": 1.9439, "step": 402 }, { "epoch": 0.05908224600498461, "grad_norm": 0.38090893626213074, "learning_rate": 7.5e-05, "loss": 1.8616, "step": 403 }, { "epoch": 0.05922885207447588, "grad_norm": 0.37955179810523987, "learning_rate": 7.5e-05, 
"loss": 1.88, "step": 404 }, { "epoch": 0.059375458143967164, "grad_norm": 0.3823492228984833, "learning_rate": 7.5e-05, "loss": 1.9305, "step": 405 }, { "epoch": 0.05952206421345844, "grad_norm": 0.39403703808784485, "learning_rate": 7.5e-05, "loss": 1.9504, "step": 406 }, { "epoch": 0.05966867028294971, "grad_norm": 0.419249027967453, "learning_rate": 7.5e-05, "loss": 1.9252, "step": 407 }, { "epoch": 0.059815276352440994, "grad_norm": 0.3928826153278351, "learning_rate": 7.5e-05, "loss": 1.8077, "step": 408 }, { "epoch": 0.05996188242193227, "grad_norm": 0.3894938826560974, "learning_rate": 7.5e-05, "loss": 2.0051, "step": 409 }, { "epoch": 0.06010848849142354, "grad_norm": 0.43907618522644043, "learning_rate": 7.5e-05, "loss": 1.8877, "step": 410 }, { "epoch": 0.060255094560914824, "grad_norm": 0.39697274565696716, "learning_rate": 7.5e-05, "loss": 1.96, "step": 411 }, { "epoch": 0.0604017006304061, "grad_norm": 0.4296260178089142, "learning_rate": 7.5e-05, "loss": 2.0203, "step": 412 }, { "epoch": 0.06054830669989737, "grad_norm": 0.428838312625885, "learning_rate": 7.5e-05, "loss": 1.9341, "step": 413 }, { "epoch": 0.060694912769388654, "grad_norm": 0.4036642909049988, "learning_rate": 7.5e-05, "loss": 2.0411, "step": 414 }, { "epoch": 0.06084151883887993, "grad_norm": 0.42142724990844727, "learning_rate": 7.5e-05, "loss": 1.8061, "step": 415 }, { "epoch": 0.06098812490837121, "grad_norm": 0.401538223028183, "learning_rate": 7.5e-05, "loss": 1.947, "step": 416 }, { "epoch": 0.061134730977862484, "grad_norm": 0.3859585225582123, "learning_rate": 7.5e-05, "loss": 1.8461, "step": 417 }, { "epoch": 0.06128133704735376, "grad_norm": 0.40697985887527466, "learning_rate": 7.5e-05, "loss": 1.8269, "step": 418 }, { "epoch": 0.06142794311684504, "grad_norm": 0.41310885548591614, "learning_rate": 7.5e-05, "loss": 1.883, "step": 419 }, { "epoch": 0.061574549186336314, "grad_norm": 0.4457895755767822, "learning_rate": 7.5e-05, "loss": 1.8506, "step": 420 }, { "epoch": 
0.06172115525582759, "grad_norm": 0.42779678106307983, "learning_rate": 7.5e-05, "loss": 2.1534, "step": 421 }, { "epoch": 0.06186776132531887, "grad_norm": 0.3979230225086212, "learning_rate": 7.5e-05, "loss": 2.0296, "step": 422 }, { "epoch": 0.062014367394810144, "grad_norm": 0.3966233432292938, "learning_rate": 7.5e-05, "loss": 2.0088, "step": 423 }, { "epoch": 0.062160973464301425, "grad_norm": 0.41038623452186584, "learning_rate": 7.5e-05, "loss": 1.8389, "step": 424 }, { "epoch": 0.0623075795337927, "grad_norm": 0.4148133099079132, "learning_rate": 7.5e-05, "loss": 1.7872, "step": 425 }, { "epoch": 0.062454185603283974, "grad_norm": 0.42491552233695984, "learning_rate": 7.5e-05, "loss": 1.847, "step": 426 }, { "epoch": 0.06260079167277526, "grad_norm": 0.4301494359970093, "learning_rate": 7.5e-05, "loss": 1.933, "step": 427 }, { "epoch": 0.06274739774226654, "grad_norm": 0.39107269048690796, "learning_rate": 7.5e-05, "loss": 1.8206, "step": 428 }, { "epoch": 0.0628940038117578, "grad_norm": 0.4006537199020386, "learning_rate": 7.5e-05, "loss": 1.8767, "step": 429 }, { "epoch": 0.06304060988124909, "grad_norm": 0.4270484447479248, "learning_rate": 7.5e-05, "loss": 1.7858, "step": 430 }, { "epoch": 0.06318721595074037, "grad_norm": 0.49049848318099976, "learning_rate": 7.5e-05, "loss": 1.9357, "step": 431 }, { "epoch": 0.06333382202023163, "grad_norm": 0.502880334854126, "learning_rate": 7.5e-05, "loss": 1.9019, "step": 432 }, { "epoch": 0.06348042808972292, "grad_norm": 0.38482630252838135, "learning_rate": 7.5e-05, "loss": 1.9503, "step": 433 }, { "epoch": 0.0636270341592142, "grad_norm": 0.4305956959724426, "learning_rate": 7.5e-05, "loss": 1.9875, "step": 434 }, { "epoch": 0.06377364022870546, "grad_norm": 0.4431181848049164, "learning_rate": 7.5e-05, "loss": 1.8394, "step": 435 }, { "epoch": 0.06392024629819675, "grad_norm": 0.41794490814208984, "learning_rate": 7.5e-05, "loss": 1.9012, "step": 436 }, { "epoch": 0.06406685236768803, "grad_norm": 
0.4589599370956421, "learning_rate": 7.5e-05, "loss": 1.9891, "step": 437 }, { "epoch": 0.0642134584371793, "grad_norm": 0.4177159368991852, "learning_rate": 7.5e-05, "loss": 1.7983, "step": 438 }, { "epoch": 0.06436006450667058, "grad_norm": 0.4119695723056793, "learning_rate": 7.5e-05, "loss": 1.8253, "step": 439 }, { "epoch": 0.06450667057616186, "grad_norm": 0.43098920583724976, "learning_rate": 7.5e-05, "loss": 1.8859, "step": 440 }, { "epoch": 0.06465327664565312, "grad_norm": 0.4241313934326172, "learning_rate": 7.5e-05, "loss": 1.8772, "step": 441 }, { "epoch": 0.0647998827151444, "grad_norm": 0.42193734645843506, "learning_rate": 7.5e-05, "loss": 1.9616, "step": 442 }, { "epoch": 0.06494648878463569, "grad_norm": 0.3972153961658478, "learning_rate": 7.5e-05, "loss": 1.8275, "step": 443 }, { "epoch": 0.06509309485412697, "grad_norm": 0.4085344970226288, "learning_rate": 7.5e-05, "loss": 2.019, "step": 444 }, { "epoch": 0.06523970092361824, "grad_norm": 0.3946464955806732, "learning_rate": 7.5e-05, "loss": 1.9052, "step": 445 }, { "epoch": 0.06538630699310952, "grad_norm": 0.3861629366874695, "learning_rate": 7.5e-05, "loss": 1.9889, "step": 446 }, { "epoch": 0.0655329130626008, "grad_norm": 0.3949719965457916, "learning_rate": 7.5e-05, "loss": 1.8659, "step": 447 }, { "epoch": 0.06567951913209207, "grad_norm": 0.41953012347221375, "learning_rate": 7.5e-05, "loss": 1.7151, "step": 448 }, { "epoch": 0.06582612520158335, "grad_norm": 0.3989837169647217, "learning_rate": 7.5e-05, "loss": 1.9242, "step": 449 }, { "epoch": 0.06597273127107463, "grad_norm": 0.4501884877681732, "learning_rate": 7.5e-05, "loss": 2.0341, "step": 450 }, { "epoch": 0.0661193373405659, "grad_norm": 0.4083380699157715, "learning_rate": 7.5e-05, "loss": 1.9373, "step": 451 }, { "epoch": 0.06626594341005718, "grad_norm": 0.40291735529899597, "learning_rate": 7.5e-05, "loss": 1.9269, "step": 452 }, { "epoch": 0.06641254947954846, "grad_norm": 0.37228837609291077, "learning_rate": 7.5e-05, 
"loss": 1.9983, "step": 453 }, { "epoch": 0.06655915554903973, "grad_norm": 0.41136473417282104, "learning_rate": 7.5e-05, "loss": 2.002, "step": 454 }, { "epoch": 0.066705761618531, "grad_norm": 0.39387956261634827, "learning_rate": 7.5e-05, "loss": 2.0881, "step": 455 }, { "epoch": 0.06685236768802229, "grad_norm": 0.419813334941864, "learning_rate": 7.5e-05, "loss": 1.9009, "step": 456 }, { "epoch": 0.06699897375751356, "grad_norm": 0.6207624077796936, "learning_rate": 7.5e-05, "loss": 1.9464, "step": 457 }, { "epoch": 0.06714557982700484, "grad_norm": 0.43503573536872864, "learning_rate": 7.5e-05, "loss": 1.8342, "step": 458 }, { "epoch": 0.06729218589649612, "grad_norm": 0.4188987910747528, "learning_rate": 7.5e-05, "loss": 1.9974, "step": 459 }, { "epoch": 0.06743879196598739, "grad_norm": 0.41663435101509094, "learning_rate": 7.5e-05, "loss": 2.0094, "step": 460 }, { "epoch": 0.06758539803547867, "grad_norm": 0.4083940088748932, "learning_rate": 7.5e-05, "loss": 1.8072, "step": 461 }, { "epoch": 0.06773200410496995, "grad_norm": 0.4099334478378296, "learning_rate": 7.5e-05, "loss": 1.9012, "step": 462 }, { "epoch": 0.06787861017446123, "grad_norm": 0.39153531193733215, "learning_rate": 7.5e-05, "loss": 1.9784, "step": 463 }, { "epoch": 0.0680252162439525, "grad_norm": 0.3963199853897095, "learning_rate": 7.5e-05, "loss": 1.8558, "step": 464 }, { "epoch": 0.06817182231344378, "grad_norm": 0.4018481969833374, "learning_rate": 7.5e-05, "loss": 1.9385, "step": 465 }, { "epoch": 0.06831842838293506, "grad_norm": 0.42871642112731934, "learning_rate": 7.5e-05, "loss": 1.7274, "step": 466 }, { "epoch": 0.06846503445242633, "grad_norm": 0.39588871598243713, "learning_rate": 7.5e-05, "loss": 1.8423, "step": 467 }, { "epoch": 0.06861164052191761, "grad_norm": 0.41026967763900757, "learning_rate": 7.5e-05, "loss": 1.9033, "step": 468 }, { "epoch": 0.06875824659140889, "grad_norm": 0.37561357021331787, "learning_rate": 7.5e-05, "loss": 1.6374, "step": 469 }, { "epoch": 
0.06890485266090016, "grad_norm": 0.3817889094352722, "learning_rate": 7.5e-05, "loss": 1.6896, "step": 470 }, { "epoch": 0.06905145873039144, "grad_norm": 0.4611549377441406, "learning_rate": 7.5e-05, "loss": 2.0365, "step": 471 }, { "epoch": 0.06919806479988272, "grad_norm": 0.39855924248695374, "learning_rate": 7.5e-05, "loss": 1.8572, "step": 472 }, { "epoch": 0.06934467086937399, "grad_norm": 0.41545939445495605, "learning_rate": 7.5e-05, "loss": 1.9143, "step": 473 }, { "epoch": 0.06949127693886527, "grad_norm": 0.42043688893318176, "learning_rate": 7.5e-05, "loss": 1.9725, "step": 474 }, { "epoch": 0.06963788300835655, "grad_norm": 0.4160537123680115, "learning_rate": 7.5e-05, "loss": 1.922, "step": 475 }, { "epoch": 0.06978448907784782, "grad_norm": 0.4002605974674225, "learning_rate": 7.5e-05, "loss": 1.9837, "step": 476 }, { "epoch": 0.0699310951473391, "grad_norm": 0.3853197395801544, "learning_rate": 7.5e-05, "loss": 1.8638, "step": 477 }, { "epoch": 0.07007770121683038, "grad_norm": 0.423869252204895, "learning_rate": 7.5e-05, "loss": 1.9102, "step": 478 }, { "epoch": 0.07022430728632166, "grad_norm": 0.3727932572364807, "learning_rate": 7.5e-05, "loss": 1.8472, "step": 479 }, { "epoch": 0.07037091335581293, "grad_norm": 0.38481828570365906, "learning_rate": 7.5e-05, "loss": 1.8613, "step": 480 }, { "epoch": 0.07051751942530421, "grad_norm": 0.4016866981983185, "learning_rate": 7.5e-05, "loss": 1.9496, "step": 481 }, { "epoch": 0.07066412549479549, "grad_norm": 0.4132956266403198, "learning_rate": 7.5e-05, "loss": 1.9789, "step": 482 }, { "epoch": 0.07081073156428676, "grad_norm": 0.3970588445663452, "learning_rate": 7.5e-05, "loss": 1.9003, "step": 483 }, { "epoch": 0.07095733763377804, "grad_norm": 0.3769996762275696, "learning_rate": 7.5e-05, "loss": 2.0615, "step": 484 }, { "epoch": 0.07110394370326932, "grad_norm": 0.3887600004673004, "learning_rate": 7.5e-05, "loss": 1.7547, "step": 485 }, { "epoch": 0.07125054977276059, "grad_norm": 
0.4363369047641754, "learning_rate": 7.5e-05, "loss": 1.9918, "step": 486 }, { "epoch": 0.07139715584225187, "grad_norm": 0.40317365527153015, "learning_rate": 7.5e-05, "loss": 1.8964, "step": 487 }, { "epoch": 0.07154376191174315, "grad_norm": 0.43126386404037476, "learning_rate": 7.5e-05, "loss": 1.6955, "step": 488 }, { "epoch": 0.07169036798123442, "grad_norm": 0.42372989654541016, "learning_rate": 7.5e-05, "loss": 1.8911, "step": 489 }, { "epoch": 0.0718369740507257, "grad_norm": 0.3905870020389557, "learning_rate": 7.5e-05, "loss": 1.824, "step": 490 }, { "epoch": 0.07198358012021698, "grad_norm": 0.41900378465652466, "learning_rate": 7.5e-05, "loss": 2.1843, "step": 491 }, { "epoch": 0.07213018618970825, "grad_norm": 0.43258675932884216, "learning_rate": 7.5e-05, "loss": 1.9822, "step": 492 }, { "epoch": 0.07227679225919953, "grad_norm": 0.37334272265434265, "learning_rate": 7.5e-05, "loss": 1.7662, "step": 493 }, { "epoch": 0.07242339832869081, "grad_norm": 0.3922608196735382, "learning_rate": 7.5e-05, "loss": 1.9752, "step": 494 }, { "epoch": 0.07257000439818208, "grad_norm": 0.4066004753112793, "learning_rate": 7.5e-05, "loss": 1.849, "step": 495 }, { "epoch": 0.07271661046767336, "grad_norm": 0.38720276951789856, "learning_rate": 7.5e-05, "loss": 1.896, "step": 496 }, { "epoch": 0.07286321653716464, "grad_norm": 0.3772609531879425, "learning_rate": 7.5e-05, "loss": 1.9367, "step": 497 }, { "epoch": 0.07300982260665592, "grad_norm": 0.3675610423088074, "learning_rate": 7.5e-05, "loss": 2.0532, "step": 498 }, { "epoch": 0.07315642867614719, "grad_norm": 0.41309303045272827, "learning_rate": 7.5e-05, "loss": 1.8303, "step": 499 }, { "epoch": 0.07330303474563847, "grad_norm": 0.4001958668231964, "learning_rate": 7.5e-05, "loss": 1.8457, "step": 500 }, { "epoch": 0.07344964081512975, "grad_norm": 0.40064096450805664, "learning_rate": 7.5e-05, "loss": 1.7482, "step": 501 }, { "epoch": 0.07359624688462102, "grad_norm": 0.41334855556488037, "learning_rate": 
7.5e-05, "loss": 1.9612, "step": 502 }, { "epoch": 0.0737428529541123, "grad_norm": 0.39485424757003784, "learning_rate": 7.5e-05, "loss": 1.9812, "step": 503 }, { "epoch": 0.07388945902360358, "grad_norm": 0.4202350676059723, "learning_rate": 7.5e-05, "loss": 1.7311, "step": 504 }, { "epoch": 0.07403606509309485, "grad_norm": 0.396450012922287, "learning_rate": 7.5e-05, "loss": 1.7788, "step": 505 }, { "epoch": 0.07418267116258613, "grad_norm": 0.38840800523757935, "learning_rate": 7.5e-05, "loss": 1.8075, "step": 506 }, { "epoch": 0.07432927723207741, "grad_norm": 0.3949303925037384, "learning_rate": 7.5e-05, "loss": 1.7242, "step": 507 }, { "epoch": 0.07447588330156868, "grad_norm": 0.38407137989997864, "learning_rate": 7.5e-05, "loss": 1.7873, "step": 508 }, { "epoch": 0.07462248937105996, "grad_norm": 0.38320761919021606, "learning_rate": 7.5e-05, "loss": 1.8611, "step": 509 }, { "epoch": 0.07476909544055124, "grad_norm": 0.4282570481300354, "learning_rate": 7.5e-05, "loss": 2.0662, "step": 510 }, { "epoch": 0.07491570151004251, "grad_norm": 0.3792576193809509, "learning_rate": 7.5e-05, "loss": 2.0171, "step": 511 }, { "epoch": 0.07506230757953379, "grad_norm": 0.3987075090408325, "learning_rate": 7.5e-05, "loss": 1.9697, "step": 512 }, { "epoch": 0.07520891364902507, "grad_norm": 0.41438838839530945, "learning_rate": 7.5e-05, "loss": 1.9468, "step": 513 }, { "epoch": 0.07535551971851635, "grad_norm": 0.40679264068603516, "learning_rate": 7.5e-05, "loss": 1.9417, "step": 514 }, { "epoch": 0.07550212578800762, "grad_norm": 0.38560354709625244, "learning_rate": 7.5e-05, "loss": 1.7867, "step": 515 }, { "epoch": 0.0756487318574989, "grad_norm": 0.41253578662872314, "learning_rate": 7.5e-05, "loss": 1.7543, "step": 516 }, { "epoch": 0.07579533792699018, "grad_norm": 0.3849467933177948, "learning_rate": 7.5e-05, "loss": 1.8884, "step": 517 }, { "epoch": 0.07594194399648145, "grad_norm": 0.380755752325058, "learning_rate": 7.5e-05, "loss": 1.9158, "step": 518 }, { 
"epoch": 0.07608855006597273, "grad_norm": 0.43136078119277954, "learning_rate": 7.5e-05, "loss": 1.7965, "step": 519 }, { "epoch": 0.07623515613546401, "grad_norm": 0.39520618319511414, "learning_rate": 7.5e-05, "loss": 1.8127, "step": 520 }, { "epoch": 0.07638176220495528, "grad_norm": 0.3907357156276703, "learning_rate": 7.5e-05, "loss": 2.0351, "step": 521 }, { "epoch": 0.07652836827444656, "grad_norm": 0.3815540373325348, "learning_rate": 7.5e-05, "loss": 2.0078, "step": 522 }, { "epoch": 0.07667497434393784, "grad_norm": 0.37077435851097107, "learning_rate": 7.5e-05, "loss": 1.848, "step": 523 }, { "epoch": 0.07682158041342911, "grad_norm": 0.3760091960430145, "learning_rate": 7.5e-05, "loss": 1.8746, "step": 524 }, { "epoch": 0.07696818648292039, "grad_norm": 0.39992037415504456, "learning_rate": 7.5e-05, "loss": 2.0355, "step": 525 }, { "epoch": 0.07711479255241167, "grad_norm": 0.399991512298584, "learning_rate": 7.5e-05, "loss": 1.9635, "step": 526 }, { "epoch": 0.07726139862190294, "grad_norm": 0.3901393413543701, "learning_rate": 7.5e-05, "loss": 1.8488, "step": 527 }, { "epoch": 0.07740800469139422, "grad_norm": 0.38714471459388733, "learning_rate": 7.5e-05, "loss": 1.7786, "step": 528 }, { "epoch": 0.0775546107608855, "grad_norm": 0.37585464119911194, "learning_rate": 7.5e-05, "loss": 1.9512, "step": 529 }, { "epoch": 0.07770121683037677, "grad_norm": 0.36969706416130066, "learning_rate": 7.5e-05, "loss": 1.9106, "step": 530 }, { "epoch": 0.07784782289986805, "grad_norm": 0.39556166529655457, "learning_rate": 7.5e-05, "loss": 1.7393, "step": 531 }, { "epoch": 0.07799442896935933, "grad_norm": 0.4319993257522583, "learning_rate": 7.5e-05, "loss": 1.7583, "step": 532 }, { "epoch": 0.07814103503885061, "grad_norm": 0.39138415455818176, "learning_rate": 7.5e-05, "loss": 2.0428, "step": 533 }, { "epoch": 0.07828764110834188, "grad_norm": 0.4019283652305603, "learning_rate": 7.5e-05, "loss": 1.9562, "step": 534 }, { "epoch": 0.07843424717783316, 
"grad_norm": 0.4018367826938629, "learning_rate": 7.5e-05, "loss": 1.8743, "step": 535 }, { "epoch": 0.07858085324732444, "grad_norm": 0.39853426814079285, "learning_rate": 7.5e-05, "loss": 1.7325, "step": 536 }, { "epoch": 0.07872745931681571, "grad_norm": 0.43910351395606995, "learning_rate": 7.5e-05, "loss": 1.921, "step": 537 }, { "epoch": 0.078874065386307, "grad_norm": 0.41049429774284363, "learning_rate": 7.5e-05, "loss": 1.8563, "step": 538 }, { "epoch": 0.07902067145579827, "grad_norm": 0.4047844409942627, "learning_rate": 7.5e-05, "loss": 1.8244, "step": 539 }, { "epoch": 0.07916727752528954, "grad_norm": 0.3832705318927765, "learning_rate": 7.5e-05, "loss": 1.8544, "step": 540 }, { "epoch": 0.07931388359478082, "grad_norm": 0.38958606123924255, "learning_rate": 7.5e-05, "loss": 1.8786, "step": 541 }, { "epoch": 0.0794604896642721, "grad_norm": 3.231884241104126, "learning_rate": 7.5e-05, "loss": 1.8409, "step": 542 }, { "epoch": 0.07960709573376337, "grad_norm": 0.4067385494709015, "learning_rate": 7.5e-05, "loss": 1.8705, "step": 543 }, { "epoch": 0.07975370180325465, "grad_norm": 0.42014414072036743, "learning_rate": 7.5e-05, "loss": 1.9731, "step": 544 }, { "epoch": 0.07990030787274593, "grad_norm": 0.5196858048439026, "learning_rate": 7.5e-05, "loss": 1.9291, "step": 545 }, { "epoch": 0.0800469139422372, "grad_norm": 0.3748776316642761, "learning_rate": 7.5e-05, "loss": 2.0491, "step": 546 }, { "epoch": 0.08019352001172848, "grad_norm": 0.43794941902160645, "learning_rate": 7.5e-05, "loss": 1.8821, "step": 547 }, { "epoch": 0.08034012608121976, "grad_norm": 0.38674476742744446, "learning_rate": 7.5e-05, "loss": 1.8801, "step": 548 }, { "epoch": 0.08048673215071105, "grad_norm": 0.4209741950035095, "learning_rate": 7.5e-05, "loss": 1.8823, "step": 549 }, { "epoch": 0.08063333822020231, "grad_norm": 0.3726128041744232, "learning_rate": 7.5e-05, "loss": 1.8739, "step": 550 }, { "epoch": 0.0807799442896936, "grad_norm": 0.41400042176246643, 
"learning_rate": 7.5e-05, "loss": 1.854, "step": 551 }, { "epoch": 0.08092655035918488, "grad_norm": 0.39190444350242615, "learning_rate": 7.5e-05, "loss": 1.8934, "step": 552 }, { "epoch": 0.08107315642867614, "grad_norm": 0.4278356730937958, "learning_rate": 7.5e-05, "loss": 2.0666, "step": 553 }, { "epoch": 0.08121976249816742, "grad_norm": 0.3734931945800781, "learning_rate": 7.5e-05, "loss": 1.8077, "step": 554 }, { "epoch": 0.0813663685676587, "grad_norm": 0.41886061429977417, "learning_rate": 7.5e-05, "loss": 1.7156, "step": 555 }, { "epoch": 0.08151297463714997, "grad_norm": 0.38814809918403625, "learning_rate": 7.5e-05, "loss": 1.8549, "step": 556 }, { "epoch": 0.08165958070664125, "grad_norm": 0.433211088180542, "learning_rate": 7.5e-05, "loss": 1.9052, "step": 557 }, { "epoch": 0.08180618677613254, "grad_norm": 0.409542053937912, "learning_rate": 7.5e-05, "loss": 1.8532, "step": 558 }, { "epoch": 0.0819527928456238, "grad_norm": 0.385759562253952, "learning_rate": 7.5e-05, "loss": 1.9179, "step": 559 }, { "epoch": 0.08209939891511508, "grad_norm": 0.40890440344810486, "learning_rate": 7.5e-05, "loss": 1.7813, "step": 560 }, { "epoch": 0.08224600498460637, "grad_norm": 0.3941624164581299, "learning_rate": 7.5e-05, "loss": 1.8719, "step": 561 }, { "epoch": 0.08239261105409763, "grad_norm": 0.38557136058807373, "learning_rate": 7.5e-05, "loss": 1.8835, "step": 562 }, { "epoch": 0.08253921712358891, "grad_norm": 0.40687742829322815, "learning_rate": 7.5e-05, "loss": 2.036, "step": 563 }, { "epoch": 0.0826858231930802, "grad_norm": 0.43409228324890137, "learning_rate": 7.5e-05, "loss": 2.0392, "step": 564 }, { "epoch": 0.08283242926257146, "grad_norm": 0.4001593589782715, "learning_rate": 7.5e-05, "loss": 1.783, "step": 565 }, { "epoch": 0.08297903533206274, "grad_norm": 0.39926066994667053, "learning_rate": 7.5e-05, "loss": 1.9123, "step": 566 }, { "epoch": 0.08312564140155403, "grad_norm": 0.39239591360092163, "learning_rate": 7.5e-05, "loss": 1.8657, 
"step": 567 }, { "epoch": 0.08327224747104531, "grad_norm": 0.4209124445915222, "learning_rate": 7.5e-05, "loss": 1.9692, "step": 568 }, { "epoch": 0.08341885354053657, "grad_norm": 0.3991694152355194, "learning_rate": 7.5e-05, "loss": 1.8369, "step": 569 }, { "epoch": 0.08356545961002786, "grad_norm": 0.39924362301826477, "learning_rate": 7.5e-05, "loss": 1.8952, "step": 570 }, { "epoch": 0.08371206567951914, "grad_norm": 0.42516350746154785, "learning_rate": 7.5e-05, "loss": 1.7979, "step": 571 }, { "epoch": 0.0838586717490104, "grad_norm": 0.40779200196266174, "learning_rate": 7.5e-05, "loss": 1.9162, "step": 572 }, { "epoch": 0.08400527781850169, "grad_norm": 0.39514029026031494, "learning_rate": 7.5e-05, "loss": 1.9239, "step": 573 }, { "epoch": 0.08415188388799297, "grad_norm": 0.3728751540184021, "learning_rate": 7.5e-05, "loss": 1.9456, "step": 574 }, { "epoch": 0.08429848995748424, "grad_norm": 0.38808271288871765, "learning_rate": 7.5e-05, "loss": 2.0309, "step": 575 }, { "epoch": 0.08444509602697552, "grad_norm": 0.3882072865962982, "learning_rate": 7.5e-05, "loss": 1.7343, "step": 576 }, { "epoch": 0.0845917020964668, "grad_norm": 0.39505329728126526, "learning_rate": 7.5e-05, "loss": 1.6887, "step": 577 }, { "epoch": 0.08473830816595807, "grad_norm": 0.5232841968536377, "learning_rate": 7.5e-05, "loss": 1.906, "step": 578 }, { "epoch": 0.08488491423544935, "grad_norm": 0.39079710841178894, "learning_rate": 7.5e-05, "loss": 1.6314, "step": 579 }, { "epoch": 0.08503152030494063, "grad_norm": 0.41650664806365967, "learning_rate": 7.5e-05, "loss": 1.5315, "step": 580 }, { "epoch": 0.0851781263744319, "grad_norm": 0.38653385639190674, "learning_rate": 7.5e-05, "loss": 1.8671, "step": 581 }, { "epoch": 0.08532473244392318, "grad_norm": 0.4045540392398834, "learning_rate": 7.5e-05, "loss": 1.8334, "step": 582 }, { "epoch": 0.08547133851341446, "grad_norm": 0.37797269225120544, "learning_rate": 7.5e-05, "loss": 1.7942, "step": 583 }, { "epoch": 
0.08561794458290574, "grad_norm": 0.39920181035995483, "learning_rate": 7.5e-05, "loss": 2.0461, "step": 584 }, { "epoch": 0.085764550652397, "grad_norm": 0.38393017649650574, "learning_rate": 7.5e-05, "loss": 1.9166, "step": 585 }, { "epoch": 0.08591115672188829, "grad_norm": 0.3908819556236267, "learning_rate": 7.5e-05, "loss": 1.9372, "step": 586 }, { "epoch": 0.08605776279137957, "grad_norm": 0.37028372287750244, "learning_rate": 7.5e-05, "loss": 1.9263, "step": 587 }, { "epoch": 0.08620436886087084, "grad_norm": 0.4032405614852905, "learning_rate": 7.5e-05, "loss": 1.8114, "step": 588 }, { "epoch": 0.08635097493036212, "grad_norm": 0.36472758650779724, "learning_rate": 7.5e-05, "loss": 1.9293, "step": 589 }, { "epoch": 0.0864975809998534, "grad_norm": 0.3950622081756592, "learning_rate": 7.5e-05, "loss": 1.8194, "step": 590 }, { "epoch": 0.08664418706934467, "grad_norm": 0.3982934057712555, "learning_rate": 7.5e-05, "loss": 1.9545, "step": 591 }, { "epoch": 0.08679079313883595, "grad_norm": 0.43317797780036926, "learning_rate": 7.5e-05, "loss": 1.9407, "step": 592 }, { "epoch": 0.08693739920832723, "grad_norm": 0.38058584928512573, "learning_rate": 7.5e-05, "loss": 1.8824, "step": 593 }, { "epoch": 0.0870840052778185, "grad_norm": 0.3791728615760803, "learning_rate": 7.5e-05, "loss": 1.7262, "step": 594 }, { "epoch": 0.08723061134730978, "grad_norm": 0.37756940722465515, "learning_rate": 7.5e-05, "loss": 1.8705, "step": 595 }, { "epoch": 0.08737721741680106, "grad_norm": 0.40691080689430237, "learning_rate": 7.5e-05, "loss": 1.7155, "step": 596 }, { "epoch": 0.08752382348629233, "grad_norm": 0.4011799097061157, "learning_rate": 7.5e-05, "loss": 1.8496, "step": 597 }, { "epoch": 0.08767042955578361, "grad_norm": 0.38323119282722473, "learning_rate": 7.5e-05, "loss": 1.9287, "step": 598 }, { "epoch": 0.08781703562527489, "grad_norm": 0.3789709508419037, "learning_rate": 7.5e-05, "loss": 1.9925, "step": 599 }, { "epoch": 0.08796364169476616, "grad_norm": 
0.4210952818393707, "learning_rate": 7.5e-05, "loss": 2.0831, "step": 600 }, { "epoch": 0.08811024776425744, "grad_norm": 0.3872772753238678, "learning_rate": 7.5e-05, "loss": 1.7837, "step": 601 }, { "epoch": 0.08825685383374872, "grad_norm": 0.3870660364627838, "learning_rate": 7.5e-05, "loss": 1.9908, "step": 602 }, { "epoch": 0.08840345990324, "grad_norm": 0.41966378688812256, "learning_rate": 7.5e-05, "loss": 1.9936, "step": 603 }, { "epoch": 0.08855006597273127, "grad_norm": 0.3772909939289093, "learning_rate": 7.5e-05, "loss": 1.943, "step": 604 }, { "epoch": 0.08869667204222255, "grad_norm": 0.3639369606971741, "learning_rate": 7.5e-05, "loss": 1.8497, "step": 605 }, { "epoch": 0.08884327811171383, "grad_norm": 0.36937981843948364, "learning_rate": 7.5e-05, "loss": 1.5943, "step": 606 }, { "epoch": 0.0889898841812051, "grad_norm": 0.41827526688575745, "learning_rate": 7.5e-05, "loss": 1.7971, "step": 607 }, { "epoch": 0.08913649025069638, "grad_norm": 0.3889007568359375, "learning_rate": 7.5e-05, "loss": 1.7633, "step": 608 }, { "epoch": 0.08928309632018766, "grad_norm": 0.42772775888442993, "learning_rate": 7.5e-05, "loss": 1.7573, "step": 609 }, { "epoch": 0.08942970238967893, "grad_norm": 0.37289828062057495, "learning_rate": 7.5e-05, "loss": 1.805, "step": 610 }, { "epoch": 0.08957630845917021, "grad_norm": 0.4187123477458954, "learning_rate": 7.5e-05, "loss": 2.1241, "step": 611 }, { "epoch": 0.08972291452866149, "grad_norm": 0.396140456199646, "learning_rate": 7.5e-05, "loss": 1.9412, "step": 612 }, { "epoch": 0.08986952059815276, "grad_norm": 0.41694751381874084, "learning_rate": 7.5e-05, "loss": 1.8005, "step": 613 }, { "epoch": 0.09001612666764404, "grad_norm": 0.3956344723701477, "learning_rate": 7.5e-05, "loss": 1.9304, "step": 614 }, { "epoch": 0.09016273273713532, "grad_norm": 0.4033382833003998, "learning_rate": 7.5e-05, "loss": 1.9619, "step": 615 }, { "epoch": 0.09030933880662659, "grad_norm": 0.4084108769893646, "learning_rate": 7.5e-05, 
"loss": 1.8296, "step": 616 }, { "epoch": 0.09045594487611787, "grad_norm": 0.4304773211479187, "learning_rate": 7.5e-05, "loss": 1.8628, "step": 617 }, { "epoch": 0.09060255094560915, "grad_norm": 0.3894641399383545, "learning_rate": 7.5e-05, "loss": 1.7505, "step": 618 }, { "epoch": 0.09074915701510043, "grad_norm": 0.4113016128540039, "learning_rate": 7.5e-05, "loss": 1.8749, "step": 619 }, { "epoch": 0.0908957630845917, "grad_norm": 0.3927186131477356, "learning_rate": 7.5e-05, "loss": 1.6524, "step": 620 }, { "epoch": 0.09104236915408298, "grad_norm": 0.39804017543792725, "learning_rate": 7.5e-05, "loss": 1.9939, "step": 621 }, { "epoch": 0.09118897522357426, "grad_norm": 0.4259248673915863, "learning_rate": 7.5e-05, "loss": 1.832, "step": 622 }, { "epoch": 0.09133558129306553, "grad_norm": 0.3872550427913666, "learning_rate": 7.5e-05, "loss": 2.0034, "step": 623 }, { "epoch": 0.09148218736255681, "grad_norm": 0.4067636728286743, "learning_rate": 7.5e-05, "loss": 1.9134, "step": 624 }, { "epoch": 0.09162879343204809, "grad_norm": 0.3727567195892334, "learning_rate": 7.5e-05, "loss": 1.8517, "step": 625 }, { "epoch": 0.09177539950153936, "grad_norm": 0.42755040526390076, "learning_rate": 7.5e-05, "loss": 1.8007, "step": 626 }, { "epoch": 0.09192200557103064, "grad_norm": 0.42271697521209717, "learning_rate": 7.5e-05, "loss": 1.8911, "step": 627 }, { "epoch": 0.09206861164052192, "grad_norm": 0.3935420513153076, "learning_rate": 7.5e-05, "loss": 1.735, "step": 628 }, { "epoch": 0.09221521771001319, "grad_norm": 0.39243394136428833, "learning_rate": 7.5e-05, "loss": 1.9276, "step": 629 }, { "epoch": 0.09236182377950447, "grad_norm": 0.37158825993537903, "learning_rate": 7.5e-05, "loss": 2.0401, "step": 630 }, { "epoch": 0.09250842984899575, "grad_norm": 0.3763003349304199, "learning_rate": 7.5e-05, "loss": 1.7529, "step": 631 }, { "epoch": 0.09265503591848702, "grad_norm": 0.40861958265304565, "learning_rate": 7.5e-05, "loss": 1.7536, "step": 632 }, { "epoch": 
0.0928016419879783, "grad_norm": 0.3832313120365143, "learning_rate": 7.5e-05, "loss": 1.9921, "step": 633 }, { "epoch": 0.09294824805746958, "grad_norm": 0.39033326506614685, "learning_rate": 7.5e-05, "loss": 1.9047, "step": 634 }, { "epoch": 0.09309485412696085, "grad_norm": 0.43468335270881653, "learning_rate": 7.5e-05, "loss": 1.7455, "step": 635 }, { "epoch": 0.09324146019645213, "grad_norm": 0.3919393718242645, "learning_rate": 7.5e-05, "loss": 2.0568, "step": 636 }, { "epoch": 0.09338806626594341, "grad_norm": 0.43274378776550293, "learning_rate": 7.5e-05, "loss": 1.7461, "step": 637 }, { "epoch": 0.0935346723354347, "grad_norm": 0.41652268171310425, "learning_rate": 7.5e-05, "loss": 1.777, "step": 638 }, { "epoch": 0.09368127840492596, "grad_norm": 0.4154978394508362, "learning_rate": 7.5e-05, "loss": 1.8349, "step": 639 }, { "epoch": 0.09382788447441724, "grad_norm": 0.38528379797935486, "learning_rate": 7.5e-05, "loss": 1.9623, "step": 640 }, { "epoch": 0.09397449054390852, "grad_norm": 0.391456663608551, "learning_rate": 7.5e-05, "loss": 1.7175, "step": 641 }, { "epoch": 0.09412109661339979, "grad_norm": 0.40258175134658813, "learning_rate": 7.5e-05, "loss": 1.9281, "step": 642 }, { "epoch": 0.09426770268289107, "grad_norm": 0.40201902389526367, "learning_rate": 7.5e-05, "loss": 1.7, "step": 643 }, { "epoch": 0.09441430875238235, "grad_norm": 0.38065358996391296, "learning_rate": 7.5e-05, "loss": 2.0016, "step": 644 }, { "epoch": 0.09456091482187362, "grad_norm": 0.3814140856266022, "learning_rate": 7.5e-05, "loss": 1.9744, "step": 645 }, { "epoch": 0.0947075208913649, "grad_norm": 0.39547398686408997, "learning_rate": 7.5e-05, "loss": 1.9413, "step": 646 }, { "epoch": 0.09485412696085618, "grad_norm": 0.39026615023612976, "learning_rate": 7.5e-05, "loss": 1.7949, "step": 647 }, { "epoch": 0.09500073303034745, "grad_norm": 0.404502809047699, "learning_rate": 7.5e-05, "loss": 2.1678, "step": 648 }, { "epoch": 0.09514733909983873, "grad_norm": 
0.4099234640598297, "learning_rate": 7.5e-05, "loss": 1.8357, "step": 649 }, { "epoch": 0.09529394516933001, "grad_norm": 0.3969046473503113, "learning_rate": 7.5e-05, "loss": 1.7738, "step": 650 }, { "epoch": 0.09544055123882128, "grad_norm": 0.3875387907028198, "learning_rate": 7.5e-05, "loss": 1.6994, "step": 651 }, { "epoch": 0.09558715730831256, "grad_norm": 0.38160914182662964, "learning_rate": 7.5e-05, "loss": 1.711, "step": 652 }, { "epoch": 0.09573376337780384, "grad_norm": 0.40455079078674316, "learning_rate": 7.5e-05, "loss": 1.7128, "step": 653 }, { "epoch": 0.09588036944729512, "grad_norm": 0.3829095959663391, "learning_rate": 7.5e-05, "loss": 1.7725, "step": 654 }, { "epoch": 0.09602697551678639, "grad_norm": 0.4122965335845947, "learning_rate": 7.5e-05, "loss": 1.8145, "step": 655 }, { "epoch": 0.09617358158627767, "grad_norm": 0.3711942732334137, "learning_rate": 7.5e-05, "loss": 1.8337, "step": 656 }, { "epoch": 0.09632018765576895, "grad_norm": 0.406998872756958, "learning_rate": 7.5e-05, "loss": 1.9412, "step": 657 }, { "epoch": 0.09646679372526022, "grad_norm": 0.4287692606449127, "learning_rate": 7.5e-05, "loss": 1.8195, "step": 658 }, { "epoch": 0.0966133997947515, "grad_norm": 0.4041669964790344, "learning_rate": 7.5e-05, "loss": 1.8721, "step": 659 }, { "epoch": 0.09676000586424278, "grad_norm": 0.39231258630752563, "learning_rate": 7.5e-05, "loss": 1.862, "step": 660 }, { "epoch": 0.09690661193373405, "grad_norm": 0.42005816102027893, "learning_rate": 7.5e-05, "loss": 1.7069, "step": 661 }, { "epoch": 0.09705321800322533, "grad_norm": 0.3739244043827057, "learning_rate": 7.5e-05, "loss": 1.9296, "step": 662 }, { "epoch": 0.09719982407271661, "grad_norm": 0.4213484227657318, "learning_rate": 7.5e-05, "loss": 1.8825, "step": 663 }, { "epoch": 0.09734643014220788, "grad_norm": 0.4079893231391907, "learning_rate": 7.5e-05, "loss": 1.7744, "step": 664 }, { "epoch": 0.09749303621169916, "grad_norm": 0.39637288451194763, "learning_rate": 7.5e-05, 
"loss": 1.7517, "step": 665 }, { "epoch": 0.09763964228119044, "grad_norm": 0.3926975429058075, "learning_rate": 7.5e-05, "loss": 1.8767, "step": 666 }, { "epoch": 0.09778624835068171, "grad_norm": 0.43108201026916504, "learning_rate": 7.5e-05, "loss": 1.9279, "step": 667 }, { "epoch": 0.097932854420173, "grad_norm": 0.4032636284828186, "learning_rate": 7.5e-05, "loss": 1.9113, "step": 668 }, { "epoch": 0.09807946048966427, "grad_norm": 0.3850323259830475, "learning_rate": 7.5e-05, "loss": 1.8418, "step": 669 }, { "epoch": 0.09822606655915556, "grad_norm": 0.379642128944397, "learning_rate": 7.5e-05, "loss": 1.7567, "step": 670 }, { "epoch": 0.09837267262864682, "grad_norm": 0.3865058124065399, "learning_rate": 7.5e-05, "loss": 1.9215, "step": 671 }, { "epoch": 0.0985192786981381, "grad_norm": 0.37513697147369385, "learning_rate": 7.5e-05, "loss": 1.767, "step": 672 }, { "epoch": 0.09866588476762939, "grad_norm": 0.37311625480651855, "learning_rate": 7.5e-05, "loss": 1.7898, "step": 673 }, { "epoch": 0.09881249083712065, "grad_norm": 0.42235642671585083, "learning_rate": 7.5e-05, "loss": 1.8225, "step": 674 }, { "epoch": 0.09895909690661193, "grad_norm": 0.4208980202674866, "learning_rate": 7.5e-05, "loss": 1.9818, "step": 675 }, { "epoch": 0.09910570297610322, "grad_norm": 0.4288448095321655, "learning_rate": 7.5e-05, "loss": 1.9721, "step": 676 }, { "epoch": 0.09925230904559448, "grad_norm": 0.3728920519351959, "learning_rate": 7.5e-05, "loss": 1.9822, "step": 677 }, { "epoch": 0.09939891511508576, "grad_norm": 0.3833412528038025, "learning_rate": 7.5e-05, "loss": 1.9363, "step": 678 }, { "epoch": 0.09954552118457705, "grad_norm": 0.38505759835243225, "learning_rate": 7.5e-05, "loss": 1.7646, "step": 679 }, { "epoch": 0.09969212725406831, "grad_norm": 0.3725159466266632, "learning_rate": 7.5e-05, "loss": 1.964, "step": 680 }, { "epoch": 0.0998387333235596, "grad_norm": 0.3944285213947296, "learning_rate": 7.5e-05, "loss": 2.0458, "step": 681 }, { "epoch": 
0.09998533939305088, "grad_norm": 0.3764898478984833, "learning_rate": 7.5e-05, "loss": 1.8423, "step": 682 }, { "epoch": 0.10013194546254214, "grad_norm": 0.36613914370536804, "learning_rate": 7.5e-05, "loss": 1.8674, "step": 683 }, { "epoch": 0.10027855153203342, "grad_norm": 0.3861442804336548, "learning_rate": 7.5e-05, "loss": 1.9844, "step": 684 }, { "epoch": 0.1004251576015247, "grad_norm": 0.39129993319511414, "learning_rate": 7.5e-05, "loss": 1.8762, "step": 685 }, { "epoch": 0.10057176367101597, "grad_norm": 0.4014522433280945, "learning_rate": 7.5e-05, "loss": 1.8899, "step": 686 }, { "epoch": 0.10071836974050725, "grad_norm": 0.3749368190765381, "learning_rate": 7.5e-05, "loss": 1.9037, "step": 687 }, { "epoch": 0.10086497580999854, "grad_norm": 0.38213297724723816, "learning_rate": 7.5e-05, "loss": 1.7847, "step": 688 }, { "epoch": 0.10101158187948982, "grad_norm": 0.40072527527809143, "learning_rate": 7.5e-05, "loss": 1.7941, "step": 689 }, { "epoch": 0.10115818794898108, "grad_norm": 0.4083983302116394, "learning_rate": 7.5e-05, "loss": 1.8012, "step": 690 }, { "epoch": 0.10130479401847237, "grad_norm": 0.39744943380355835, "learning_rate": 7.5e-05, "loss": 1.8296, "step": 691 }, { "epoch": 0.10145140008796365, "grad_norm": 0.40678736567497253, "learning_rate": 7.5e-05, "loss": 1.8198, "step": 692 }, { "epoch": 0.10159800615745491, "grad_norm": 0.371538907289505, "learning_rate": 7.5e-05, "loss": 1.8323, "step": 693 }, { "epoch": 0.1017446122269462, "grad_norm": 0.3873870372772217, "learning_rate": 7.5e-05, "loss": 1.8217, "step": 694 }, { "epoch": 0.10189121829643748, "grad_norm": 0.3915213346481323, "learning_rate": 7.5e-05, "loss": 1.6817, "step": 695 }, { "epoch": 0.10203782436592874, "grad_norm": 0.3969807028770447, "learning_rate": 7.5e-05, "loss": 1.8994, "step": 696 }, { "epoch": 0.10218443043542003, "grad_norm": 0.39391449093818665, "learning_rate": 7.5e-05, "loss": 1.8555, "step": 697 }, { "epoch": 0.10233103650491131, "grad_norm": 
0.39644506573677063, "learning_rate": 7.5e-05, "loss": 1.8253, "step": 698 }, { "epoch": 0.10247764257440257, "grad_norm": 0.42519712448120117, "learning_rate": 7.5e-05, "loss": 2.0367, "step": 699 }, { "epoch": 0.10262424864389386, "grad_norm": 0.3949500322341919, "learning_rate": 7.5e-05, "loss": 1.9123, "step": 700 }, { "epoch": 0.10277085471338514, "grad_norm": 0.3809478282928467, "learning_rate": 7.5e-05, "loss": 1.8989, "step": 701 }, { "epoch": 0.1029174607828764, "grad_norm": 0.4152654707431793, "learning_rate": 7.5e-05, "loss": 2.0191, "step": 702 }, { "epoch": 0.10306406685236769, "grad_norm": 0.36832574009895325, "learning_rate": 7.5e-05, "loss": 1.9237, "step": 703 }, { "epoch": 0.10321067292185897, "grad_norm": 0.39915865659713745, "learning_rate": 7.5e-05, "loss": 1.8732, "step": 704 }, { "epoch": 0.10335727899135025, "grad_norm": 0.3648003935813904, "learning_rate": 7.5e-05, "loss": 1.6534, "step": 705 }, { "epoch": 0.10350388506084152, "grad_norm": 0.4137917757034302, "learning_rate": 7.5e-05, "loss": 1.9312, "step": 706 }, { "epoch": 0.1036504911303328, "grad_norm": 0.41573524475097656, "learning_rate": 7.5e-05, "loss": 1.9367, "step": 707 }, { "epoch": 0.10379709719982408, "grad_norm": 0.41094300150871277, "learning_rate": 7.5e-05, "loss": 1.6989, "step": 708 }, { "epoch": 0.10394370326931535, "grad_norm": 0.3570743799209595, "learning_rate": 7.5e-05, "loss": 1.7609, "step": 709 }, { "epoch": 0.10409030933880663, "grad_norm": 0.36455729603767395, "learning_rate": 7.5e-05, "loss": 1.6775, "step": 710 }, { "epoch": 0.10423691540829791, "grad_norm": 0.43082520365715027, "learning_rate": 7.5e-05, "loss": 1.8289, "step": 711 }, { "epoch": 0.10438352147778918, "grad_norm": 0.3986465334892273, "learning_rate": 7.5e-05, "loss": 1.8826, "step": 712 }, { "epoch": 0.10453012754728046, "grad_norm": 0.3945043683052063, "learning_rate": 7.5e-05, "loss": 1.6189, "step": 713 }, { "epoch": 0.10467673361677174, "grad_norm": 0.3847092092037201, "learning_rate": 
7.5e-05, "loss": 1.7825, "step": 714 }, { "epoch": 0.104823339686263, "grad_norm": 0.3838895857334137, "learning_rate": 7.5e-05, "loss": 1.7012, "step": 715 }, { "epoch": 0.10496994575575429, "grad_norm": 0.38608986139297485, "learning_rate": 7.5e-05, "loss": 1.7084, "step": 716 }, { "epoch": 0.10511655182524557, "grad_norm": 0.3869848847389221, "learning_rate": 7.5e-05, "loss": 1.7189, "step": 717 }, { "epoch": 0.10526315789473684, "grad_norm": 0.39928188920021057, "learning_rate": 7.5e-05, "loss": 1.8177, "step": 718 }, { "epoch": 0.10540976396422812, "grad_norm": 0.3825145959854126, "learning_rate": 7.5e-05, "loss": 1.7667, "step": 719 }, { "epoch": 0.1055563700337194, "grad_norm": 0.3924807012081146, "learning_rate": 7.5e-05, "loss": 1.8717, "step": 720 }, { "epoch": 0.10570297610321067, "grad_norm": 0.4087030589580536, "learning_rate": 7.5e-05, "loss": 1.9041, "step": 721 }, { "epoch": 0.10584958217270195, "grad_norm": 0.40090781450271606, "learning_rate": 7.5e-05, "loss": 1.7806, "step": 722 }, { "epoch": 0.10599618824219323, "grad_norm": 0.39495033025741577, "learning_rate": 7.5e-05, "loss": 1.8166, "step": 723 }, { "epoch": 0.10614279431168451, "grad_norm": 0.3792243003845215, "learning_rate": 7.5e-05, "loss": 1.789, "step": 724 }, { "epoch": 0.10628940038117578, "grad_norm": 0.4169251024723053, "learning_rate": 7.5e-05, "loss": 1.8308, "step": 725 }, { "epoch": 0.10643600645066706, "grad_norm": 0.37063291668891907, "learning_rate": 7.5e-05, "loss": 1.9335, "step": 726 }, { "epoch": 0.10658261252015834, "grad_norm": 0.38010549545288086, "learning_rate": 7.5e-05, "loss": 1.6441, "step": 727 }, { "epoch": 0.10672921858964961, "grad_norm": 0.3695436418056488, "learning_rate": 7.5e-05, "loss": 1.7203, "step": 728 }, { "epoch": 0.10687582465914089, "grad_norm": 0.3989785611629486, "learning_rate": 7.5e-05, "loss": 1.7305, "step": 729 }, { "epoch": 0.10702243072863217, "grad_norm": 0.42956942319869995, "learning_rate": 7.5e-05, "loss": 1.728, "step": 730 }, { 
"epoch": 0.10716903679812344, "grad_norm": 0.3977014720439911, "learning_rate": 7.5e-05, "loss": 1.8549, "step": 731 }, { "epoch": 0.10731564286761472, "grad_norm": 0.3871323764324188, "learning_rate": 7.5e-05, "loss": 1.8759, "step": 732 }, { "epoch": 0.107462248937106, "grad_norm": 0.3838559687137604, "learning_rate": 7.5e-05, "loss": 1.6979, "step": 733 }, { "epoch": 0.10760885500659727, "grad_norm": 0.38157227635383606, "learning_rate": 7.5e-05, "loss": 1.8511, "step": 734 }, { "epoch": 0.10775546107608855, "grad_norm": 0.363411009311676, "learning_rate": 7.5e-05, "loss": 1.8047, "step": 735 }, { "epoch": 0.10790206714557983, "grad_norm": 0.37893426418304443, "learning_rate": 7.5e-05, "loss": 1.8166, "step": 736 }, { "epoch": 0.1080486732150711, "grad_norm": 0.3851270079612732, "learning_rate": 7.5e-05, "loss": 1.6866, "step": 737 }, { "epoch": 0.10819527928456238, "grad_norm": 0.4072217643260956, "learning_rate": 7.5e-05, "loss": 1.8352, "step": 738 }, { "epoch": 0.10834188535405366, "grad_norm": 0.359017550945282, "learning_rate": 7.5e-05, "loss": 1.7118, "step": 739 }, { "epoch": 0.10848849142354494, "grad_norm": 0.3772314786911011, "learning_rate": 7.5e-05, "loss": 2.0069, "step": 740 }, { "epoch": 0.10863509749303621, "grad_norm": 0.378547340631485, "learning_rate": 7.5e-05, "loss": 1.8416, "step": 741 }, { "epoch": 0.10878170356252749, "grad_norm": 0.4195118844509125, "learning_rate": 7.5e-05, "loss": 1.9616, "step": 742 }, { "epoch": 0.10892830963201877, "grad_norm": 0.40370646119117737, "learning_rate": 7.5e-05, "loss": 1.8194, "step": 743 }, { "epoch": 0.10907491570151004, "grad_norm": 0.37147194147109985, "learning_rate": 7.5e-05, "loss": 1.9617, "step": 744 }, { "epoch": 0.10922152177100132, "grad_norm": 0.38484954833984375, "learning_rate": 7.5e-05, "loss": 1.6992, "step": 745 }, { "epoch": 0.1093681278404926, "grad_norm": 0.42167386412620544, "learning_rate": 7.5e-05, "loss": 1.963, "step": 746 }, { "epoch": 0.10951473390998387, "grad_norm": 
0.3640221953392029, "learning_rate": 7.5e-05, "loss": 1.8328, "step": 747 }, { "epoch": 0.10966133997947515, "grad_norm": 0.3925817608833313, "learning_rate": 7.5e-05, "loss": 1.7569, "step": 748 }, { "epoch": 0.10980794604896643, "grad_norm": 0.4130125641822815, "learning_rate": 7.5e-05, "loss": 1.8332, "step": 749 }, { "epoch": 0.1099545521184577, "grad_norm": 0.4056004583835602, "learning_rate": 7.5e-05, "loss": 1.8698, "step": 750 }, { "epoch": 0.11010115818794898, "grad_norm": 0.399223655462265, "learning_rate": 7.5e-05, "loss": 1.7726, "step": 751 }, { "epoch": 0.11024776425744026, "grad_norm": 0.4298253357410431, "learning_rate": 7.5e-05, "loss": 1.787, "step": 752 }, { "epoch": 0.11039437032693153, "grad_norm": 0.4073062539100647, "learning_rate": 7.5e-05, "loss": 2.0053, "step": 753 }, { "epoch": 0.11054097639642281, "grad_norm": 0.3919924199581146, "learning_rate": 7.5e-05, "loss": 1.9506, "step": 754 }, { "epoch": 0.11068758246591409, "grad_norm": 0.37436777353286743, "learning_rate": 7.5e-05, "loss": 1.7747, "step": 755 }, { "epoch": 0.11083418853540536, "grad_norm": 0.3917541205883026, "learning_rate": 7.5e-05, "loss": 1.8015, "step": 756 }, { "epoch": 0.11098079460489664, "grad_norm": 0.39283812046051025, "learning_rate": 7.5e-05, "loss": 1.8865, "step": 757 }, { "epoch": 0.11112740067438792, "grad_norm": 0.3837577998638153, "learning_rate": 7.5e-05, "loss": 1.9141, "step": 758 }, { "epoch": 0.1112740067438792, "grad_norm": 0.39854320883750916, "learning_rate": 7.5e-05, "loss": 1.8501, "step": 759 }, { "epoch": 0.11142061281337047, "grad_norm": 0.3995814025402069, "learning_rate": 7.5e-05, "loss": 1.754, "step": 760 }, { "epoch": 0.11156721888286175, "grad_norm": 0.38571494817733765, "learning_rate": 7.5e-05, "loss": 1.5816, "step": 761 }, { "epoch": 0.11171382495235303, "grad_norm": 0.37821289896965027, "learning_rate": 7.5e-05, "loss": 1.9295, "step": 762 }, { "epoch": 0.1118604310218443, "grad_norm": 0.4166487455368042, "learning_rate": 7.5e-05, 
"loss": 1.9182, "step": 763 }, { "epoch": 0.11200703709133558, "grad_norm": 0.37824222445487976, "learning_rate": 7.5e-05, "loss": 1.7926, "step": 764 }, { "epoch": 0.11215364316082686, "grad_norm": 0.3810991048812866, "learning_rate": 7.5e-05, "loss": 1.8399, "step": 765 }, { "epoch": 0.11230024923031813, "grad_norm": 0.3977545201778412, "learning_rate": 7.5e-05, "loss": 1.86, "step": 766 }, { "epoch": 0.11244685529980941, "grad_norm": 0.4048866629600525, "learning_rate": 7.5e-05, "loss": 1.9284, "step": 767 }, { "epoch": 0.11259346136930069, "grad_norm": 0.42181819677352905, "learning_rate": 7.5e-05, "loss": 1.967, "step": 768 }, { "epoch": 0.11274006743879196, "grad_norm": 0.3790417015552521, "learning_rate": 7.5e-05, "loss": 1.7471, "step": 769 }, { "epoch": 0.11288667350828324, "grad_norm": 0.3942205011844635, "learning_rate": 7.5e-05, "loss": 1.7824, "step": 770 }, { "epoch": 0.11303327957777452, "grad_norm": 0.4267055094242096, "learning_rate": 7.5e-05, "loss": 1.8923, "step": 771 }, { "epoch": 0.11317988564726579, "grad_norm": 0.398407518863678, "learning_rate": 7.5e-05, "loss": 1.6828, "step": 772 }, { "epoch": 0.11332649171675707, "grad_norm": 0.41717132925987244, "learning_rate": 7.5e-05, "loss": 1.866, "step": 773 }, { "epoch": 0.11347309778624835, "grad_norm": 0.40246257185935974, "learning_rate": 7.5e-05, "loss": 2.0718, "step": 774 }, { "epoch": 0.11361970385573963, "grad_norm": 0.36798885464668274, "learning_rate": 7.5e-05, "loss": 1.8446, "step": 775 }, { "epoch": 0.1137663099252309, "grad_norm": 0.39077189564704895, "learning_rate": 7.5e-05, "loss": 1.9068, "step": 776 }, { "epoch": 0.11391291599472218, "grad_norm": 0.39612439274787903, "learning_rate": 7.5e-05, "loss": 1.8593, "step": 777 }, { "epoch": 0.11405952206421346, "grad_norm": 0.3740413784980774, "learning_rate": 7.5e-05, "loss": 1.8783, "step": 778 }, { "epoch": 0.11420612813370473, "grad_norm": 0.4108491539955139, "learning_rate": 7.5e-05, "loss": 1.7334, "step": 779 }, { "epoch": 
0.11435273420319601, "grad_norm": 0.39279866218566895, "learning_rate": 7.5e-05, "loss": 1.7506, "step": 780 }, { "epoch": 0.1144993402726873, "grad_norm": 0.40449056029319763, "learning_rate": 7.5e-05, "loss": 1.7506, "step": 781 }, { "epoch": 0.11464594634217856, "grad_norm": 0.4172016978263855, "learning_rate": 7.5e-05, "loss": 1.8035, "step": 782 }, { "epoch": 0.11479255241166984, "grad_norm": 0.37189945578575134, "learning_rate": 7.5e-05, "loss": 1.83, "step": 783 }, { "epoch": 0.11493915848116112, "grad_norm": 0.4685274660587311, "learning_rate": 7.5e-05, "loss": 1.9008, "step": 784 }, { "epoch": 0.11508576455065239, "grad_norm": 0.3798149526119232, "learning_rate": 7.5e-05, "loss": 1.7448, "step": 785 }, { "epoch": 0.11523237062014367, "grad_norm": 0.399687260389328, "learning_rate": 7.5e-05, "loss": 1.8747, "step": 786 }, { "epoch": 0.11537897668963495, "grad_norm": 0.3818066418170929, "learning_rate": 7.5e-05, "loss": 1.7734, "step": 787 }, { "epoch": 0.11552558275912622, "grad_norm": 0.3742682933807373, "learning_rate": 7.5e-05, "loss": 1.8579, "step": 788 }, { "epoch": 0.1156721888286175, "grad_norm": 0.4148941934108734, "learning_rate": 7.5e-05, "loss": 1.915, "step": 789 }, { "epoch": 0.11581879489810878, "grad_norm": 0.42194920778274536, "learning_rate": 7.5e-05, "loss": 1.8239, "step": 790 }, { "epoch": 0.11596540096760005, "grad_norm": 0.39349424839019775, "learning_rate": 7.5e-05, "loss": 1.8559, "step": 791 }, { "epoch": 0.11611200703709133, "grad_norm": 0.3876720070838928, "learning_rate": 7.5e-05, "loss": 1.8592, "step": 792 }, { "epoch": 0.11625861310658261, "grad_norm": 0.3978204131126404, "learning_rate": 7.5e-05, "loss": 1.9137, "step": 793 }, { "epoch": 0.1164052191760739, "grad_norm": 0.3648659586906433, "learning_rate": 7.5e-05, "loss": 1.6602, "step": 794 }, { "epoch": 0.11655182524556516, "grad_norm": 0.3913882374763489, "learning_rate": 7.5e-05, "loss": 1.7843, "step": 795 }, { "epoch": 0.11669843131505644, "grad_norm": 
0.36403462290763855, "learning_rate": 7.5e-05, "loss": 1.8305, "step": 796 }, { "epoch": 0.11684503738454773, "grad_norm": 0.38562655448913574, "learning_rate": 7.5e-05, "loss": 1.7028, "step": 797 }, { "epoch": 0.116991643454039, "grad_norm": 0.380679190158844, "learning_rate": 7.5e-05, "loss": 1.8858, "step": 798 }, { "epoch": 0.11713824952353027, "grad_norm": 0.4230104386806488, "learning_rate": 7.5e-05, "loss": 1.8208, "step": 799 }, { "epoch": 0.11728485559302156, "grad_norm": 0.3927924335002899, "learning_rate": 7.5e-05, "loss": 1.554, "step": 800 }, { "epoch": 0.11728485559302156, "eval_loss": 1.839439034461975, "eval_runtime": 41.5483, "eval_samples_per_second": 13.214, "eval_steps_per_second": 6.619, "step": 800 } ], "logging_steps": 1.0, "max_steps": 6821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.376799147655168e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }