finetuned_models/llama2_muying_200e/trainer_state.json

2082 lines
48 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 181.8181818181818,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.91,
"eval_loss": 1.7933429479599,
"eval_runtime": 0.607,
"eval_samples_per_second": 289.962,
"eval_steps_per_second": 9.885,
"step": 5
},
{
"epoch": 1.82,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.8069,
"step": 10
},
{
"epoch": 2.0,
"eval_loss": 1.7829440832138062,
"eval_runtime": 0.6089,
"eval_samples_per_second": 289.045,
"eval_steps_per_second": 9.854,
"step": 11
},
{
"epoch": 2.91,
"eval_loss": 1.7655202150344849,
"eval_runtime": 0.6114,
"eval_samples_per_second": 287.852,
"eval_steps_per_second": 9.813,
"step": 16
},
{
"epoch": 3.64,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.7929,
"step": 20
},
{
"epoch": 4.0,
"eval_loss": 1.7314362525939941,
"eval_runtime": 0.6066,
"eval_samples_per_second": 290.12,
"eval_steps_per_second": 9.89,
"step": 22
},
{
"epoch": 4.91,
"eval_loss": 1.687442421913147,
"eval_runtime": 0.6137,
"eval_samples_per_second": 286.793,
"eval_steps_per_second": 9.777,
"step": 27
},
{
"epoch": 5.45,
"learning_rate": 5e-05,
"loss": 1.7324,
"step": 30
},
{
"epoch": 6.0,
"eval_loss": 1.6319729089736938,
"eval_runtime": 0.6106,
"eval_samples_per_second": 288.249,
"eval_steps_per_second": 9.827,
"step": 33
},
{
"epoch": 6.91,
"eval_loss": 1.6018821001052856,
"eval_runtime": 0.6146,
"eval_samples_per_second": 286.357,
"eval_steps_per_second": 9.762,
"step": 38
},
{
"epoch": 7.27,
"learning_rate": 4.998688922613788e-05,
"loss": 1.6419,
"step": 40
},
{
"epoch": 8.0,
"eval_loss": 1.5704588890075684,
"eval_runtime": 0.6099,
"eval_samples_per_second": 288.582,
"eval_steps_per_second": 9.838,
"step": 44
},
{
"epoch": 8.91,
"eval_loss": 1.5536445379257202,
"eval_runtime": 0.6172,
"eval_samples_per_second": 285.154,
"eval_steps_per_second": 9.721,
"step": 49
},
{
"epoch": 9.09,
"learning_rate": 4.9947570655942796e-05,
"loss": 1.5731,
"step": 50
},
{
"epoch": 10.0,
"eval_loss": 1.5355907678604126,
"eval_runtime": 0.6124,
"eval_samples_per_second": 287.408,
"eval_steps_per_second": 9.798,
"step": 55
},
{
"epoch": 10.91,
"learning_rate": 4.988208552916535e-05,
"loss": 1.5498,
"step": 60
},
{
"epoch": 10.91,
"eval_loss": 1.5225105285644531,
"eval_runtime": 0.6148,
"eval_samples_per_second": 286.264,
"eval_steps_per_second": 9.759,
"step": 60
},
{
"epoch": 12.0,
"eval_loss": 1.509987235069275,
"eval_runtime": 0.6161,
"eval_samples_per_second": 285.65,
"eval_steps_per_second": 9.738,
"step": 66
},
{
"epoch": 12.73,
"learning_rate": 4.9790502530660635e-05,
"loss": 1.5188,
"step": 70
},
{
"epoch": 12.91,
"eval_loss": 1.5007643699645996,
"eval_runtime": 0.6136,
"eval_samples_per_second": 286.848,
"eval_steps_per_second": 9.779,
"step": 71
},
{
"epoch": 14.0,
"eval_loss": 1.4901142120361328,
"eval_runtime": 0.6114,
"eval_samples_per_second": 287.851,
"eval_steps_per_second": 9.813,
"step": 77
},
{
"epoch": 14.55,
"learning_rate": 4.967291771834727e-05,
"loss": 1.5005,
"step": 80
},
{
"epoch": 14.91,
"eval_loss": 1.481990098953247,
"eval_runtime": 0.6166,
"eval_samples_per_second": 285.454,
"eval_steps_per_second": 9.731,
"step": 82
},
{
"epoch": 16.0,
"eval_loss": 1.4730418920516968,
"eval_runtime": 0.6164,
"eval_samples_per_second": 285.532,
"eval_steps_per_second": 9.734,
"step": 88
},
{
"epoch": 16.36,
"learning_rate": 4.9529454422455976e-05,
"loss": 1.4814,
"step": 90
},
{
"epoch": 16.91,
"eval_loss": 1.4652411937713623,
"eval_runtime": 0.6742,
"eval_samples_per_second": 261.038,
"eval_steps_per_second": 8.899,
"step": 93
},
{
"epoch": 18.0,
"eval_loss": 1.4562301635742188,
"eval_runtime": 0.6889,
"eval_samples_per_second": 255.479,
"eval_steps_per_second": 8.71,
"step": 99
},
{
"epoch": 18.18,
"learning_rate": 4.936026311617316e-05,
"loss": 1.4618,
"step": 100
},
{
"epoch": 18.91,
"eval_loss": 1.4488506317138672,
"eval_runtime": 0.6201,
"eval_samples_per_second": 283.827,
"eval_steps_per_second": 9.676,
"step": 104
},
{
"epoch": 20.0,
"learning_rate": 4.916552125781528e-05,
"loss": 1.4468,
"step": 110
},
{
"epoch": 20.0,
"eval_loss": 1.4396686553955078,
"eval_runtime": 0.6087,
"eval_samples_per_second": 289.16,
"eval_steps_per_second": 9.858,
"step": 110
},
{
"epoch": 20.91,
"eval_loss": 1.4325110912322998,
"eval_runtime": 0.6159,
"eval_samples_per_second": 285.768,
"eval_steps_per_second": 9.742,
"step": 115
},
{
"epoch": 21.82,
"learning_rate": 4.894543310469968e-05,
"loss": 1.4198,
"step": 120
},
{
"epoch": 22.0,
"eval_loss": 1.4230388402938843,
"eval_runtime": 0.6122,
"eval_samples_per_second": 287.508,
"eval_steps_per_second": 9.801,
"step": 121
},
{
"epoch": 22.91,
"eval_loss": 1.4148945808410645,
"eval_runtime": 0.6173,
"eval_samples_per_second": 285.131,
"eval_steps_per_second": 9.72,
"step": 126
},
{
"epoch": 23.64,
"learning_rate": 4.870022949890676e-05,
"loss": 1.4001,
"step": 130
},
{
"epoch": 24.0,
"eval_loss": 1.4044132232666016,
"eval_runtime": 0.6099,
"eval_samples_per_second": 288.552,
"eval_steps_per_second": 9.837,
"step": 132
},
{
"epoch": 24.91,
"eval_loss": 1.3964232206344604,
"eval_runtime": 0.6151,
"eval_samples_per_second": 286.143,
"eval_steps_per_second": 9.755,
"step": 137
},
{
"epoch": 25.45,
"learning_rate": 4.8430167625158595e-05,
"loss": 1.3809,
"step": 140
},
{
"epoch": 26.0,
"eval_loss": 1.3872698545455933,
"eval_runtime": 0.6134,
"eval_samples_per_second": 286.947,
"eval_steps_per_second": 9.782,
"step": 143
},
{
"epoch": 26.91,
"eval_loss": 1.3796738386154175,
"eval_runtime": 0.6136,
"eval_samples_per_second": 286.826,
"eval_steps_per_second": 9.778,
"step": 148
},
{
"epoch": 27.27,
"learning_rate": 4.813553074106761e-05,
"loss": 1.3763,
"step": 150
},
{
"epoch": 28.0,
"eval_loss": 1.3704036474227905,
"eval_runtime": 0.6104,
"eval_samples_per_second": 288.339,
"eval_steps_per_second": 9.83,
"step": 154
},
{
"epoch": 28.91,
"eval_loss": 1.3635640144348145,
"eval_runtime": 0.6147,
"eval_samples_per_second": 286.317,
"eval_steps_per_second": 9.761,
"step": 159
},
{
"epoch": 29.09,
"learning_rate": 4.781662788003851e-05,
"loss": 1.3428,
"step": 160
},
{
"epoch": 30.0,
"eval_loss": 1.352755069732666,
"eval_runtime": 0.6087,
"eval_samples_per_second": 289.163,
"eval_steps_per_second": 9.858,
"step": 165
},
{
"epoch": 30.91,
"learning_rate": 4.747379352713489e-05,
"loss": 1.3324,
"step": 170
},
{
"epoch": 30.91,
"eval_loss": 1.345137119293213,
"eval_runtime": 0.6132,
"eval_samples_per_second": 287.009,
"eval_steps_per_second": 9.784,
"step": 170
},
{
"epoch": 32.0,
"eval_loss": 1.3328704833984375,
"eval_runtime": 0.6088,
"eval_samples_per_second": 289.094,
"eval_steps_per_second": 9.855,
"step": 176
},
{
"epoch": 32.73,
"learning_rate": 4.710738726825059e-05,
"loss": 1.3054,
"step": 180
},
{
"epoch": 32.91,
"eval_loss": 1.325918197631836,
"eval_runtime": 0.6149,
"eval_samples_per_second": 286.23,
"eval_steps_per_second": 9.758,
"step": 181
},
{
"epoch": 34.0,
"eval_loss": 1.3172450065612793,
"eval_runtime": 0.6114,
"eval_samples_per_second": 287.875,
"eval_steps_per_second": 9.814,
"step": 187
},
{
"epoch": 34.55,
"learning_rate": 4.671779341295378e-05,
"loss": 1.2812,
"step": 190
},
{
"epoch": 34.91,
"eval_loss": 1.311617374420166,
"eval_runtime": 0.6199,
"eval_samples_per_second": 283.926,
"eval_steps_per_second": 9.679,
"step": 192
},
{
"epoch": 36.0,
"eval_loss": 1.3009544610977173,
"eval_runtime": 0.6192,
"eval_samples_per_second": 284.26,
"eval_steps_per_second": 9.691,
"step": 198
},
{
"epoch": 36.36,
"learning_rate": 4.630542059139924e-05,
"loss": 1.2588,
"step": 200
},
{
"epoch": 36.91,
"eval_loss": 1.2909127473831177,
"eval_runtime": 0.6233,
"eval_samples_per_second": 282.357,
"eval_steps_per_second": 9.626,
"step": 203
},
{
"epoch": 38.0,
"eval_loss": 1.2811715602874756,
"eval_runtime": 0.6096,
"eval_samples_per_second": 288.711,
"eval_steps_per_second": 9.842,
"step": 209
},
{
"epoch": 38.18,
"learning_rate": 4.587070132573178e-05,
"loss": 1.2348,
"step": 210
},
{
"epoch": 38.91,
"eval_loss": 1.2706265449523926,
"eval_runtime": 0.6133,
"eval_samples_per_second": 286.993,
"eval_steps_per_second": 9.784,
"step": 214
},
{
"epoch": 40.0,
"learning_rate": 4.541409157643027e-05,
"loss": 1.2162,
"step": 220
},
{
"epoch": 40.0,
"eval_loss": 1.2609130144119263,
"eval_runtime": 0.6137,
"eval_samples_per_second": 286.801,
"eval_steps_per_second": 9.777,
"step": 220
},
{
"epoch": 40.91,
"eval_loss": 1.2516368627548218,
"eval_runtime": 0.6128,
"eval_samples_per_second": 287.191,
"eval_steps_per_second": 9.791,
"step": 225
},
{
"epoch": 41.82,
"learning_rate": 4.493607026406802e-05,
"loss": 1.1919,
"step": 230
},
{
"epoch": 42.0,
"eval_loss": 1.2416428327560425,
"eval_runtime": 0.6114,
"eval_samples_per_second": 287.857,
"eval_steps_per_second": 9.813,
"step": 231
},
{
"epoch": 42.91,
"eval_loss": 1.236132264137268,
"eval_runtime": 0.6144,
"eval_samples_per_second": 286.465,
"eval_steps_per_second": 9.766,
"step": 236
},
{
"epoch": 43.64,
"learning_rate": 4.443713876699124e-05,
"loss": 1.1616,
"step": 240
},
{
"epoch": 44.0,
"eval_loss": 1.2243030071258545,
"eval_runtime": 0.6144,
"eval_samples_per_second": 286.449,
"eval_steps_per_second": 9.765,
"step": 242
},
{
"epoch": 44.91,
"eval_loss": 1.212420105934143,
"eval_runtime": 0.6198,
"eval_samples_per_second": 283.949,
"eval_steps_per_second": 9.68,
"step": 247
},
{
"epoch": 45.45,
"learning_rate": 4.391782039544238e-05,
"loss": 1.1442,
"step": 250
},
{
"epoch": 46.0,
"eval_loss": 1.2018734216690063,
"eval_runtime": 0.6136,
"eval_samples_per_second": 286.821,
"eval_steps_per_second": 9.778,
"step": 253
},
{
"epoch": 46.91,
"eval_loss": 1.1946231126785278,
"eval_runtime": 0.6128,
"eval_samples_per_second": 287.226,
"eval_steps_per_second": 9.792,
"step": 258
},
{
"epoch": 47.27,
"learning_rate": 4.337865984268001e-05,
"loss": 1.1198,
"step": 260
},
{
"epoch": 48.0,
"eval_loss": 1.1835122108459473,
"eval_runtime": 0.6113,
"eval_samples_per_second": 287.929,
"eval_steps_per_second": 9.816,
"step": 264
},
{
"epoch": 48.91,
"eval_loss": 1.1760563850402832,
"eval_runtime": 0.6123,
"eval_samples_per_second": 287.42,
"eval_steps_per_second": 9.798,
"step": 269
},
{
"epoch": 49.09,
"learning_rate": 4.2820222613670736e-05,
"loss": 1.1057,
"step": 270
},
{
"epoch": 50.0,
"eval_loss": 1.1683567762374878,
"eval_runtime": 0.6127,
"eval_samples_per_second": 287.272,
"eval_steps_per_second": 9.793,
"step": 275
},
{
"epoch": 50.91,
"learning_rate": 4.224309443195261e-05,
"loss": 1.085,
"step": 280
},
{
"epoch": 50.91,
"eval_loss": 1.1578781604766846,
"eval_runtime": 0.6096,
"eval_samples_per_second": 288.731,
"eval_steps_per_second": 9.843,
"step": 280
},
{
"epoch": 52.0,
"eval_loss": 1.1446937322616577,
"eval_runtime": 0.6159,
"eval_samples_per_second": 285.773,
"eval_steps_per_second": 9.742,
"step": 286
},
{
"epoch": 52.73,
"learning_rate": 4.164788062529203e-05,
"loss": 1.0643,
"step": 290
},
{
"epoch": 52.91,
"eval_loss": 1.1339915990829468,
"eval_runtime": 0.6101,
"eval_samples_per_second": 288.475,
"eval_steps_per_second": 9.834,
"step": 291
},
{
"epoch": 54.0,
"eval_loss": 1.121987223625183,
"eval_runtime": 0.6079,
"eval_samples_per_second": 289.54,
"eval_steps_per_second": 9.871,
"step": 297
},
{
"epoch": 54.55,
"learning_rate": 4.10352054907785e-05,
"loss": 1.0547,
"step": 300
},
{
"epoch": 54.91,
"eval_loss": 1.111721158027649,
"eval_runtime": 0.6168,
"eval_samples_per_second": 285.363,
"eval_steps_per_second": 9.728,
"step": 302
},
{
"epoch": 56.0,
"eval_loss": 1.1022990942001343,
"eval_runtime": 0.6113,
"eval_samples_per_second": 287.898,
"eval_steps_per_second": 9.815,
"step": 308
},
{
"epoch": 56.36,
"learning_rate": 4.0405711640023186e-05,
"loss": 1.0196,
"step": 310
},
{
"epoch": 56.91,
"eval_loss": 1.0926016569137573,
"eval_runtime": 0.6138,
"eval_samples_per_second": 286.748,
"eval_steps_per_second": 9.776,
"step": 313
},
{
"epoch": 58.0,
"eval_loss": 1.0840771198272705,
"eval_runtime": 0.6136,
"eval_samples_per_second": 286.836,
"eval_steps_per_second": 9.779,
"step": 319
},
{
"epoch": 58.18,
"learning_rate": 3.976005932514807e-05,
"loss": 1.019,
"step": 320
},
{
"epoch": 58.91,
"eval_loss": 1.078185796737671,
"eval_runtime": 0.6827,
"eval_samples_per_second": 257.794,
"eval_steps_per_second": 8.788,
"step": 324
},
{
"epoch": 60.0,
"learning_rate": 3.909892574627266e-05,
"loss": 0.9804,
"step": 330
},
{
"epoch": 60.0,
"eval_loss": 1.0690256357192993,
"eval_runtime": 0.6175,
"eval_samples_per_second": 285.011,
"eval_steps_per_second": 9.716,
"step": 330
},
{
"epoch": 60.91,
"eval_loss": 1.0633498430252075,
"eval_runtime": 0.6096,
"eval_samples_per_second": 288.703,
"eval_steps_per_second": 9.842,
"step": 335
},
{
"epoch": 61.82,
"learning_rate": 3.84230043412246e-05,
"loss": 0.9693,
"step": 340
},
{
"epoch": 62.0,
"eval_loss": 1.0553932189941406,
"eval_runtime": 0.6095,
"eval_samples_per_second": 288.771,
"eval_steps_per_second": 9.844,
"step": 341
},
{
"epoch": 62.91,
"eval_loss": 1.0476597547531128,
"eval_runtime": 0.6127,
"eval_samples_per_second": 287.234,
"eval_steps_per_second": 9.792,
"step": 346
},
{
"epoch": 63.64,
"learning_rate": 3.773300405821908e-05,
"loss": 0.959,
"step": 350
},
{
"epoch": 64.0,
"eval_loss": 1.0392884016036987,
"eval_runtime": 0.6091,
"eval_samples_per_second": 288.946,
"eval_steps_per_second": 9.85,
"step": 352
},
{
"epoch": 64.91,
"eval_loss": 1.0319526195526123,
"eval_runtime": 0.6076,
"eval_samples_per_second": 289.658,
"eval_steps_per_second": 9.875,
"step": 357
},
{
"epoch": 65.45,
"learning_rate": 3.702964861227013e-05,
"loss": 0.934,
"step": 360
},
{
"epoch": 66.0,
"eval_loss": 1.0237526893615723,
"eval_runtime": 0.622,
"eval_samples_per_second": 282.945,
"eval_steps_per_second": 9.646,
"step": 363
},
{
"epoch": 66.91,
"eval_loss": 1.0172982215881348,
"eval_runtime": 0.6161,
"eval_samples_per_second": 285.649,
"eval_steps_per_second": 9.738,
"step": 368
},
{
"epoch": 67.27,
"learning_rate": 3.631367572611348e-05,
"loss": 0.9182,
"step": 370
},
{
"epoch": 68.0,
"eval_loss": 1.0093735456466675,
"eval_runtime": 0.6113,
"eval_samples_per_second": 287.888,
"eval_steps_per_second": 9.814,
"step": 374
},
{
"epoch": 68.91,
"eval_loss": 1.0004040002822876,
"eval_runtime": 0.618,
"eval_samples_per_second": 284.808,
"eval_steps_per_second": 9.709,
"step": 379
},
{
"epoch": 69.09,
"learning_rate": 3.5585836356437264e-05,
"loss": 0.9087,
"step": 380
},
{
"epoch": 70.0,
"eval_loss": 0.9954872727394104,
"eval_runtime": 0.6104,
"eval_samples_per_second": 288.329,
"eval_steps_per_second": 9.829,
"step": 385
},
{
"epoch": 70.91,
"learning_rate": 3.484689390623218e-05,
"loss": 0.8923,
"step": 390
},
{
"epoch": 70.91,
"eval_loss": 0.9893795251846313,
"eval_runtime": 0.6099,
"eval_samples_per_second": 288.583,
"eval_steps_per_second": 9.838,
"step": 390
},
{
"epoch": 72.0,
"eval_loss": 0.9800569415092468,
"eval_runtime": 0.6099,
"eval_samples_per_second": 288.549,
"eval_steps_per_second": 9.837,
"step": 396
},
{
"epoch": 72.73,
"learning_rate": 3.409762342408719e-05,
"loss": 0.8811,
"step": 400
},
{
"epoch": 72.91,
"eval_loss": 0.9731982946395874,
"eval_runtime": 0.6102,
"eval_samples_per_second": 288.43,
"eval_steps_per_second": 9.833,
"step": 401
},
{
"epoch": 74.0,
"eval_loss": 0.9653474688529968,
"eval_runtime": 0.6102,
"eval_samples_per_second": 288.428,
"eval_steps_per_second": 9.833,
"step": 407
},
{
"epoch": 74.55,
"learning_rate": 3.333881079127052e-05,
"loss": 0.8659,
"step": 410
},
{
"epoch": 74.91,
"eval_loss": 0.9584926962852478,
"eval_runtime": 0.6084,
"eval_samples_per_second": 289.293,
"eval_steps_per_second": 9.862,
"step": 412
},
{
"epoch": 76.0,
"eval_loss": 0.9506881237030029,
"eval_runtime": 0.6093,
"eval_samples_per_second": 288.844,
"eval_steps_per_second": 9.847,
"step": 418
},
{
"epoch": 76.36,
"learning_rate": 3.2571251897448765e-05,
"loss": 0.8488,
"step": 420
},
{
"epoch": 76.91,
"eval_loss": 0.9450792074203491,
"eval_runtime": 0.6111,
"eval_samples_per_second": 288.027,
"eval_steps_per_second": 9.819,
"step": 423
},
{
"epoch": 78.0,
"eval_loss": 0.9382766485214233,
"eval_runtime": 0.6157,
"eval_samples_per_second": 285.841,
"eval_steps_per_second": 9.745,
"step": 429
},
{
"epoch": 78.18,
"learning_rate": 3.1795751805908573e-05,
"loss": 0.8388,
"step": 430
},
{
"epoch": 78.91,
"eval_loss": 0.9338158965110779,
"eval_runtime": 0.6111,
"eval_samples_per_second": 288.005,
"eval_steps_per_second": 9.818,
"step": 434
},
{
"epoch": 80.0,
"learning_rate": 3.101312390915634e-05,
"loss": 0.8217,
"step": 440
},
{
"epoch": 80.0,
"eval_loss": 0.9296420812606812,
"eval_runtime": 0.6105,
"eval_samples_per_second": 288.3,
"eval_steps_per_second": 9.828,
"step": 440
},
{
"epoch": 80.91,
"eval_loss": 0.9249951839447021,
"eval_runtime": 0.6141,
"eval_samples_per_second": 286.62,
"eval_steps_per_second": 9.771,
"step": 445
},
{
"epoch": 81.82,
"learning_rate": 3.0224189075781884e-05,
"loss": 0.8126,
"step": 450
},
{
"epoch": 82.0,
"eval_loss": 0.9183826446533203,
"eval_runtime": 0.613,
"eval_samples_per_second": 287.101,
"eval_steps_per_second": 9.788,
"step": 451
},
{
"epoch": 82.91,
"eval_loss": 0.9152975678443909,
"eval_runtime": 0.616,
"eval_samples_per_second": 285.699,
"eval_steps_per_second": 9.74,
"step": 456
},
{
"epoch": 83.64,
"learning_rate": 2.9429774789480575e-05,
"loss": 0.7976,
"step": 460
},
{
"epoch": 84.0,
"eval_loss": 0.9079199433326721,
"eval_runtime": 0.6122,
"eval_samples_per_second": 287.479,
"eval_steps_per_second": 9.8,
"step": 462
},
{
"epoch": 84.91,
"eval_loss": 0.9032623767852783,
"eval_runtime": 0.6116,
"eval_samples_per_second": 287.793,
"eval_steps_per_second": 9.811,
"step": 467
},
{
"epoch": 85.45,
"learning_rate": 2.863071428113726e-05,
"loss": 0.78,
"step": 470
},
{
"epoch": 86.0,
"eval_loss": 0.8960713744163513,
"eval_runtime": 0.6103,
"eval_samples_per_second": 288.379,
"eval_steps_per_second": 9.831,
"step": 473
},
{
"epoch": 86.91,
"eval_loss": 0.8901066184043884,
"eval_runtime": 0.6174,
"eval_samples_per_second": 285.065,
"eval_steps_per_second": 9.718,
"step": 478
},
{
"epoch": 87.27,
"learning_rate": 2.782784565488211e-05,
"loss": 0.7803,
"step": 480
},
{
"epoch": 88.0,
"eval_loss": 0.8826420307159424,
"eval_runtime": 0.6114,
"eval_samples_per_second": 287.842,
"eval_steps_per_second": 9.813,
"step": 484
},
{
"epoch": 88.91,
"eval_loss": 0.8768661022186279,
"eval_runtime": 0.6139,
"eval_samples_per_second": 286.679,
"eval_steps_per_second": 9.773,
"step": 489
},
{
"epoch": 89.09,
"learning_rate": 2.7022011009035107e-05,
"loss": 0.7688,
"step": 490
},
{
"epoch": 90.0,
"eval_loss": 0.8718900680541992,
"eval_runtime": 0.6108,
"eval_samples_per_second": 288.158,
"eval_steps_per_second": 9.824,
"step": 495
},
{
"epoch": 90.91,
"learning_rate": 2.621405555286121e-05,
"loss": 0.7613,
"step": 500
},
{
"epoch": 90.91,
"eval_loss": 0.8661989569664001,
"eval_runtime": 0.605,
"eval_samples_per_second": 290.887,
"eval_steps_per_second": 9.917,
"step": 500
},
{
"epoch": 92.0,
"eval_loss": 0.860860288143158,
"eval_runtime": 0.6106,
"eval_samples_per_second": 288.244,
"eval_steps_per_second": 9.826,
"step": 506
},
{
"epoch": 92.73,
"learning_rate": 2.540482672006254e-05,
"loss": 0.7462,
"step": 510
},
{
"epoch": 92.91,
"eval_loss": 0.8560842275619507,
"eval_runtime": 0.6154,
"eval_samples_per_second": 286.006,
"eval_steps_per_second": 9.75,
"step": 511
},
{
"epoch": 94.0,
"eval_loss": 0.8527078032493591,
"eval_runtime": 0.6118,
"eval_samples_per_second": 287.658,
"eval_steps_per_second": 9.807,
"step": 517
},
{
"epoch": 94.55,
"learning_rate": 2.4595173279937464e-05,
"loss": 0.734,
"step": 520
},
{
"epoch": 94.91,
"eval_loss": 0.8490074276924133,
"eval_runtime": 0.6087,
"eval_samples_per_second": 289.16,
"eval_steps_per_second": 9.858,
"step": 522
},
{
"epoch": 96.0,
"eval_loss": 0.8411813378334045,
"eval_runtime": 0.676,
"eval_samples_per_second": 260.366,
"eval_steps_per_second": 8.876,
"step": 528
},
{
"epoch": 96.36,
"learning_rate": 2.3785944447138802e-05,
"loss": 0.7292,
"step": 530
},
{
"epoch": 96.91,
"eval_loss": 0.8365342617034912,
"eval_runtime": 0.6903,
"eval_samples_per_second": 254.96,
"eval_steps_per_second": 8.692,
"step": 533
},
{
"epoch": 98.0,
"eval_loss": 0.8290190100669861,
"eval_runtime": 0.6161,
"eval_samples_per_second": 285.662,
"eval_steps_per_second": 9.738,
"step": 539
},
{
"epoch": 98.18,
"learning_rate": 2.29779889909649e-05,
"loss": 0.7213,
"step": 540
},
{
"epoch": 98.91,
"eval_loss": 0.8227143883705139,
"eval_runtime": 0.662,
"eval_samples_per_second": 265.867,
"eval_steps_per_second": 9.064,
"step": 544
},
{
"epoch": 100.0,
"learning_rate": 2.2172154345117894e-05,
"loss": 0.7063,
"step": 550
},
{
"epoch": 100.0,
"eval_loss": 0.8168367743492126,
"eval_runtime": 0.6934,
"eval_samples_per_second": 253.815,
"eval_steps_per_second": 8.653,
"step": 550
},
{
"epoch": 100.91,
"eval_loss": 0.8129215240478516,
"eval_runtime": 0.6132,
"eval_samples_per_second": 287.006,
"eval_steps_per_second": 9.784,
"step": 555
},
{
"epoch": 101.82,
"learning_rate": 2.136928571886275e-05,
"loss": 0.695,
"step": 560
},
{
"epoch": 102.0,
"eval_loss": 0.8098950982093811,
"eval_runtime": 0.6132,
"eval_samples_per_second": 287.032,
"eval_steps_per_second": 9.785,
"step": 561
},
{
"epoch": 102.91,
"eval_loss": 0.8075168132781982,
"eval_runtime": 0.6108,
"eval_samples_per_second": 288.158,
"eval_steps_per_second": 9.824,
"step": 566
},
{
"epoch": 103.64,
"learning_rate": 2.0570225210519434e-05,
"loss": 0.6827,
"step": 570
},
{
"epoch": 104.0,
"eval_loss": 0.8030515313148499,
"eval_runtime": 0.6149,
"eval_samples_per_second": 286.224,
"eval_steps_per_second": 9.758,
"step": 572
},
{
"epoch": 104.91,
"eval_loss": 0.7993787527084351,
"eval_runtime": 0.6164,
"eval_samples_per_second": 285.523,
"eval_steps_per_second": 9.734,
"step": 577
},
{
"epoch": 105.45,
"learning_rate": 1.9775810924218125e-05,
"loss": 0.6778,
"step": 580
},
{
"epoch": 106.0,
"eval_loss": 0.7955842614173889,
"eval_runtime": 0.6123,
"eval_samples_per_second": 287.451,
"eval_steps_per_second": 9.799,
"step": 583
},
{
"epoch": 106.91,
"eval_loss": 0.7906305193901062,
"eval_runtime": 0.6081,
"eval_samples_per_second": 289.432,
"eval_steps_per_second": 9.867,
"step": 588
},
{
"epoch": 107.27,
"learning_rate": 1.8986876090843667e-05,
"loss": 0.6753,
"step": 590
},
{
"epoch": 108.0,
"eval_loss": 0.7853391766548157,
"eval_runtime": 0.6095,
"eval_samples_per_second": 288.773,
"eval_steps_per_second": 9.845,
"step": 594
},
{
"epoch": 108.91,
"eval_loss": 0.7820215225219727,
"eval_runtime": 0.618,
"eval_samples_per_second": 284.769,
"eval_steps_per_second": 9.708,
"step": 599
},
{
"epoch": 109.09,
"learning_rate": 1.820424819409143e-05,
"loss": 0.6654,
"step": 600
},
{
"epoch": 110.0,
"eval_loss": 0.7762172222137451,
"eval_runtime": 0.6112,
"eval_samples_per_second": 287.94,
"eval_steps_per_second": 9.816,
"step": 605
},
{
"epoch": 110.91,
"learning_rate": 1.7428748102551237e-05,
"loss": 0.6621,
"step": 610
},
{
"epoch": 110.91,
"eval_loss": 0.7716686725616455,
"eval_runtime": 0.6158,
"eval_samples_per_second": 285.825,
"eval_steps_per_second": 9.744,
"step": 610
},
{
"epoch": 112.0,
"eval_loss": 0.7677438855171204,
"eval_runtime": 0.6074,
"eval_samples_per_second": 289.75,
"eval_steps_per_second": 9.878,
"step": 616
},
{
"epoch": 112.73,
"learning_rate": 1.666118920872949e-05,
"loss": 0.6579,
"step": 620
},
{
"epoch": 112.91,
"eval_loss": 0.7632172107696533,
"eval_runtime": 0.6086,
"eval_samples_per_second": 289.198,
"eval_steps_per_second": 9.859,
"step": 621
},
{
"epoch": 114.0,
"eval_loss": 0.760553777217865,
"eval_runtime": 0.6094,
"eval_samples_per_second": 288.804,
"eval_steps_per_second": 9.846,
"step": 627
},
{
"epoch": 114.55,
"learning_rate": 1.5902376575912815e-05,
"loss": 0.6412,
"step": 630
},
{
"epoch": 114.91,
"eval_loss": 0.7592495083808899,
"eval_runtime": 0.6136,
"eval_samples_per_second": 286.828,
"eval_steps_per_second": 9.778,
"step": 632
},
{
"epoch": 116.0,
"eval_loss": 0.7567878365516663,
"eval_runtime": 0.6111,
"eval_samples_per_second": 288.02,
"eval_steps_per_second": 9.819,
"step": 638
},
{
"epoch": 116.36,
"learning_rate": 1.5153106093767827e-05,
"loss": 0.6403,
"step": 640
},
{
"epoch": 116.91,
"eval_loss": 0.7550404071807861,
"eval_runtime": 0.6079,
"eval_samples_per_second": 289.517,
"eval_steps_per_second": 9.87,
"step": 643
},
{
"epoch": 118.0,
"eval_loss": 0.7509838342666626,
"eval_runtime": 0.6082,
"eval_samples_per_second": 289.365,
"eval_steps_per_second": 9.865,
"step": 649
},
{
"epoch": 118.18,
"learning_rate": 1.4414163643562755e-05,
"loss": 0.6358,
"step": 650
},
{
"epoch": 118.91,
"eval_loss": 0.7504873871803284,
"eval_runtime": 0.6087,
"eval_samples_per_second": 289.117,
"eval_steps_per_second": 9.856,
"step": 654
},
{
"epoch": 120.0,
"learning_rate": 1.368632427388653e-05,
"loss": 0.6328,
"step": 660
},
{
"epoch": 120.0,
"eval_loss": 0.747511625289917,
"eval_runtime": 0.6084,
"eval_samples_per_second": 289.265,
"eval_steps_per_second": 9.861,
"step": 660
},
{
"epoch": 120.91,
"eval_loss": 0.7451759576797485,
"eval_runtime": 0.6218,
"eval_samples_per_second": 283.031,
"eval_steps_per_second": 9.649,
"step": 665
},
{
"epoch": 121.82,
"learning_rate": 1.2970351387729873e-05,
"loss": 0.6233,
"step": 670
},
{
"epoch": 122.0,
"eval_loss": 0.7407442331314087,
"eval_runtime": 0.6125,
"eval_samples_per_second": 287.333,
"eval_steps_per_second": 9.795,
"step": 671
},
{
"epoch": 122.91,
"eval_loss": 0.7380485534667969,
"eval_runtime": 0.6166,
"eval_samples_per_second": 285.455,
"eval_steps_per_second": 9.731,
"step": 676
},
{
"epoch": 123.64,
"learning_rate": 1.2266995941780934e-05,
"loss": 0.6101,
"step": 680
},
{
"epoch": 124.0,
"eval_loss": 0.7349163293838501,
"eval_runtime": 0.6112,
"eval_samples_per_second": 287.969,
"eval_steps_per_second": 9.817,
"step": 682
},
{
"epoch": 124.91,
"eval_loss": 0.7321335077285767,
"eval_runtime": 0.6135,
"eval_samples_per_second": 286.879,
"eval_steps_per_second": 9.78,
"step": 687
},
{
"epoch": 125.45,
"learning_rate": 1.1576995658775405e-05,
"loss": 0.6161,
"step": 690
},
{
"epoch": 126.0,
"eval_loss": 0.729712724685669,
"eval_runtime": 0.6087,
"eval_samples_per_second": 289.133,
"eval_steps_per_second": 9.857,
"step": 693
},
{
"epoch": 126.91,
"eval_loss": 0.7279945611953735,
"eval_runtime": 0.6139,
"eval_samples_per_second": 286.689,
"eval_steps_per_second": 9.773,
"step": 698
},
{
"epoch": 127.27,
"learning_rate": 1.0901074253727336e-05,
"loss": 0.6099,
"step": 700
},
{
"epoch": 128.0,
"eval_loss": 0.7247954607009888,
"eval_runtime": 0.6118,
"eval_samples_per_second": 287.661,
"eval_steps_per_second": 9.807,
"step": 704
},
{
"epoch": 128.91,
"eval_loss": 0.7223231792449951,
"eval_runtime": 0.6126,
"eval_samples_per_second": 287.323,
"eval_steps_per_second": 9.795,
"step": 709
},
{
"epoch": 129.09,
"learning_rate": 1.0239940674851941e-05,
"loss": 0.6051,
"step": 710
},
{
"epoch": 130.0,
"eval_loss": 0.7180494666099548,
"eval_runtime": 0.6126,
"eval_samples_per_second": 287.293,
"eval_steps_per_second": 9.794,
"step": 715
},
{
"epoch": 130.91,
"learning_rate": 9.594288359976817e-06,
"loss": 0.6066,
"step": 720
},
{
"epoch": 130.91,
"eval_loss": 0.7157444357872009,
"eval_runtime": 0.6097,
"eval_samples_per_second": 288.682,
"eval_steps_per_second": 9.841,
"step": 720
},
{
"epoch": 132.0,
"eval_loss": 0.7136554718017578,
"eval_runtime": 0.6158,
"eval_samples_per_second": 285.829,
"eval_steps_per_second": 9.744,
"step": 726
},
{
"epoch": 132.73,
"learning_rate": 8.964794509221508e-06,
"loss": 0.5919,
"step": 730
},
{
"epoch": 132.91,
"eval_loss": 0.7124893665313721,
"eval_runtime": 0.6109,
"eval_samples_per_second": 288.085,
"eval_steps_per_second": 9.821,
"step": 731
},
{
"epoch": 134.0,
"eval_loss": 0.7105648517608643,
"eval_runtime": 0.6159,
"eval_samples_per_second": 285.767,
"eval_steps_per_second": 9.742,
"step": 737
},
{
"epoch": 134.55,
"learning_rate": 8.352119374707978e-06,
"loss": 0.5947,
"step": 740
},
{
"epoch": 134.91,
"eval_loss": 0.7095364928245544,
"eval_runtime": 0.6176,
"eval_samples_per_second": 284.964,
"eval_steps_per_second": 9.715,
"step": 742
},
{
"epoch": 136.0,
"eval_loss": 0.7085996866226196,
"eval_runtime": 0.609,
"eval_samples_per_second": 288.985,
"eval_steps_per_second": 9.852,
"step": 748
},
{
"epoch": 136.36,
"learning_rate": 7.756905568047393e-06,
"loss": 0.5847,
"step": 750
},
{
"epoch": 136.91,
"eval_loss": 0.7068153023719788,
"eval_runtime": 0.613,
"eval_samples_per_second": 287.113,
"eval_steps_per_second": 9.788,
"step": 753
},
{
"epoch": 138.0,
"eval_loss": 0.704888105392456,
"eval_runtime": 0.6127,
"eval_samples_per_second": 287.267,
"eval_steps_per_second": 9.793,
"step": 759
},
{
"epoch": 138.18,
"learning_rate": 7.179777386329276e-06,
"loss": 0.5801,
"step": 760
},
{
"epoch": 138.91,
"eval_loss": 0.7037582993507385,
"eval_runtime": 0.614,
"eval_samples_per_second": 286.667,
"eval_steps_per_second": 9.773,
"step": 764
},
{
"epoch": 140.0,
"learning_rate": 6.621340157319997e-06,
"loss": 0.5777,
"step": 770
},
{
"epoch": 140.0,
"eval_loss": 0.7029330730438232,
"eval_runtime": 0.6156,
"eval_samples_per_second": 285.908,
"eval_steps_per_second": 9.747,
"step": 770
},
{
"epoch": 140.91,
"eval_loss": 0.702335774898529,
"eval_runtime": 0.6801,
"eval_samples_per_second": 258.786,
"eval_steps_per_second": 8.822,
"step": 775
},
{
"epoch": 141.82,
"learning_rate": 6.082179604557617e-06,
"loss": 0.5775,
"step": 780
},
{
"epoch": 142.0,
"eval_loss": 0.701381266117096,
"eval_runtime": 0.6168,
"eval_samples_per_second": 285.34,
"eval_steps_per_second": 9.728,
"step": 781
},
{
"epoch": 142.91,
"eval_loss": 0.6989457011222839,
"eval_runtime": 0.6106,
"eval_samples_per_second": 288.22,
"eval_steps_per_second": 9.826,
"step": 786
},
{
"epoch": 143.64,
"learning_rate": 5.562861233008774e-06,
"loss": 0.5766,
"step": 790
},
{
"epoch": 144.0,
"eval_loss": 0.696780264377594,
"eval_runtime": 0.6081,
"eval_samples_per_second": 289.427,
"eval_steps_per_second": 9.867,
"step": 792
},
{
"epoch": 144.91,
"eval_loss": 0.6954607367515564,
"eval_runtime": 0.6121,
"eval_samples_per_second": 287.553,
"eval_steps_per_second": 9.803,
"step": 797
},
{
"epoch": 145.45,
"learning_rate": 5.063929735931985e-06,
"loss": 0.5799,
"step": 800
},
{
"epoch": 146.0,
"eval_loss": 0.694266140460968,
"eval_runtime": 0.6134,
"eval_samples_per_second": 286.919,
"eval_steps_per_second": 9.781,
"step": 803
},
{
"epoch": 146.91,
"eval_loss": 0.6934513449668884,
"eval_runtime": 0.6125,
"eval_samples_per_second": 287.341,
"eval_steps_per_second": 9.796,
"step": 808
},
{
"epoch": 147.27,
"learning_rate": 4.585908423569724e-06,
"loss": 0.5724,
"step": 810
},
{
"epoch": 148.0,
"eval_loss": 0.6923357248306274,
"eval_runtime": 0.6116,
"eval_samples_per_second": 287.776,
"eval_steps_per_second": 9.811,
"step": 814
},
{
"epoch": 148.91,
"eval_loss": 0.6916863918304443,
"eval_runtime": 0.6209,
"eval_samples_per_second": 283.459,
"eval_steps_per_second": 9.663,
"step": 819
},
{
"epoch": 149.09,
"learning_rate": 4.129298674268225e-06,
"loss": 0.5636,
"step": 820
},
{
"epoch": 150.0,
"eval_loss": 0.6903253197669983,
"eval_runtime": 0.6129,
"eval_samples_per_second": 287.181,
"eval_steps_per_second": 9.79,
"step": 825
},
{
"epoch": 150.91,
"learning_rate": 3.694579408600771e-06,
"loss": 0.5718,
"step": 830
},
{
"epoch": 150.91,
"eval_loss": 0.6895003318786621,
"eval_runtime": 0.612,
"eval_samples_per_second": 287.574,
"eval_steps_per_second": 9.804,
"step": 830
},
{
"epoch": 152.0,
"eval_loss": 0.6892096996307373,
"eval_runtime": 0.6102,
"eval_samples_per_second": 288.427,
"eval_steps_per_second": 9.833,
"step": 836
},
{
"epoch": 152.73,
"learning_rate": 3.2822065870462217e-06,
"loss": 0.5656,
"step": 840
},
{
"epoch": 152.91,
"eval_loss": 0.6885128617286682,
"eval_runtime": 0.6079,
"eval_samples_per_second": 289.516,
"eval_steps_per_second": 9.87,
"step": 841
},
{
"epoch": 154.0,
"eval_loss": 0.6882739067077637,
"eval_runtime": 0.6116,
"eval_samples_per_second": 287.773,
"eval_steps_per_second": 9.81,
"step": 847
},
{
"epoch": 154.55,
"learning_rate": 2.892612731749414e-06,
"loss": 0.5642,
"step": 850
},
{
"epoch": 154.91,
"eval_loss": 0.6874551773071289,
"eval_runtime": 0.6157,
"eval_samples_per_second": 285.833,
"eval_steps_per_second": 9.744,
"step": 852
},
{
"epoch": 156.0,
"eval_loss": 0.6862676739692688,
"eval_runtime": 0.6125,
"eval_samples_per_second": 287.37,
"eval_steps_per_second": 9.797,
"step": 858
},
{
"epoch": 156.36,
"learning_rate": 2.52620647286512e-06,
"loss": 0.5681,
"step": 860
},
{
"epoch": 156.91,
"eval_loss": 0.6856226325035095,
"eval_runtime": 0.6086,
"eval_samples_per_second": 289.209,
"eval_steps_per_second": 9.859,
"step": 863
},
{
"epoch": 158.0,
"eval_loss": 0.6847879886627197,
"eval_runtime": 0.6112,
"eval_samples_per_second": 287.948,
"eval_steps_per_second": 9.816,
"step": 869
},
{
"epoch": 158.18,
"learning_rate": 2.183372119961499e-06,
"loss": 0.5618,
"step": 870
},
{
"epoch": 158.91,
"eval_loss": 0.6842586398124695,
"eval_runtime": 0.6143,
"eval_samples_per_second": 286.51,
"eval_steps_per_second": 9.767,
"step": 874
},
{
"epoch": 160.0,
"learning_rate": 1.864469258932397e-06,
"loss": 0.5485,
"step": 880
},
{
"epoch": 160.0,
"eval_loss": 0.6837961077690125,
"eval_runtime": 0.6085,
"eval_samples_per_second": 289.259,
"eval_steps_per_second": 9.861,
"step": 880
},
{
"epoch": 160.91,
"eval_loss": 0.6834523677825928,
"eval_runtime": 0.609,
"eval_samples_per_second": 288.988,
"eval_steps_per_second": 9.852,
"step": 885
},
{
"epoch": 161.82,
"learning_rate": 1.5698323748414124e-06,
"loss": 0.5611,
"step": 890
},
{
"epoch": 162.0,
"eval_loss": 0.6828814744949341,
"eval_runtime": 0.6118,
"eval_samples_per_second": 287.681,
"eval_steps_per_second": 9.807,
"step": 891
},
{
"epoch": 162.91,
"eval_loss": 0.6825764775276184,
"eval_runtime": 0.6147,
"eval_samples_per_second": 286.328,
"eval_steps_per_second": 9.761,
"step": 896
},
{
"epoch": 163.64,
"learning_rate": 1.2997705010932393e-06,
"loss": 0.5552,
"step": 900
},
{
"epoch": 164.0,
"eval_loss": 0.6823921203613281,
"eval_runtime": 0.6119,
"eval_samples_per_second": 287.631,
"eval_steps_per_second": 9.806,
"step": 902
},
{
"epoch": 164.91,
"eval_loss": 0.6820687651634216,
"eval_runtime": 0.6085,
"eval_samples_per_second": 289.234,
"eval_steps_per_second": 9.86,
"step": 907
},
{
"epoch": 165.45,
"learning_rate": 1.0545668953003241e-06,
"loss": 0.5675,
"step": 910
},
{
"epoch": 166.0,
"eval_loss": 0.6815680861473083,
"eval_runtime": 0.6088,
"eval_samples_per_second": 289.108,
"eval_steps_per_second": 9.856,
"step": 913
},
{
"epoch": 166.91,
"eval_loss": 0.6814342141151428,
"eval_runtime": 0.6085,
"eval_samples_per_second": 289.246,
"eval_steps_per_second": 9.861,
"step": 918
},
{
"epoch": 167.27,
"learning_rate": 8.344787421847217e-07,
"loss": 0.5586,
"step": 920
},
{
"epoch": 168.0,
"eval_loss": 0.6812211871147156,
"eval_runtime": 0.6108,
"eval_samples_per_second": 288.154,
"eval_steps_per_second": 9.823,
"step": 924
},
{
"epoch": 168.91,
"eval_loss": 0.6808667778968811,
"eval_runtime": 0.6126,
"eval_samples_per_second": 287.282,
"eval_steps_per_second": 9.794,
"step": 929
},
{
"epoch": 169.09,
"learning_rate": 6.397368838268497e-07,
"loss": 0.5577,
"step": 930
},
{
"epoch": 170.0,
"eval_loss": 0.6807306408882141,
"eval_runtime": 0.6126,
"eval_samples_per_second": 287.3,
"eval_steps_per_second": 9.794,
"step": 935
},
{
"epoch": 170.91,
"learning_rate": 4.7054557754402373e-07,
"loss": 0.5487,
"step": 940
},
{
"epoch": 170.91,
"eval_loss": 0.6806881427764893,
"eval_runtime": 0.6111,
"eval_samples_per_second": 288.019,
"eval_steps_per_second": 9.819,
"step": 940
},
{
"epoch": 172.0,
"eval_loss": 0.6806466579437256,
"eval_runtime": 0.6156,
"eval_samples_per_second": 285.889,
"eval_steps_per_second": 9.746,
"step": 946
},
{
"epoch": 172.73,
"learning_rate": 3.270822816527325e-07,
"loss": 0.5532,
"step": 950
},
{
"epoch": 172.91,
"eval_loss": 0.6805295348167419,
"eval_runtime": 0.6098,
"eval_samples_per_second": 288.61,
"eval_steps_per_second": 9.839,
"step": 951
},
{
"epoch": 174.0,
"eval_loss": 0.6804378628730774,
"eval_runtime": 0.6144,
"eval_samples_per_second": 286.48,
"eval_steps_per_second": 9.766,
"step": 957
},
{
"epoch": 174.55,
"learning_rate": 2.094974693393731e-07,
"loss": 0.5601,
"step": 960
},
{
"epoch": 174.91,
"eval_loss": 0.6803626418113708,
"eval_runtime": 0.6093,
"eval_samples_per_second": 288.837,
"eval_steps_per_second": 9.847,
"step": 962
},
{
"epoch": 176.0,
"eval_loss": 0.6802287101745605,
"eval_runtime": 0.6105,
"eval_samples_per_second": 288.293,
"eval_steps_per_second": 9.828,
"step": 968
},
{
"epoch": 176.36,
"learning_rate": 1.1791447083465134e-07,
"loss": 0.5558,
"step": 970
},
{
"epoch": 176.91,
"eval_loss": 0.6801658272743225,
"eval_runtime": 0.6151,
"eval_samples_per_second": 286.123,
"eval_steps_per_second": 9.754,
"step": 973
},
{
"epoch": 178.0,
"eval_loss": 0.6801168322563171,
"eval_runtime": 0.6101,
"eval_samples_per_second": 288.471,
"eval_steps_per_second": 9.834,
"step": 979
},
{
"epoch": 178.18,
"learning_rate": 5.242934405720879e-08,
"loss": 0.5601,
"step": 980
},
{
"epoch": 178.91,
"eval_loss": 0.6801341772079468,
"eval_runtime": 0.6154,
"eval_samples_per_second": 285.986,
"eval_steps_per_second": 9.75,
"step": 984
},
{
"epoch": 180.0,
"learning_rate": 1.3110773862126669e-08,
"loss": 0.5612,
"step": 990
},
{
"epoch": 180.0,
"eval_loss": 0.680082380771637,
"eval_runtime": 0.6089,
"eval_samples_per_second": 289.031,
"eval_steps_per_second": 9.853,
"step": 990
},
{
"epoch": 180.91,
"eval_loss": 0.6800580620765686,
"eval_runtime": 0.6814,
"eval_samples_per_second": 258.31,
"eval_steps_per_second": 8.806,
"step": 995
},
{
"epoch": 181.82,
"learning_rate": 0.0,
"loss": 0.5508,
"step": 1000
},
{
"epoch": 181.82,
"eval_loss": 0.6800865530967712,
"eval_runtime": 0.6303,
"eval_samples_per_second": 279.23,
"eval_steps_per_second": 9.519,
"step": 1000
},
{
"epoch": 181.82,
"step": 1000,
"total_flos": 1.2949054332736635e+18,
"train_loss": 0.8898317427635193,
"train_runtime": 1547.2406,
"train_samples_per_second": 90.871,
"train_steps_per_second": 0.646
}
],
"max_steps": 1000,
"num_train_epochs": 200,
"total_flos": 1.2949054332736635e+18,
"trial_name": null,
"trial_params": null
}