generated from xuyuqing/ailab
2082 lines
48 KiB
JSON
2082 lines
48 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 181.8181818181818,
|
|
"global_step": 1000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.91,
|
|
"eval_loss": 1.7933429479599,
|
|
"eval_runtime": 0.607,
|
|
"eval_samples_per_second": 289.962,
|
|
"eval_steps_per_second": 9.885,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 1.82,
|
|
"learning_rate": 1.6666666666666667e-05,
|
|
"loss": 1.8069,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"eval_loss": 1.7829440832138062,
|
|
"eval_runtime": 0.6089,
|
|
"eval_samples_per_second": 289.045,
|
|
"eval_steps_per_second": 9.854,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 2.91,
|
|
"eval_loss": 1.7655202150344849,
|
|
"eval_runtime": 0.6114,
|
|
"eval_samples_per_second": 287.852,
|
|
"eval_steps_per_second": 9.813,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 3.64,
|
|
"learning_rate": 3.3333333333333335e-05,
|
|
"loss": 1.7929,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"eval_loss": 1.7314362525939941,
|
|
"eval_runtime": 0.6066,
|
|
"eval_samples_per_second": 290.12,
|
|
"eval_steps_per_second": 9.89,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 4.91,
|
|
"eval_loss": 1.687442421913147,
|
|
"eval_runtime": 0.6137,
|
|
"eval_samples_per_second": 286.793,
|
|
"eval_steps_per_second": 9.777,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 5.45,
|
|
"learning_rate": 5e-05,
|
|
"loss": 1.7324,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"eval_loss": 1.6319729089736938,
|
|
"eval_runtime": 0.6106,
|
|
"eval_samples_per_second": 288.249,
|
|
"eval_steps_per_second": 9.827,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 6.91,
|
|
"eval_loss": 1.6018821001052856,
|
|
"eval_runtime": 0.6146,
|
|
"eval_samples_per_second": 286.357,
|
|
"eval_steps_per_second": 9.762,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 7.27,
|
|
"learning_rate": 4.998688922613788e-05,
|
|
"loss": 1.6419,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"eval_loss": 1.5704588890075684,
|
|
"eval_runtime": 0.6099,
|
|
"eval_samples_per_second": 288.582,
|
|
"eval_steps_per_second": 9.838,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 8.91,
|
|
"eval_loss": 1.5536445379257202,
|
|
"eval_runtime": 0.6172,
|
|
"eval_samples_per_second": 285.154,
|
|
"eval_steps_per_second": 9.721,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 9.09,
|
|
"learning_rate": 4.9947570655942796e-05,
|
|
"loss": 1.5731,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"eval_loss": 1.5355907678604126,
|
|
"eval_runtime": 0.6124,
|
|
"eval_samples_per_second": 287.408,
|
|
"eval_steps_per_second": 9.798,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 10.91,
|
|
"learning_rate": 4.988208552916535e-05,
|
|
"loss": 1.5498,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 10.91,
|
|
"eval_loss": 1.5225105285644531,
|
|
"eval_runtime": 0.6148,
|
|
"eval_samples_per_second": 286.264,
|
|
"eval_steps_per_second": 9.759,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 12.0,
|
|
"eval_loss": 1.509987235069275,
|
|
"eval_runtime": 0.6161,
|
|
"eval_samples_per_second": 285.65,
|
|
"eval_steps_per_second": 9.738,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 12.73,
|
|
"learning_rate": 4.9790502530660635e-05,
|
|
"loss": 1.5188,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 12.91,
|
|
"eval_loss": 1.5007643699645996,
|
|
"eval_runtime": 0.6136,
|
|
"eval_samples_per_second": 286.848,
|
|
"eval_steps_per_second": 9.779,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 14.0,
|
|
"eval_loss": 1.4901142120361328,
|
|
"eval_runtime": 0.6114,
|
|
"eval_samples_per_second": 287.851,
|
|
"eval_steps_per_second": 9.813,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 14.55,
|
|
"learning_rate": 4.967291771834727e-05,
|
|
"loss": 1.5005,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 14.91,
|
|
"eval_loss": 1.481990098953247,
|
|
"eval_runtime": 0.6166,
|
|
"eval_samples_per_second": 285.454,
|
|
"eval_steps_per_second": 9.731,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 16.0,
|
|
"eval_loss": 1.4730418920516968,
|
|
"eval_runtime": 0.6164,
|
|
"eval_samples_per_second": 285.532,
|
|
"eval_steps_per_second": 9.734,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 16.36,
|
|
"learning_rate": 4.9529454422455976e-05,
|
|
"loss": 1.4814,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 16.91,
|
|
"eval_loss": 1.4652411937713623,
|
|
"eval_runtime": 0.6742,
|
|
"eval_samples_per_second": 261.038,
|
|
"eval_steps_per_second": 8.899,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 18.0,
|
|
"eval_loss": 1.4562301635742188,
|
|
"eval_runtime": 0.6889,
|
|
"eval_samples_per_second": 255.479,
|
|
"eval_steps_per_second": 8.71,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 18.18,
|
|
"learning_rate": 4.936026311617316e-05,
|
|
"loss": 1.4618,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 18.91,
|
|
"eval_loss": 1.4488506317138672,
|
|
"eval_runtime": 0.6201,
|
|
"eval_samples_per_second": 283.827,
|
|
"eval_steps_per_second": 9.676,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"learning_rate": 4.916552125781528e-05,
|
|
"loss": 1.4468,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"eval_loss": 1.4396686553955078,
|
|
"eval_runtime": 0.6087,
|
|
"eval_samples_per_second": 289.16,
|
|
"eval_steps_per_second": 9.858,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 20.91,
|
|
"eval_loss": 1.4325110912322998,
|
|
"eval_runtime": 0.6159,
|
|
"eval_samples_per_second": 285.768,
|
|
"eval_steps_per_second": 9.742,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 21.82,
|
|
"learning_rate": 4.894543310469968e-05,
|
|
"loss": 1.4198,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 22.0,
|
|
"eval_loss": 1.4230388402938843,
|
|
"eval_runtime": 0.6122,
|
|
"eval_samples_per_second": 287.508,
|
|
"eval_steps_per_second": 9.801,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 22.91,
|
|
"eval_loss": 1.4148945808410645,
|
|
"eval_runtime": 0.6173,
|
|
"eval_samples_per_second": 285.131,
|
|
"eval_steps_per_second": 9.72,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 23.64,
|
|
"learning_rate": 4.870022949890676e-05,
|
|
"loss": 1.4001,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 24.0,
|
|
"eval_loss": 1.4044132232666016,
|
|
"eval_runtime": 0.6099,
|
|
"eval_samples_per_second": 288.552,
|
|
"eval_steps_per_second": 9.837,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 24.91,
|
|
"eval_loss": 1.3964232206344604,
|
|
"eval_runtime": 0.6151,
|
|
"eval_samples_per_second": 286.143,
|
|
"eval_steps_per_second": 9.755,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 25.45,
|
|
"learning_rate": 4.8430167625158595e-05,
|
|
"loss": 1.3809,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 26.0,
|
|
"eval_loss": 1.3872698545455933,
|
|
"eval_runtime": 0.6134,
|
|
"eval_samples_per_second": 286.947,
|
|
"eval_steps_per_second": 9.782,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 26.91,
|
|
"eval_loss": 1.3796738386154175,
|
|
"eval_runtime": 0.6136,
|
|
"eval_samples_per_second": 286.826,
|
|
"eval_steps_per_second": 9.778,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 27.27,
|
|
"learning_rate": 4.813553074106761e-05,
|
|
"loss": 1.3763,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 28.0,
|
|
"eval_loss": 1.3704036474227905,
|
|
"eval_runtime": 0.6104,
|
|
"eval_samples_per_second": 288.339,
|
|
"eval_steps_per_second": 9.83,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 28.91,
|
|
"eval_loss": 1.3635640144348145,
|
|
"eval_runtime": 0.6147,
|
|
"eval_samples_per_second": 286.317,
|
|
"eval_steps_per_second": 9.761,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 29.09,
|
|
"learning_rate": 4.781662788003851e-05,
|
|
"loss": 1.3428,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 30.0,
|
|
"eval_loss": 1.352755069732666,
|
|
"eval_runtime": 0.6087,
|
|
"eval_samples_per_second": 289.163,
|
|
"eval_steps_per_second": 9.858,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 30.91,
|
|
"learning_rate": 4.747379352713489e-05,
|
|
"loss": 1.3324,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 30.91,
|
|
"eval_loss": 1.345137119293213,
|
|
"eval_runtime": 0.6132,
|
|
"eval_samples_per_second": 287.009,
|
|
"eval_steps_per_second": 9.784,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 32.0,
|
|
"eval_loss": 1.3328704833984375,
|
|
"eval_runtime": 0.6088,
|
|
"eval_samples_per_second": 289.094,
|
|
"eval_steps_per_second": 9.855,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 32.73,
|
|
"learning_rate": 4.710738726825059e-05,
|
|
"loss": 1.3054,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 32.91,
|
|
"eval_loss": 1.325918197631836,
|
|
"eval_runtime": 0.6149,
|
|
"eval_samples_per_second": 286.23,
|
|
"eval_steps_per_second": 9.758,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 34.0,
|
|
"eval_loss": 1.3172450065612793,
|
|
"eval_runtime": 0.6114,
|
|
"eval_samples_per_second": 287.875,
|
|
"eval_steps_per_second": 9.814,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 34.55,
|
|
"learning_rate": 4.671779341295378e-05,
|
|
"loss": 1.2812,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 34.91,
|
|
"eval_loss": 1.311617374420166,
|
|
"eval_runtime": 0.6199,
|
|
"eval_samples_per_second": 283.926,
|
|
"eval_steps_per_second": 9.679,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 36.0,
|
|
"eval_loss": 1.3009544610977173,
|
|
"eval_runtime": 0.6192,
|
|
"eval_samples_per_second": 284.26,
|
|
"eval_steps_per_second": 9.691,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 36.36,
|
|
"learning_rate": 4.630542059139924e-05,
|
|
"loss": 1.2588,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 36.91,
|
|
"eval_loss": 1.2909127473831177,
|
|
"eval_runtime": 0.6233,
|
|
"eval_samples_per_second": 282.357,
|
|
"eval_steps_per_second": 9.626,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 38.0,
|
|
"eval_loss": 1.2811715602874756,
|
|
"eval_runtime": 0.6096,
|
|
"eval_samples_per_second": 288.711,
|
|
"eval_steps_per_second": 9.842,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 38.18,
|
|
"learning_rate": 4.587070132573178e-05,
|
|
"loss": 1.2348,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 38.91,
|
|
"eval_loss": 1.2706265449523926,
|
|
"eval_runtime": 0.6133,
|
|
"eval_samples_per_second": 286.993,
|
|
"eval_steps_per_second": 9.784,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 40.0,
|
|
"learning_rate": 4.541409157643027e-05,
|
|
"loss": 1.2162,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 40.0,
|
|
"eval_loss": 1.2609130144119263,
|
|
"eval_runtime": 0.6137,
|
|
"eval_samples_per_second": 286.801,
|
|
"eval_steps_per_second": 9.777,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 40.91,
|
|
"eval_loss": 1.2516368627548218,
|
|
"eval_runtime": 0.6128,
|
|
"eval_samples_per_second": 287.191,
|
|
"eval_steps_per_second": 9.791,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 41.82,
|
|
"learning_rate": 4.493607026406802e-05,
|
|
"loss": 1.1919,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 42.0,
|
|
"eval_loss": 1.2416428327560425,
|
|
"eval_runtime": 0.6114,
|
|
"eval_samples_per_second": 287.857,
|
|
"eval_steps_per_second": 9.813,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 42.91,
|
|
"eval_loss": 1.236132264137268,
|
|
"eval_runtime": 0.6144,
|
|
"eval_samples_per_second": 286.465,
|
|
"eval_steps_per_second": 9.766,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 43.64,
|
|
"learning_rate": 4.443713876699124e-05,
|
|
"loss": 1.1616,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 44.0,
|
|
"eval_loss": 1.2243030071258545,
|
|
"eval_runtime": 0.6144,
|
|
"eval_samples_per_second": 286.449,
|
|
"eval_steps_per_second": 9.765,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 44.91,
|
|
"eval_loss": 1.212420105934143,
|
|
"eval_runtime": 0.6198,
|
|
"eval_samples_per_second": 283.949,
|
|
"eval_steps_per_second": 9.68,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 45.45,
|
|
"learning_rate": 4.391782039544238e-05,
|
|
"loss": 1.1442,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 46.0,
|
|
"eval_loss": 1.2018734216690063,
|
|
"eval_runtime": 0.6136,
|
|
"eval_samples_per_second": 286.821,
|
|
"eval_steps_per_second": 9.778,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 46.91,
|
|
"eval_loss": 1.1946231126785278,
|
|
"eval_runtime": 0.6128,
|
|
"eval_samples_per_second": 287.226,
|
|
"eval_steps_per_second": 9.792,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 47.27,
|
|
"learning_rate": 4.337865984268001e-05,
|
|
"loss": 1.1198,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 48.0,
|
|
"eval_loss": 1.1835122108459473,
|
|
"eval_runtime": 0.6113,
|
|
"eval_samples_per_second": 287.929,
|
|
"eval_steps_per_second": 9.816,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 48.91,
|
|
"eval_loss": 1.1760563850402832,
|
|
"eval_runtime": 0.6123,
|
|
"eval_samples_per_second": 287.42,
|
|
"eval_steps_per_second": 9.798,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 49.09,
|
|
"learning_rate": 4.2820222613670736e-05,
|
|
"loss": 1.1057,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 50.0,
|
|
"eval_loss": 1.1683567762374878,
|
|
"eval_runtime": 0.6127,
|
|
"eval_samples_per_second": 287.272,
|
|
"eval_steps_per_second": 9.793,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 50.91,
|
|
"learning_rate": 4.224309443195261e-05,
|
|
"loss": 1.085,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 50.91,
|
|
"eval_loss": 1.1578781604766846,
|
|
"eval_runtime": 0.6096,
|
|
"eval_samples_per_second": 288.731,
|
|
"eval_steps_per_second": 9.843,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 52.0,
|
|
"eval_loss": 1.1446937322616577,
|
|
"eval_runtime": 0.6159,
|
|
"eval_samples_per_second": 285.773,
|
|
"eval_steps_per_second": 9.742,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 52.73,
|
|
"learning_rate": 4.164788062529203e-05,
|
|
"loss": 1.0643,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 52.91,
|
|
"eval_loss": 1.1339915990829468,
|
|
"eval_runtime": 0.6101,
|
|
"eval_samples_per_second": 288.475,
|
|
"eval_steps_per_second": 9.834,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 54.0,
|
|
"eval_loss": 1.121987223625183,
|
|
"eval_runtime": 0.6079,
|
|
"eval_samples_per_second": 289.54,
|
|
"eval_steps_per_second": 9.871,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 54.55,
|
|
"learning_rate": 4.10352054907785e-05,
|
|
"loss": 1.0547,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 54.91,
|
|
"eval_loss": 1.111721158027649,
|
|
"eval_runtime": 0.6168,
|
|
"eval_samples_per_second": 285.363,
|
|
"eval_steps_per_second": 9.728,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 56.0,
|
|
"eval_loss": 1.1022990942001343,
|
|
"eval_runtime": 0.6113,
|
|
"eval_samples_per_second": 287.898,
|
|
"eval_steps_per_second": 9.815,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 56.36,
|
|
"learning_rate": 4.0405711640023186e-05,
|
|
"loss": 1.0196,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 56.91,
|
|
"eval_loss": 1.0926016569137573,
|
|
"eval_runtime": 0.6138,
|
|
"eval_samples_per_second": 286.748,
|
|
"eval_steps_per_second": 9.776,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 58.0,
|
|
"eval_loss": 1.0840771198272705,
|
|
"eval_runtime": 0.6136,
|
|
"eval_samples_per_second": 286.836,
|
|
"eval_steps_per_second": 9.779,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 58.18,
|
|
"learning_rate": 3.976005932514807e-05,
|
|
"loss": 1.019,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 58.91,
|
|
"eval_loss": 1.078185796737671,
|
|
"eval_runtime": 0.6827,
|
|
"eval_samples_per_second": 257.794,
|
|
"eval_steps_per_second": 8.788,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 60.0,
|
|
"learning_rate": 3.909892574627266e-05,
|
|
"loss": 0.9804,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 60.0,
|
|
"eval_loss": 1.0690256357192993,
|
|
"eval_runtime": 0.6175,
|
|
"eval_samples_per_second": 285.011,
|
|
"eval_steps_per_second": 9.716,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 60.91,
|
|
"eval_loss": 1.0633498430252075,
|
|
"eval_runtime": 0.6096,
|
|
"eval_samples_per_second": 288.703,
|
|
"eval_steps_per_second": 9.842,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 61.82,
|
|
"learning_rate": 3.84230043412246e-05,
|
|
"loss": 0.9693,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 62.0,
|
|
"eval_loss": 1.0553932189941406,
|
|
"eval_runtime": 0.6095,
|
|
"eval_samples_per_second": 288.771,
|
|
"eval_steps_per_second": 9.844,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 62.91,
|
|
"eval_loss": 1.0476597547531128,
|
|
"eval_runtime": 0.6127,
|
|
"eval_samples_per_second": 287.234,
|
|
"eval_steps_per_second": 9.792,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 63.64,
|
|
"learning_rate": 3.773300405821908e-05,
|
|
"loss": 0.959,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 64.0,
|
|
"eval_loss": 1.0392884016036987,
|
|
"eval_runtime": 0.6091,
|
|
"eval_samples_per_second": 288.946,
|
|
"eval_steps_per_second": 9.85,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 64.91,
|
|
"eval_loss": 1.0319526195526123,
|
|
"eval_runtime": 0.6076,
|
|
"eval_samples_per_second": 289.658,
|
|
"eval_steps_per_second": 9.875,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 65.45,
|
|
"learning_rate": 3.702964861227013e-05,
|
|
"loss": 0.934,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 66.0,
|
|
"eval_loss": 1.0237526893615723,
|
|
"eval_runtime": 0.622,
|
|
"eval_samples_per_second": 282.945,
|
|
"eval_steps_per_second": 9.646,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 66.91,
|
|
"eval_loss": 1.0172982215881348,
|
|
"eval_runtime": 0.6161,
|
|
"eval_samples_per_second": 285.649,
|
|
"eval_steps_per_second": 9.738,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 67.27,
|
|
"learning_rate": 3.631367572611348e-05,
|
|
"loss": 0.9182,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 68.0,
|
|
"eval_loss": 1.0093735456466675,
|
|
"eval_runtime": 0.6113,
|
|
"eval_samples_per_second": 287.888,
|
|
"eval_steps_per_second": 9.814,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 68.91,
|
|
"eval_loss": 1.0004040002822876,
|
|
"eval_runtime": 0.618,
|
|
"eval_samples_per_second": 284.808,
|
|
"eval_steps_per_second": 9.709,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 69.09,
|
|
"learning_rate": 3.5585836356437264e-05,
|
|
"loss": 0.9087,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 70.0,
|
|
"eval_loss": 0.9954872727394104,
|
|
"eval_runtime": 0.6104,
|
|
"eval_samples_per_second": 288.329,
|
|
"eval_steps_per_second": 9.829,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 70.91,
|
|
"learning_rate": 3.484689390623218e-05,
|
|
"loss": 0.8923,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 70.91,
|
|
"eval_loss": 0.9893795251846313,
|
|
"eval_runtime": 0.6099,
|
|
"eval_samples_per_second": 288.583,
|
|
"eval_steps_per_second": 9.838,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 72.0,
|
|
"eval_loss": 0.9800569415092468,
|
|
"eval_runtime": 0.6099,
|
|
"eval_samples_per_second": 288.549,
|
|
"eval_steps_per_second": 9.837,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 72.73,
|
|
"learning_rate": 3.409762342408719e-05,
|
|
"loss": 0.8811,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 72.91,
|
|
"eval_loss": 0.9731982946395874,
|
|
"eval_runtime": 0.6102,
|
|
"eval_samples_per_second": 288.43,
|
|
"eval_steps_per_second": 9.833,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 74.0,
|
|
"eval_loss": 0.9653474688529968,
|
|
"eval_runtime": 0.6102,
|
|
"eval_samples_per_second": 288.428,
|
|
"eval_steps_per_second": 9.833,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 74.55,
|
|
"learning_rate": 3.333881079127052e-05,
|
|
"loss": 0.8659,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 74.91,
|
|
"eval_loss": 0.9584926962852478,
|
|
"eval_runtime": 0.6084,
|
|
"eval_samples_per_second": 289.293,
|
|
"eval_steps_per_second": 9.862,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 76.0,
|
|
"eval_loss": 0.9506881237030029,
|
|
"eval_runtime": 0.6093,
|
|
"eval_samples_per_second": 288.844,
|
|
"eval_steps_per_second": 9.847,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 76.36,
|
|
"learning_rate": 3.2571251897448765e-05,
|
|
"loss": 0.8488,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 76.91,
|
|
"eval_loss": 0.9450792074203491,
|
|
"eval_runtime": 0.6111,
|
|
"eval_samples_per_second": 288.027,
|
|
"eval_steps_per_second": 9.819,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 78.0,
|
|
"eval_loss": 0.9382766485214233,
|
|
"eval_runtime": 0.6157,
|
|
"eval_samples_per_second": 285.841,
|
|
"eval_steps_per_second": 9.745,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 78.18,
|
|
"learning_rate": 3.1795751805908573e-05,
|
|
"loss": 0.8388,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 78.91,
|
|
"eval_loss": 0.9338158965110779,
|
|
"eval_runtime": 0.6111,
|
|
"eval_samples_per_second": 288.005,
|
|
"eval_steps_per_second": 9.818,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 80.0,
|
|
"learning_rate": 3.101312390915634e-05,
|
|
"loss": 0.8217,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 80.0,
|
|
"eval_loss": 0.9296420812606812,
|
|
"eval_runtime": 0.6105,
|
|
"eval_samples_per_second": 288.3,
|
|
"eval_steps_per_second": 9.828,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 80.91,
|
|
"eval_loss": 0.9249951839447021,
|
|
"eval_runtime": 0.6141,
|
|
"eval_samples_per_second": 286.62,
|
|
"eval_steps_per_second": 9.771,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 81.82,
|
|
"learning_rate": 3.0224189075781884e-05,
|
|
"loss": 0.8126,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 82.0,
|
|
"eval_loss": 0.9183826446533203,
|
|
"eval_runtime": 0.613,
|
|
"eval_samples_per_second": 287.101,
|
|
"eval_steps_per_second": 9.788,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 82.91,
|
|
"eval_loss": 0.9152975678443909,
|
|
"eval_runtime": 0.616,
|
|
"eval_samples_per_second": 285.699,
|
|
"eval_steps_per_second": 9.74,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 83.64,
|
|
"learning_rate": 2.9429774789480575e-05,
|
|
"loss": 0.7976,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 84.0,
|
|
"eval_loss": 0.9079199433326721,
|
|
"eval_runtime": 0.6122,
|
|
"eval_samples_per_second": 287.479,
|
|
"eval_steps_per_second": 9.8,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 84.91,
|
|
"eval_loss": 0.9032623767852783,
|
|
"eval_runtime": 0.6116,
|
|
"eval_samples_per_second": 287.793,
|
|
"eval_steps_per_second": 9.811,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 85.45,
|
|
"learning_rate": 2.863071428113726e-05,
|
|
"loss": 0.78,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 86.0,
|
|
"eval_loss": 0.8960713744163513,
|
|
"eval_runtime": 0.6103,
|
|
"eval_samples_per_second": 288.379,
|
|
"eval_steps_per_second": 9.831,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 86.91,
|
|
"eval_loss": 0.8901066184043884,
|
|
"eval_runtime": 0.6174,
|
|
"eval_samples_per_second": 285.065,
|
|
"eval_steps_per_second": 9.718,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 87.27,
|
|
"learning_rate": 2.782784565488211e-05,
|
|
"loss": 0.7803,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 88.0,
|
|
"eval_loss": 0.8826420307159424,
|
|
"eval_runtime": 0.6114,
|
|
"eval_samples_per_second": 287.842,
|
|
"eval_steps_per_second": 9.813,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 88.91,
|
|
"eval_loss": 0.8768661022186279,
|
|
"eval_runtime": 0.6139,
|
|
"eval_samples_per_second": 286.679,
|
|
"eval_steps_per_second": 9.773,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 89.09,
|
|
"learning_rate": 2.7022011009035107e-05,
|
|
"loss": 0.7688,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 90.0,
|
|
"eval_loss": 0.8718900680541992,
|
|
"eval_runtime": 0.6108,
|
|
"eval_samples_per_second": 288.158,
|
|
"eval_steps_per_second": 9.824,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 90.91,
|
|
"learning_rate": 2.621405555286121e-05,
|
|
"loss": 0.7613,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 90.91,
|
|
"eval_loss": 0.8661989569664001,
|
|
"eval_runtime": 0.605,
|
|
"eval_samples_per_second": 290.887,
|
|
"eval_steps_per_second": 9.917,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 92.0,
|
|
"eval_loss": 0.860860288143158,
|
|
"eval_runtime": 0.6106,
|
|
"eval_samples_per_second": 288.244,
|
|
"eval_steps_per_second": 9.826,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 92.73,
|
|
"learning_rate": 2.540482672006254e-05,
|
|
"loss": 0.7462,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 92.91,
|
|
"eval_loss": 0.8560842275619507,
|
|
"eval_runtime": 0.6154,
|
|
"eval_samples_per_second": 286.006,
|
|
"eval_steps_per_second": 9.75,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 94.0,
|
|
"eval_loss": 0.8527078032493591,
|
|
"eval_runtime": 0.6118,
|
|
"eval_samples_per_second": 287.658,
|
|
"eval_steps_per_second": 9.807,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 94.55,
|
|
"learning_rate": 2.4595173279937464e-05,
|
|
"loss": 0.734,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 94.91,
|
|
"eval_loss": 0.8490074276924133,
|
|
"eval_runtime": 0.6087,
|
|
"eval_samples_per_second": 289.16,
|
|
"eval_steps_per_second": 9.858,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 96.0,
|
|
"eval_loss": 0.8411813378334045,
|
|
"eval_runtime": 0.676,
|
|
"eval_samples_per_second": 260.366,
|
|
"eval_steps_per_second": 8.876,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 96.36,
|
|
"learning_rate": 2.3785944447138802e-05,
|
|
"loss": 0.7292,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 96.91,
|
|
"eval_loss": 0.8365342617034912,
|
|
"eval_runtime": 0.6903,
|
|
"eval_samples_per_second": 254.96,
|
|
"eval_steps_per_second": 8.692,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 98.0,
|
|
"eval_loss": 0.8290190100669861,
|
|
"eval_runtime": 0.6161,
|
|
"eval_samples_per_second": 285.662,
|
|
"eval_steps_per_second": 9.738,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 98.18,
|
|
"learning_rate": 2.29779889909649e-05,
|
|
"loss": 0.7213,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 98.91,
|
|
"eval_loss": 0.8227143883705139,
|
|
"eval_runtime": 0.662,
|
|
"eval_samples_per_second": 265.867,
|
|
"eval_steps_per_second": 9.064,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 100.0,
|
|
"learning_rate": 2.2172154345117894e-05,
|
|
"loss": 0.7063,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 100.0,
|
|
"eval_loss": 0.8168367743492126,
|
|
"eval_runtime": 0.6934,
|
|
"eval_samples_per_second": 253.815,
|
|
"eval_steps_per_second": 8.653,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 100.91,
|
|
"eval_loss": 0.8129215240478516,
|
|
"eval_runtime": 0.6132,
|
|
"eval_samples_per_second": 287.006,
|
|
"eval_steps_per_second": 9.784,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 101.82,
|
|
"learning_rate": 2.136928571886275e-05,
|
|
"loss": 0.695,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 102.0,
|
|
"eval_loss": 0.8098950982093811,
|
|
"eval_runtime": 0.6132,
|
|
"eval_samples_per_second": 287.032,
|
|
"eval_steps_per_second": 9.785,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 102.91,
|
|
"eval_loss": 0.8075168132781982,
|
|
"eval_runtime": 0.6108,
|
|
"eval_samples_per_second": 288.158,
|
|
"eval_steps_per_second": 9.824,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 103.64,
|
|
"learning_rate": 2.0570225210519434e-05,
|
|
"loss": 0.6827,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 104.0,
|
|
"eval_loss": 0.8030515313148499,
|
|
"eval_runtime": 0.6149,
|
|
"eval_samples_per_second": 286.224,
|
|
"eval_steps_per_second": 9.758,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 104.91,
|
|
"eval_loss": 0.7993787527084351,
|
|
"eval_runtime": 0.6164,
|
|
"eval_samples_per_second": 285.523,
|
|
"eval_steps_per_second": 9.734,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 105.45,
|
|
"learning_rate": 1.9775810924218125e-05,
|
|
"loss": 0.6778,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 106.0,
|
|
"eval_loss": 0.7955842614173889,
|
|
"eval_runtime": 0.6123,
|
|
"eval_samples_per_second": 287.451,
|
|
"eval_steps_per_second": 9.799,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 106.91,
|
|
"eval_loss": 0.7906305193901062,
|
|
"eval_runtime": 0.6081,
|
|
"eval_samples_per_second": 289.432,
|
|
"eval_steps_per_second": 9.867,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 107.27,
|
|
"learning_rate": 1.8986876090843667e-05,
|
|
"loss": 0.6753,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 108.0,
|
|
"eval_loss": 0.7853391766548157,
|
|
"eval_runtime": 0.6095,
|
|
"eval_samples_per_second": 288.773,
|
|
"eval_steps_per_second": 9.845,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 108.91,
|
|
"eval_loss": 0.7820215225219727,
|
|
"eval_runtime": 0.618,
|
|
"eval_samples_per_second": 284.769,
|
|
"eval_steps_per_second": 9.708,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 109.09,
|
|
"learning_rate": 1.820424819409143e-05,
|
|
"loss": 0.6654,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 110.0,
|
|
"eval_loss": 0.7762172222137451,
|
|
"eval_runtime": 0.6112,
|
|
"eval_samples_per_second": 287.94,
|
|
"eval_steps_per_second": 9.816,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 110.91,
|
|
"learning_rate": 1.7428748102551237e-05,
|
|
"loss": 0.6621,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 110.91,
|
|
"eval_loss": 0.7716686725616455,
|
|
"eval_runtime": 0.6158,
|
|
"eval_samples_per_second": 285.825,
|
|
"eval_steps_per_second": 9.744,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 112.0,
|
|
"eval_loss": 0.7677438855171204,
|
|
"eval_runtime": 0.6074,
|
|
"eval_samples_per_second": 289.75,
|
|
"eval_steps_per_second": 9.878,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 112.73,
|
|
"learning_rate": 1.666118920872949e-05,
|
|
"loss": 0.6579,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 112.91,
|
|
"eval_loss": 0.7632172107696533,
|
|
"eval_runtime": 0.6086,
|
|
"eval_samples_per_second": 289.198,
|
|
"eval_steps_per_second": 9.859,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 114.0,
|
|
"eval_loss": 0.760553777217865,
|
|
"eval_runtime": 0.6094,
|
|
"eval_samples_per_second": 288.804,
|
|
"eval_steps_per_second": 9.846,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 114.55,
|
|
"learning_rate": 1.5902376575912815e-05,
|
|
"loss": 0.6412,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 114.91,
|
|
"eval_loss": 0.7592495083808899,
|
|
"eval_runtime": 0.6136,
|
|
"eval_samples_per_second": 286.828,
|
|
"eval_steps_per_second": 9.778,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 116.0,
|
|
"eval_loss": 0.7567878365516663,
|
|
"eval_runtime": 0.6111,
|
|
"eval_samples_per_second": 288.02,
|
|
"eval_steps_per_second": 9.819,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 116.36,
|
|
"learning_rate": 1.5153106093767827e-05,
|
|
"loss": 0.6403,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 116.91,
|
|
"eval_loss": 0.7550404071807861,
|
|
"eval_runtime": 0.6079,
|
|
"eval_samples_per_second": 289.517,
|
|
"eval_steps_per_second": 9.87,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 118.0,
|
|
"eval_loss": 0.7509838342666626,
|
|
"eval_runtime": 0.6082,
|
|
"eval_samples_per_second": 289.365,
|
|
"eval_steps_per_second": 9.865,
|
|
"step": 649
|
|
},
|
|
{
|
|
"epoch": 118.18,
|
|
"learning_rate": 1.4414163643562755e-05,
|
|
"loss": 0.6358,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 118.91,
|
|
"eval_loss": 0.7504873871803284,
|
|
"eval_runtime": 0.6087,
|
|
"eval_samples_per_second": 289.117,
|
|
"eval_steps_per_second": 9.856,
|
|
"step": 654
|
|
},
|
|
{
|
|
"epoch": 120.0,
|
|
"learning_rate": 1.368632427388653e-05,
|
|
"loss": 0.6328,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 120.0,
|
|
"eval_loss": 0.747511625289917,
|
|
"eval_runtime": 0.6084,
|
|
"eval_samples_per_second": 289.265,
|
|
"eval_steps_per_second": 9.861,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 120.91,
|
|
"eval_loss": 0.7451759576797485,
|
|
"eval_runtime": 0.6218,
|
|
"eval_samples_per_second": 283.031,
|
|
"eval_steps_per_second": 9.649,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 121.82,
|
|
"learning_rate": 1.2970351387729873e-05,
|
|
"loss": 0.6233,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 122.0,
|
|
"eval_loss": 0.7407442331314087,
|
|
"eval_runtime": 0.6125,
|
|
"eval_samples_per_second": 287.333,
|
|
"eval_steps_per_second": 9.795,
|
|
"step": 671
|
|
},
|
|
{
|
|
"epoch": 122.91,
|
|
"eval_loss": 0.7380485534667969,
|
|
"eval_runtime": 0.6166,
|
|
"eval_samples_per_second": 285.455,
|
|
"eval_steps_per_second": 9.731,
|
|
"step": 676
|
|
},
|
|
{
|
|
"epoch": 123.64,
|
|
"learning_rate": 1.2266995941780934e-05,
|
|
"loss": 0.6101,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 124.0,
|
|
"eval_loss": 0.7349163293838501,
|
|
"eval_runtime": 0.6112,
|
|
"eval_samples_per_second": 287.969,
|
|
"eval_steps_per_second": 9.817,
|
|
"step": 682
|
|
},
|
|
{
|
|
"epoch": 124.91,
|
|
"eval_loss": 0.7321335077285767,
|
|
"eval_runtime": 0.6135,
|
|
"eval_samples_per_second": 286.879,
|
|
"eval_steps_per_second": 9.78,
|
|
"step": 687
|
|
},
|
|
{
|
|
"epoch": 125.45,
|
|
"learning_rate": 1.1576995658775405e-05,
|
|
"loss": 0.6161,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 126.0,
|
|
"eval_loss": 0.729712724685669,
|
|
"eval_runtime": 0.6087,
|
|
"eval_samples_per_second": 289.133,
|
|
"eval_steps_per_second": 9.857,
|
|
"step": 693
|
|
},
|
|
{
|
|
"epoch": 126.91,
|
|
"eval_loss": 0.7279945611953735,
|
|
"eval_runtime": 0.6139,
|
|
"eval_samples_per_second": 286.689,
|
|
"eval_steps_per_second": 9.773,
|
|
"step": 698
|
|
},
|
|
{
|
|
"epoch": 127.27,
|
|
"learning_rate": 1.0901074253727336e-05,
|
|
"loss": 0.6099,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 128.0,
|
|
"eval_loss": 0.7247954607009888,
|
|
"eval_runtime": 0.6118,
|
|
"eval_samples_per_second": 287.661,
|
|
"eval_steps_per_second": 9.807,
|
|
"step": 704
|
|
},
|
|
{
|
|
"epoch": 128.91,
|
|
"eval_loss": 0.7223231792449951,
|
|
"eval_runtime": 0.6126,
|
|
"eval_samples_per_second": 287.323,
|
|
"eval_steps_per_second": 9.795,
|
|
"step": 709
|
|
},
|
|
{
|
|
"epoch": 129.09,
|
|
"learning_rate": 1.0239940674851941e-05,
|
|
"loss": 0.6051,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 130.0,
|
|
"eval_loss": 0.7180494666099548,
|
|
"eval_runtime": 0.6126,
|
|
"eval_samples_per_second": 287.293,
|
|
"eval_steps_per_second": 9.794,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 130.91,
|
|
"learning_rate": 9.594288359976817e-06,
|
|
"loss": 0.6066,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 130.91,
|
|
"eval_loss": 0.7157444357872009,
|
|
"eval_runtime": 0.6097,
|
|
"eval_samples_per_second": 288.682,
|
|
"eval_steps_per_second": 9.841,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 132.0,
|
|
"eval_loss": 0.7136554718017578,
|
|
"eval_runtime": 0.6158,
|
|
"eval_samples_per_second": 285.829,
|
|
"eval_steps_per_second": 9.744,
|
|
"step": 726
|
|
},
|
|
{
|
|
"epoch": 132.73,
|
|
"learning_rate": 8.964794509221508e-06,
|
|
"loss": 0.5919,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 132.91,
|
|
"eval_loss": 0.7124893665313721,
|
|
"eval_runtime": 0.6109,
|
|
"eval_samples_per_second": 288.085,
|
|
"eval_steps_per_second": 9.821,
|
|
"step": 731
|
|
},
|
|
{
|
|
"epoch": 134.0,
|
|
"eval_loss": 0.7105648517608643,
|
|
"eval_runtime": 0.6159,
|
|
"eval_samples_per_second": 285.767,
|
|
"eval_steps_per_second": 9.742,
|
|
"step": 737
|
|
},
|
|
{
|
|
"epoch": 134.55,
|
|
"learning_rate": 8.352119374707978e-06,
|
|
"loss": 0.5947,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 134.91,
|
|
"eval_loss": 0.7095364928245544,
|
|
"eval_runtime": 0.6176,
|
|
"eval_samples_per_second": 284.964,
|
|
"eval_steps_per_second": 9.715,
|
|
"step": 742
|
|
},
|
|
{
|
|
"epoch": 136.0,
|
|
"eval_loss": 0.7085996866226196,
|
|
"eval_runtime": 0.609,
|
|
"eval_samples_per_second": 288.985,
|
|
"eval_steps_per_second": 9.852,
|
|
"step": 748
|
|
},
|
|
{
|
|
"epoch": 136.36,
|
|
"learning_rate": 7.756905568047393e-06,
|
|
"loss": 0.5847,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 136.91,
|
|
"eval_loss": 0.7068153023719788,
|
|
"eval_runtime": 0.613,
|
|
"eval_samples_per_second": 287.113,
|
|
"eval_steps_per_second": 9.788,
|
|
"step": 753
|
|
},
|
|
{
|
|
"epoch": 138.0,
|
|
"eval_loss": 0.704888105392456,
|
|
"eval_runtime": 0.6127,
|
|
"eval_samples_per_second": 287.267,
|
|
"eval_steps_per_second": 9.793,
|
|
"step": 759
|
|
},
|
|
{
|
|
"epoch": 138.18,
|
|
"learning_rate": 7.179777386329276e-06,
|
|
"loss": 0.5801,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 138.91,
|
|
"eval_loss": 0.7037582993507385,
|
|
"eval_runtime": 0.614,
|
|
"eval_samples_per_second": 286.667,
|
|
"eval_steps_per_second": 9.773,
|
|
"step": 764
|
|
},
|
|
{
|
|
"epoch": 140.0,
|
|
"learning_rate": 6.621340157319997e-06,
|
|
"loss": 0.5777,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 140.0,
|
|
"eval_loss": 0.7029330730438232,
|
|
"eval_runtime": 0.6156,
|
|
"eval_samples_per_second": 285.908,
|
|
"eval_steps_per_second": 9.747,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 140.91,
|
|
"eval_loss": 0.702335774898529,
|
|
"eval_runtime": 0.6801,
|
|
"eval_samples_per_second": 258.786,
|
|
"eval_steps_per_second": 8.822,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 141.82,
|
|
"learning_rate": 6.082179604557617e-06,
|
|
"loss": 0.5775,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 142.0,
|
|
"eval_loss": 0.701381266117096,
|
|
"eval_runtime": 0.6168,
|
|
"eval_samples_per_second": 285.34,
|
|
"eval_steps_per_second": 9.728,
|
|
"step": 781
|
|
},
|
|
{
|
|
"epoch": 142.91,
|
|
"eval_loss": 0.6989457011222839,
|
|
"eval_runtime": 0.6106,
|
|
"eval_samples_per_second": 288.22,
|
|
"eval_steps_per_second": 9.826,
|
|
"step": 786
|
|
},
|
|
{
|
|
"epoch": 143.64,
|
|
"learning_rate": 5.562861233008774e-06,
|
|
"loss": 0.5766,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 144.0,
|
|
"eval_loss": 0.696780264377594,
|
|
"eval_runtime": 0.6081,
|
|
"eval_samples_per_second": 289.427,
|
|
"eval_steps_per_second": 9.867,
|
|
"step": 792
|
|
},
|
|
{
|
|
"epoch": 144.91,
|
|
"eval_loss": 0.6954607367515564,
|
|
"eval_runtime": 0.6121,
|
|
"eval_samples_per_second": 287.553,
|
|
"eval_steps_per_second": 9.803,
|
|
"step": 797
|
|
},
|
|
{
|
|
"epoch": 145.45,
|
|
"learning_rate": 5.063929735931985e-06,
|
|
"loss": 0.5799,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 146.0,
|
|
"eval_loss": 0.694266140460968,
|
|
"eval_runtime": 0.6134,
|
|
"eval_samples_per_second": 286.919,
|
|
"eval_steps_per_second": 9.781,
|
|
"step": 803
|
|
},
|
|
{
|
|
"epoch": 146.91,
|
|
"eval_loss": 0.6934513449668884,
|
|
"eval_runtime": 0.6125,
|
|
"eval_samples_per_second": 287.341,
|
|
"eval_steps_per_second": 9.796,
|
|
"step": 808
|
|
},
|
|
{
|
|
"epoch": 147.27,
|
|
"learning_rate": 4.585908423569724e-06,
|
|
"loss": 0.5724,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 148.0,
|
|
"eval_loss": 0.6923357248306274,
|
|
"eval_runtime": 0.6116,
|
|
"eval_samples_per_second": 287.776,
|
|
"eval_steps_per_second": 9.811,
|
|
"step": 814
|
|
},
|
|
{
|
|
"epoch": 148.91,
|
|
"eval_loss": 0.6916863918304443,
|
|
"eval_runtime": 0.6209,
|
|
"eval_samples_per_second": 283.459,
|
|
"eval_steps_per_second": 9.663,
|
|
"step": 819
|
|
},
|
|
{
|
|
"epoch": 149.09,
|
|
"learning_rate": 4.129298674268225e-06,
|
|
"loss": 0.5636,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 150.0,
|
|
"eval_loss": 0.6903253197669983,
|
|
"eval_runtime": 0.6129,
|
|
"eval_samples_per_second": 287.181,
|
|
"eval_steps_per_second": 9.79,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 150.91,
|
|
"learning_rate": 3.694579408600771e-06,
|
|
"loss": 0.5718,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 150.91,
|
|
"eval_loss": 0.6895003318786621,
|
|
"eval_runtime": 0.612,
|
|
"eval_samples_per_second": 287.574,
|
|
"eval_steps_per_second": 9.804,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 152.0,
|
|
"eval_loss": 0.6892096996307373,
|
|
"eval_runtime": 0.6102,
|
|
"eval_samples_per_second": 288.427,
|
|
"eval_steps_per_second": 9.833,
|
|
"step": 836
|
|
},
|
|
{
|
|
"epoch": 152.73,
|
|
"learning_rate": 3.2822065870462217e-06,
|
|
"loss": 0.5656,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 152.91,
|
|
"eval_loss": 0.6885128617286682,
|
|
"eval_runtime": 0.6079,
|
|
"eval_samples_per_second": 289.516,
|
|
"eval_steps_per_second": 9.87,
|
|
"step": 841
|
|
},
|
|
{
|
|
"epoch": 154.0,
|
|
"eval_loss": 0.6882739067077637,
|
|
"eval_runtime": 0.6116,
|
|
"eval_samples_per_second": 287.773,
|
|
"eval_steps_per_second": 9.81,
|
|
"step": 847
|
|
},
|
|
{
|
|
"epoch": 154.55,
|
|
"learning_rate": 2.892612731749414e-06,
|
|
"loss": 0.5642,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 154.91,
|
|
"eval_loss": 0.6874551773071289,
|
|
"eval_runtime": 0.6157,
|
|
"eval_samples_per_second": 285.833,
|
|
"eval_steps_per_second": 9.744,
|
|
"step": 852
|
|
},
|
|
{
|
|
"epoch": 156.0,
|
|
"eval_loss": 0.6862676739692688,
|
|
"eval_runtime": 0.6125,
|
|
"eval_samples_per_second": 287.37,
|
|
"eval_steps_per_second": 9.797,
|
|
"step": 858
|
|
},
|
|
{
|
|
"epoch": 156.36,
|
|
"learning_rate": 2.52620647286512e-06,
|
|
"loss": 0.5681,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 156.91,
|
|
"eval_loss": 0.6856226325035095,
|
|
"eval_runtime": 0.6086,
|
|
"eval_samples_per_second": 289.209,
|
|
"eval_steps_per_second": 9.859,
|
|
"step": 863
|
|
},
|
|
{
|
|
"epoch": 158.0,
|
|
"eval_loss": 0.6847879886627197,
|
|
"eval_runtime": 0.6112,
|
|
"eval_samples_per_second": 287.948,
|
|
"eval_steps_per_second": 9.816,
|
|
"step": 869
|
|
},
|
|
{
|
|
"epoch": 158.18,
|
|
"learning_rate": 2.183372119961499e-06,
|
|
"loss": 0.5618,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 158.91,
|
|
"eval_loss": 0.6842586398124695,
|
|
"eval_runtime": 0.6143,
|
|
"eval_samples_per_second": 286.51,
|
|
"eval_steps_per_second": 9.767,
|
|
"step": 874
|
|
},
|
|
{
|
|
"epoch": 160.0,
|
|
"learning_rate": 1.864469258932397e-06,
|
|
"loss": 0.5485,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 160.0,
|
|
"eval_loss": 0.6837961077690125,
|
|
"eval_runtime": 0.6085,
|
|
"eval_samples_per_second": 289.259,
|
|
"eval_steps_per_second": 9.861,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 160.91,
|
|
"eval_loss": 0.6834523677825928,
|
|
"eval_runtime": 0.609,
|
|
"eval_samples_per_second": 288.988,
|
|
"eval_steps_per_second": 9.852,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 161.82,
|
|
"learning_rate": 1.5698323748414124e-06,
|
|
"loss": 0.5611,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 162.0,
|
|
"eval_loss": 0.6828814744949341,
|
|
"eval_runtime": 0.6118,
|
|
"eval_samples_per_second": 287.681,
|
|
"eval_steps_per_second": 9.807,
|
|
"step": 891
|
|
},
|
|
{
|
|
"epoch": 162.91,
|
|
"eval_loss": 0.6825764775276184,
|
|
"eval_runtime": 0.6147,
|
|
"eval_samples_per_second": 286.328,
|
|
"eval_steps_per_second": 9.761,
|
|
"step": 896
|
|
},
|
|
{
|
|
"epoch": 163.64,
|
|
"learning_rate": 1.2997705010932393e-06,
|
|
"loss": 0.5552,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 164.0,
|
|
"eval_loss": 0.6823921203613281,
|
|
"eval_runtime": 0.6119,
|
|
"eval_samples_per_second": 287.631,
|
|
"eval_steps_per_second": 9.806,
|
|
"step": 902
|
|
},
|
|
{
|
|
"epoch": 164.91,
|
|
"eval_loss": 0.6820687651634216,
|
|
"eval_runtime": 0.6085,
|
|
"eval_samples_per_second": 289.234,
|
|
"eval_steps_per_second": 9.86,
|
|
"step": 907
|
|
},
|
|
{
|
|
"epoch": 165.45,
|
|
"learning_rate": 1.0545668953003241e-06,
|
|
"loss": 0.5675,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 166.0,
|
|
"eval_loss": 0.6815680861473083,
|
|
"eval_runtime": 0.6088,
|
|
"eval_samples_per_second": 289.108,
|
|
"eval_steps_per_second": 9.856,
|
|
"step": 913
|
|
},
|
|
{
|
|
"epoch": 166.91,
|
|
"eval_loss": 0.6814342141151428,
|
|
"eval_runtime": 0.6085,
|
|
"eval_samples_per_second": 289.246,
|
|
"eval_steps_per_second": 9.861,
|
|
"step": 918
|
|
},
|
|
{
|
|
"epoch": 167.27,
|
|
"learning_rate": 8.344787421847217e-07,
|
|
"loss": 0.5586,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 168.0,
|
|
"eval_loss": 0.6812211871147156,
|
|
"eval_runtime": 0.6108,
|
|
"eval_samples_per_second": 288.154,
|
|
"eval_steps_per_second": 9.823,
|
|
"step": 924
|
|
},
|
|
{
|
|
"epoch": 168.91,
|
|
"eval_loss": 0.6808667778968811,
|
|
"eval_runtime": 0.6126,
|
|
"eval_samples_per_second": 287.282,
|
|
"eval_steps_per_second": 9.794,
|
|
"step": 929
|
|
},
|
|
{
|
|
"epoch": 169.09,
|
|
"learning_rate": 6.397368838268497e-07,
|
|
"loss": 0.5577,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 170.0,
|
|
"eval_loss": 0.6807306408882141,
|
|
"eval_runtime": 0.6126,
|
|
"eval_samples_per_second": 287.3,
|
|
"eval_steps_per_second": 9.794,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 170.91,
|
|
"learning_rate": 4.7054557754402373e-07,
|
|
"loss": 0.5487,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 170.91,
|
|
"eval_loss": 0.6806881427764893,
|
|
"eval_runtime": 0.6111,
|
|
"eval_samples_per_second": 288.019,
|
|
"eval_steps_per_second": 9.819,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 172.0,
|
|
"eval_loss": 0.6806466579437256,
|
|
"eval_runtime": 0.6156,
|
|
"eval_samples_per_second": 285.889,
|
|
"eval_steps_per_second": 9.746,
|
|
"step": 946
|
|
},
|
|
{
|
|
"epoch": 172.73,
|
|
"learning_rate": 3.270822816527325e-07,
|
|
"loss": 0.5532,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 172.91,
|
|
"eval_loss": 0.6805295348167419,
|
|
"eval_runtime": 0.6098,
|
|
"eval_samples_per_second": 288.61,
|
|
"eval_steps_per_second": 9.839,
|
|
"step": 951
|
|
},
|
|
{
|
|
"epoch": 174.0,
|
|
"eval_loss": 0.6804378628730774,
|
|
"eval_runtime": 0.6144,
|
|
"eval_samples_per_second": 286.48,
|
|
"eval_steps_per_second": 9.766,
|
|
"step": 957
|
|
},
|
|
{
|
|
"epoch": 174.55,
|
|
"learning_rate": 2.094974693393731e-07,
|
|
"loss": 0.5601,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 174.91,
|
|
"eval_loss": 0.6803626418113708,
|
|
"eval_runtime": 0.6093,
|
|
"eval_samples_per_second": 288.837,
|
|
"eval_steps_per_second": 9.847,
|
|
"step": 962
|
|
},
|
|
{
|
|
"epoch": 176.0,
|
|
"eval_loss": 0.6802287101745605,
|
|
"eval_runtime": 0.6105,
|
|
"eval_samples_per_second": 288.293,
|
|
"eval_steps_per_second": 9.828,
|
|
"step": 968
|
|
},
|
|
{
|
|
"epoch": 176.36,
|
|
"learning_rate": 1.1791447083465134e-07,
|
|
"loss": 0.5558,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 176.91,
|
|
"eval_loss": 0.6801658272743225,
|
|
"eval_runtime": 0.6151,
|
|
"eval_samples_per_second": 286.123,
|
|
"eval_steps_per_second": 9.754,
|
|
"step": 973
|
|
},
|
|
{
|
|
"epoch": 178.0,
|
|
"eval_loss": 0.6801168322563171,
|
|
"eval_runtime": 0.6101,
|
|
"eval_samples_per_second": 288.471,
|
|
"eval_steps_per_second": 9.834,
|
|
"step": 979
|
|
},
|
|
{
|
|
"epoch": 178.18,
|
|
"learning_rate": 5.242934405720879e-08,
|
|
"loss": 0.5601,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 178.91,
|
|
"eval_loss": 0.6801341772079468,
|
|
"eval_runtime": 0.6154,
|
|
"eval_samples_per_second": 285.986,
|
|
"eval_steps_per_second": 9.75,
|
|
"step": 984
|
|
},
|
|
{
|
|
"epoch": 180.0,
|
|
"learning_rate": 1.3110773862126669e-08,
|
|
"loss": 0.5612,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 180.0,
|
|
"eval_loss": 0.680082380771637,
|
|
"eval_runtime": 0.6089,
|
|
"eval_samples_per_second": 289.031,
|
|
"eval_steps_per_second": 9.853,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 180.91,
|
|
"eval_loss": 0.6800580620765686,
|
|
"eval_runtime": 0.6814,
|
|
"eval_samples_per_second": 258.31,
|
|
"eval_steps_per_second": 8.806,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 181.82,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.5508,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 181.82,
|
|
"eval_loss": 0.6800865530967712,
|
|
"eval_runtime": 0.6303,
|
|
"eval_samples_per_second": 279.23,
|
|
"eval_steps_per_second": 9.519,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 181.82,
|
|
"step": 1000,
|
|
"total_flos": 1.2949054332736635e+18,
|
|
"train_loss": 0.8898317427635193,
|
|
"train_runtime": 1547.2406,
|
|
"train_samples_per_second": 90.871,
|
|
"train_steps_per_second": 0.646
|
|
}
|
|
],
|
|
"max_steps": 1000,
|
|
"num_train_epochs": 200,
|
|
"total_flos": 1.2949054332736635e+18,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|