diff --git a/llama2_wenan_qlora_50e/README.md b/llama2_wenan_qlora_50e/README.md new file mode 100644 index 0000000..2c4d262 --- /dev/null +++ b/llama2_wenan_qlora_50e/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: /home/sdk_models/llama2-7b-hf/ +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.7.1 \ No newline at end of file diff --git a/llama2_wenan_qlora_50e/adapter_config.json b/llama2_wenan_qlora_50e/adapter_config.json new file mode 100644 index 0000000..bee0b91 --- /dev/null +++ b/llama2_wenan_qlora_50e/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/sdk_models/llama2-7b-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llama2_wenan_qlora_50e/adapter_model.bin b/llama2_wenan_qlora_50e/adapter_model.bin new file mode 100644 index 0000000..df3a010 Binary files /dev/null and b/llama2_wenan_qlora_50e/adapter_model.bin differ diff --git a/llama2_wenan_qlora_50e/all_results.json b/llama2_wenan_qlora_50e/all_results.json new file mode 100644 index 0000000..64379d1 --- /dev/null +++ b/llama2_wenan_qlora_50e/all_results.json @@ -0,0 +1,11 @@ +{ + "epoch": 49.48, + "eval_loss": 1.4058780670166016, + "eval_runtime": 3.3088, + "eval_samples_per_second": 232.711, + "eval_steps_per_second": 7.556, + "train_loss": 1.4999438818295796, + "train_runtime": 2257.7963, + "train_samples_per_second": 68.164, + "train_steps_per_second": 0.531 +} \ No newline at end of file diff --git a/llama2_wenan_qlora_50e/eval_results.json b/llama2_wenan_qlora_50e/eval_results.json new file mode 100644 index 0000000..82bec31 --- /dev/null +++ b/llama2_wenan_qlora_50e/eval_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 49.48, + "eval_loss": 1.4058780670166016, + "eval_runtime": 3.3088, + "eval_samples_per_second": 232.711, + "eval_steps_per_second": 7.556 +} \ No newline at end of file diff --git a/llama2_wenan_qlora_50e/train_results.json b/llama2_wenan_qlora_50e/train_results.json new file mode 100644 index 0000000..dc390cc --- /dev/null +++ b/llama2_wenan_qlora_50e/train_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 49.48, + "train_loss": 1.4999438818295796, + "train_runtime": 2257.7963, + "train_samples_per_second": 68.164, + "train_steps_per_second": 0.531 +} \ No newline at end of file diff --git a/llama2_wenan_qlora_50e/trainer_state.json b/llama2_wenan_qlora_50e/trainer_state.json new file mode 100644 index 0000000..4d61a28 --- /dev/null +++ b/llama2_wenan_qlora_50e/trainer_state.json @@ -0,0 +1,1145 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 49.48453608247423, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.41, + "learning_rate": 1.388888888888889e-05, + "loss": 2.3154, + "step": 10 + }, + { + "epoch": 0.82, + "learning_rate": 2.777777777777778e-05, + "loss": 2.2935, + "step": 20 + }, + { + "epoch": 0.99, + "eval_loss": 2.2255096435546875, + "eval_runtime": 3.3098, + "eval_samples_per_second": 232.639, + "eval_steps_per_second": 7.553, + "step": 24 + }, + { + "epoch": 1.24, + "learning_rate": 4.166666666666667e-05, + "loss": 2.2031, + "step": 30 + }, + { + "epoch": 1.65, + "learning_rate": 4.999854313415309e-05, + "loss": 2.1121, + "step": 40 + }, + { + "epoch": 1.98, + "eval_loss": 2.0086605548858643, + "eval_runtime": 3.3338, + "eval_samples_per_second": 230.965, + "eval_steps_per_second": 7.499, + "step": 48 + }, + { + "epoch": 2.06, + "learning_rate": 4.9982155343321184e-05, + "loss": 2.0381, + "step": 50 + }, + { + "epoch": 2.47, + "learning_rate": 4.9947570655942796e-05, + "loss": 1.9515, + "step": 60 + }, + { + "epoch": 2.89, + "learning_rate": 4.989481426335828e-05, + "loss": 1.8964, + "step": 70 + }, + { + "epoch": 2.97, + "eval_loss": 1.8894507884979248, + "eval_runtime": 3.3135, + "eval_samples_per_second": 232.382, + "eval_steps_per_second": 7.545, + "step": 72 + }, + { + "epoch": 3.3, + "learning_rate": 4.982392459310141e-05, + "loss": 1.8723, + "step": 80 + }, + { + "epoch": 3.71, + "learning_rate": 4.9734953280908904e-05, + "loss": 1.8184, + "step": 90 + }, + { + "epoch": 4.0, + "eval_loss": 1.807281732559204, + "eval_runtime": 3.3174, + "eval_samples_per_second": 232.107, + "eval_steps_per_second": 7.536, + "step": 97 + }, + { + "epoch": 4.12, + "learning_rate": 4.9627965133109165e-05, + "loss": 1.8065, + "step": 100 + }, + { + "epoch": 4.54, + "learning_rate": 4.950303807941764e-05, + "loss": 1.7479, + "step": 110 + }, + { + "epoch": 4.95, + "learning_rate": 4.936026311617316e-05, + "loss": 1.7511, + "step": 120 + }, + { + "epoch": 4.99, + "eval_loss": 1.7449322938919067, + "eval_runtime": 3.3418, + "eval_samples_per_second": 230.412, + "eval_steps_per_second": 7.481, + "step": 121 + }, + { + "epoch": 5.36, + "learning_rate": 4.919974424005652e-05, + "loss": 1.7217, + "step": 130 + }, + { + "epoch": 5.77, + "learning_rate": 4.902159837233985e-05, + "loss": 1.7203, + "step": 140 + }, + { + "epoch": 5.98, + "eval_loss": 1.7159528732299805, + "eval_runtime": 3.3301, + "eval_samples_per_second": 231.221, + "eval_steps_per_second": 7.507, + "step": 145 + }, + { + "epoch": 6.19, + "learning_rate": 4.882595527372152e-05, + "loss": 1.7072, + "step": 150 + }, + { + "epoch": 6.6, + "learning_rate": 4.8612957449809135e-05, + "loss": 1.6995, + "step": 160 + }, + { + "epoch": 6.97, + "eval_loss": 1.6927465200424194, + "eval_runtime": 3.3065, + "eval_samples_per_second": 232.872, + "eval_steps_per_second": 7.561, + "step": 169 + }, + { + "epoch": 7.01, + "learning_rate": 4.838276004731892e-05, + "loss": 1.684, + "step": 170 + }, + { + "epoch": 7.42, + "learning_rate": 4.813553074106761e-05, + "loss": 1.671, + "step": 180 + }, + { + "epoch": 7.84, + "learning_rate": 4.787144961183874e-05, + "loss": 1.6634, + "step": 190 + }, + { + "epoch": 8.0, + "eval_loss": 1.6678295135498047, + "eval_runtime": 3.3139, + "eval_samples_per_second": 232.354, + "eval_steps_per_second": 7.544, + "step": 194 + }, + { + "epoch": 8.25, + "learning_rate": 4.759070901521263e-05, + "loss": 1.6528, + "step": 200 + }, + { + "epoch": 8.66, + "learning_rate": 4.7293513441455364e-05, + "loss": 1.6334, + "step": 210 + }, + { + "epoch": 8.99, + "eval_loss": 1.6490652561187744, + "eval_runtime": 3.3203, + "eval_samples_per_second": 231.91, + "eval_steps_per_second": 7.53, + "step": 218 + }, + { + "epoch": 9.07, + "learning_rate": 4.698007936656891e-05, + "loss": 1.636, + "step": 220 + }, + { + "epoch": 9.48, + "learning_rate": 4.665063509461097e-05, + "loss": 1.6254, + "step": 230 + }, + { + "epoch": 9.9, + "learning_rate": 4.630542059139924e-05, + "loss": 1.6115, + "step": 240 + }, + { + "epoch": 9.98, + "eval_loss": 1.6315476894378662, + "eval_runtime": 3.326, + "eval_samples_per_second": 231.509, + "eval_steps_per_second": 7.517, + "step": 242 + }, + { + "epoch": 10.31, + "learning_rate": 4.59446873097213e-05, + "loss": 1.5838, + "step": 250 + }, + { + "epoch": 10.72, + "learning_rate": 4.556869800617754e-05, + "loss": 1.6077, + "step": 260 + }, + { + "epoch": 10.97, + "eval_loss": 1.6146764755249023, + "eval_runtime": 3.3121, + "eval_samples_per_second": 232.48, + "eval_steps_per_second": 7.548, + "step": 266 + }, + { + "epoch": 11.13, + "learning_rate": 4.517772654979023e-05, + "loss": 1.6012, + "step": 270 + }, + { + "epoch": 11.55, + "learning_rate": 4.4772057722518643e-05, + "loss": 1.5915, + "step": 280 + }, + { + "epoch": 11.96, + "learning_rate": 4.435198701182492e-05, + "loss": 1.5884, + "step": 290 + }, + { + "epoch": 12.0, + "eval_loss": 1.5998302698135376, + "eval_runtime": 3.3303, + "eval_samples_per_second": 231.208, + "eval_steps_per_second": 7.507, + "step": 291 + }, + { + "epoch": 12.37, + "learning_rate": 4.391782039544238e-05, + "loss": 1.5503, + "step": 300 + }, + { + "epoch": 12.78, + "learning_rate": 4.346987411850253e-05, + "loss": 1.5488, + "step": 310 + }, + { + "epoch": 12.99, + "eval_loss": 1.582898736000061, + "eval_runtime": 3.3255, + "eval_samples_per_second": 231.544, + "eval_steps_per_second": 7.518, + "step": 315 + }, + { + "epoch": 13.2, + "learning_rate": 4.30084744631835e-05, + "loss": 1.5662, + "step": 320 + }, + { + "epoch": 13.61, + "learning_rate": 4.253395751104748e-05, + "loss": 1.5438, + "step": 330 + }, + { + "epoch": 13.98, + "eval_loss": 1.5704370737075806, + "eval_runtime": 3.3233, + "eval_samples_per_second": 231.694, + "eval_steps_per_second": 7.523, + "step": 339 + }, + { + "epoch": 14.02, + "learning_rate": 4.20466688982403e-05, + "loss": 1.5412, + "step": 340 + }, + { + "epoch": 14.43, + "learning_rate": 4.154696356373154e-05, + "loss": 1.535, + "step": 350 + }, + { + "epoch": 14.85, + "learning_rate": 4.10352054907785e-05, + "loss": 1.5429, + "step": 360 + }, + { + "epoch": 14.97, + "eval_loss": 1.5577681064605713, + "eval_runtime": 3.3105, + "eval_samples_per_second": 232.594, + "eval_steps_per_second": 7.552, + "step": 363 + }, + { + "epoch": 15.26, + "learning_rate": 4.051176744180227e-05, + "loss": 1.515, + "step": 370 + }, + { + "epoch": 15.67, + "learning_rate": 3.997703068686923e-05, + "loss": 1.5198, + "step": 380 + }, + { + "epoch": 16.0, + "eval_loss": 1.54729163646698, + "eval_runtime": 3.3137, + "eval_samples_per_second": 232.37, + "eval_steps_per_second": 7.544, + "step": 388 + }, + { + "epoch": 16.08, + "learning_rate": 3.943138472597549e-05, + "loss": 1.5096, + "step": 390 + }, + { + "epoch": 16.49, + "learning_rate": 3.887522700533675e-05, + "loss": 1.5052, + "step": 400 + }, + { + "epoch": 16.91, + "learning_rate": 3.8308962627890174e-05, + "loss": 1.4901, + "step": 410 + }, + { + "epoch": 16.99, + "eval_loss": 1.5360088348388672, + "eval_runtime": 3.3236, + "eval_samples_per_second": 231.68, + "eval_steps_per_second": 7.522, + "step": 412 + }, + { + "epoch": 17.32, + "learning_rate": 3.773300405821908e-05, + "loss": 1.5038, + "step": 420 + }, + { + "epoch": 17.73, + "learning_rate": 3.714777082211551e-05, + "loss": 1.5092, + "step": 430 + }, + { + "epoch": 17.98, + "eval_loss": 1.5263161659240723, + "eval_runtime": 3.332, + "eval_samples_per_second": 231.089, + "eval_steps_per_second": 7.503, + "step": 436 + }, + { + "epoch": 18.14, + "learning_rate": 3.6553689200999426e-05, + "loss": 1.481, + "step": 440 + }, + { + "epoch": 18.56, + "learning_rate": 3.595119192141706e-05, + "loss": 1.483, + "step": 450 + }, + { + "epoch": 18.97, + "learning_rate": 3.534071783984479e-05, + "loss": 1.4879, + "step": 460 + }, + { + "epoch": 18.97, + "eval_loss": 1.518540382385254, + "eval_runtime": 3.3283, + "eval_samples_per_second": 231.351, + "eval_steps_per_second": 7.511, + "step": 460 + }, + { + "epoch": 19.38, + "learning_rate": 3.472271162302789e-05, + "loss": 1.4647, + "step": 470 + }, + { + "epoch": 19.79, + "learning_rate": 3.409762342408719e-05, + "loss": 1.4799, + "step": 480 + }, + { + "epoch": 20.0, + "eval_loss": 1.5089068412780762, + "eval_runtime": 3.6806, + "eval_samples_per_second": 209.206, + "eval_steps_per_second": 6.792, + "step": 485 + }, + { + "epoch": 20.21, + "learning_rate": 3.346590855462939e-05, + "loss": 1.4678, + "step": 490 + }, + { + "epoch": 20.62, + "learning_rate": 3.2828027153100065e-05, + "loss": 1.4658, + "step": 500 + }, + { + "epoch": 20.99, + "eval_loss": 1.498407006263733, + "eval_runtime": 3.3262, + "eval_samples_per_second": 231.494, + "eval_steps_per_second": 7.516, + "step": 509 + }, + { + "epoch": 21.03, + "learning_rate": 3.218444384962071e-05, + "loss": 1.4547, + "step": 510 + }, + { + "epoch": 21.44, + "learning_rate": 3.153562742755414e-05, + "loss": 1.4633, + "step": 520 + }, + { + "epoch": 21.86, + "learning_rate": 3.088205048204469e-05, + "loss": 1.4442, + "step": 530 + }, + { + "epoch": 21.98, + "eval_loss": 1.489104986190796, + "eval_runtime": 3.323, + "eval_samples_per_second": 231.719, + "eval_steps_per_second": 7.523, + "step": 533 + }, + { + "epoch": 22.27, + "learning_rate": 3.0224189075781884e-05, + "loss": 1.4345, + "step": 540 + }, + { + "epoch": 22.68, + "learning_rate": 2.9562522392238346e-05, + "loss": 1.4434, + "step": 550 + }, + { + "epoch": 22.97, + "eval_loss": 1.482366681098938, + "eval_runtime": 3.3203, + "eval_samples_per_second": 231.906, + "eval_steps_per_second": 7.529, + "step": 557 + }, + { + "epoch": 23.09, + "learning_rate": 2.8897532386634663e-05, + "loss": 1.4529, + "step": 560 + }, + { + "epoch": 23.51, + "learning_rate": 2.8229703434885163e-05, + "loss": 1.4334, + "step": 570 + }, + { + "epoch": 23.92, + "learning_rate": 2.7559521980780568e-05, + "loss": 1.4423, + "step": 580 + }, + { + "epoch": 24.0, + "eval_loss": 1.474360466003418, + "eval_runtime": 3.3291, + "eval_samples_per_second": 231.294, + "eval_steps_per_second": 7.51, + "step": 582 + }, + { + "epoch": 24.33, + "learning_rate": 2.68874761816644e-05, + "loss": 1.4187, + "step": 590 + }, + { + "epoch": 24.74, + "learning_rate": 2.621405555286121e-05, + "loss": 1.4299, + "step": 600 + }, + { + "epoch": 24.99, + "eval_loss": 1.4700332880020142, + "eval_runtime": 3.3237, + "eval_samples_per_second": 231.671, + "eval_steps_per_second": 7.522, + "step": 606 + }, + { + "epoch": 25.15, + "learning_rate": 2.5539750611115697e-05, + "loss": 1.4282, + "step": 610 + }, + { + "epoch": 25.57, + "learning_rate": 2.4865052517302396e-05, + "loss": 1.4229, + "step": 620 + }, + { + "epoch": 25.98, + "learning_rate": 2.419045271866611e-05, + "loss": 1.4034, + "step": 630 + }, + { + "epoch": 25.98, + "eval_loss": 1.4617327451705933, + "eval_runtime": 3.3167, + "eval_samples_per_second": 232.159, + "eval_steps_per_second": 7.538, + "step": 630 + }, + { + "epoch": 26.39, + "learning_rate": 2.351644259085387e-05, + "loss": 1.4082, + "step": 640 + }, + { + "epoch": 26.8, + "learning_rate": 2.2843513079998983e-05, + "loss": 1.4161, + "step": 650 + }, + { + "epoch": 26.97, + "eval_loss": 1.4545296430587769, + "eval_runtime": 3.3224, + "eval_samples_per_second": 231.76, + "eval_steps_per_second": 7.525, + "step": 654 + }, + { + "epoch": 27.22, + "learning_rate": 2.2172154345117894e-05, + "loss": 1.4092, + "step": 660 + }, + { + "epoch": 27.63, + "learning_rate": 2.1502855401080482e-05, + "loss": 1.3839, + "step": 670 + }, + { + "epoch": 28.0, + "eval_loss": 1.4493850469589233, + "eval_runtime": 3.6643, + "eval_samples_per_second": 210.135, + "eval_steps_per_second": 6.823, + "step": 679 + }, + { + "epoch": 28.04, + "learning_rate": 2.0836103762413638e-05, + "loss": 1.3922, + "step": 680 + }, + { + "epoch": 28.45, + "learning_rate": 2.0172385088197803e-05, + "loss": 1.3896, + "step": 690 + }, + { + "epoch": 28.87, + "learning_rate": 1.9512182828314885e-05, + "loss": 1.4081, + "step": 700 + }, + { + "epoch": 28.99, + "eval_loss": 1.4452208280563354, + "eval_runtime": 3.3261, + "eval_samples_per_second": 231.505, + "eval_steps_per_second": 7.516, + "step": 703 + }, + { + "epoch": 29.28, + "learning_rate": 1.885597787130542e-05, + "loss": 1.3947, + "step": 710 + }, + { + "epoch": 29.69, + "learning_rate": 1.820424819409143e-05, + "loss": 1.3918, + "step": 720 + }, + { + "epoch": 29.98, + "eval_loss": 1.439475655555725, + "eval_runtime": 3.3169, + "eval_samples_per_second": 232.145, + "eval_steps_per_second": 7.537, + "step": 727 + }, + { + "epoch": 30.1, + "learning_rate": 1.7557468513819993e-05, + "loss": 1.3937, + "step": 730 + }, + { + "epoch": 30.52, + "learning_rate": 1.6916109942081293e-05, + "loss": 1.3749, + "step": 740 + }, + { + "epoch": 30.93, + "learning_rate": 1.6280639641752942e-05, + "loss": 1.3788, + "step": 750 + }, + { + "epoch": 30.97, + "eval_loss": 1.4356813430786133, + "eval_runtime": 3.3451, + "eval_samples_per_second": 230.187, + "eval_steps_per_second": 7.474, + "step": 751 + }, + { + "epoch": 31.34, + "learning_rate": 1.5651520486720516e-05, + "loss": 1.3907, + "step": 760 + }, + { + "epoch": 31.75, + "learning_rate": 1.5029210724722126e-05, + "loss": 1.3819, + "step": 770 + }, + { + "epoch": 32.0, + "eval_loss": 1.4313223361968994, + "eval_runtime": 3.3114, + "eval_samples_per_second": 232.53, + "eval_steps_per_second": 7.55, + "step": 776 + }, + { + "epoch": 32.16, + "learning_rate": 1.4414163643562755e-05, + "loss": 1.3903, + "step": 780 + }, + { + "epoch": 32.58, + "learning_rate": 1.3806827240941265e-05, + "loss": 1.3868, + "step": 790 + }, + { + "epoch": 32.99, + "learning_rate": 1.3207643898130853e-05, + "loss": 1.3813, + "step": 800 + }, + { + "epoch": 32.99, + "eval_loss": 1.428261160850525, + "eval_runtime": 3.3143, + "eval_samples_per_second": 232.327, + "eval_steps_per_second": 7.543, + "step": 800 + }, + { + "epoch": 33.4, + "learning_rate": 1.2617050057750322e-05, + "loss": 1.3607, + "step": 810 + }, + { + "epoch": 33.81, + "learning_rate": 1.2035475905861136e-05, + "loss": 1.3785, + "step": 820 + }, + { + "epoch": 33.98, + "eval_loss": 1.4251344203948975, + "eval_runtime": 3.3123, + "eval_samples_per_second": 232.467, + "eval_steps_per_second": 7.548, + "step": 824 + }, + { + "epoch": 34.23, + "learning_rate": 1.1463345058621755e-05, + "loss": 1.3555, + "step": 830 + }, + { + "epoch": 34.64, + "learning_rate": 1.0901074253727336e-05, + "loss": 1.3825, + "step": 840 + }, + { + "epoch": 34.97, + "eval_loss": 1.421394944190979, + "eval_runtime": 3.3263, + "eval_samples_per_second": 231.487, + "eval_steps_per_second": 7.516, + "step": 848 + }, + { + "epoch": 35.05, + "learning_rate": 1.0349073046859828e-05, + "loss": 1.3658, + "step": 850 + }, + { + "epoch": 35.46, + "learning_rate": 9.807743513369272e-06, + "loss": 1.3579, + "step": 860 + }, + { + "epoch": 35.88, + "learning_rate": 9.277479955403887e-06, + "loss": 1.3898, + "step": 870 + }, + { + "epoch": 36.0, + "eval_loss": 1.41862952709198, + "eval_runtime": 3.3199, + "eval_samples_per_second": 231.934, + "eval_steps_per_second": 7.53, + "step": 873 + }, + { + "epoch": 36.29, + "learning_rate": 8.758668614701973e-06, + "loss": 1.3586, + "step": 880 + }, + { + "epoch": 36.7, + "learning_rate": 8.251687391255117e-06, + "loss": 1.3616, + "step": 890 + }, + { + "epoch": 36.99, + "eval_loss": 1.4160585403442383, + "eval_runtime": 3.3202, + "eval_samples_per_second": 231.91, + "eval_steps_per_second": 7.53, + "step": 897 + }, + { + "epoch": 37.11, + "learning_rate": 7.756905568047393e-06, + "loss": 1.3792, + "step": 900 + }, + { + "epoch": 37.53, + "learning_rate": 7.274683542071242e-06, + "loss": 1.3662, + "step": 910 + }, + { + "epoch": 37.94, + "learning_rate": 6.805372561815767e-06, + "loss": 1.3564, + "step": 920 + }, + { + "epoch": 37.98, + "eval_loss": 1.4149819612503052, + "eval_runtime": 3.3197, + "eval_samples_per_second": 231.949, + "eval_steps_per_second": 7.531, + "step": 921 + }, + { + "epoch": 38.35, + "learning_rate": 6.349314471418849e-06, + "loss": 1.3607, + "step": 930 + }, + { + "epoch": 38.76, + "learning_rate": 5.906841461669327e-06, + "loss": 1.3575, + "step": 940 + }, + { + "epoch": 38.97, + "eval_loss": 1.4127955436706543, + "eval_runtime": 3.33, + "eval_samples_per_second": 231.231, + "eval_steps_per_second": 7.507, + "step": 945 + }, + { + "epoch": 39.18, + "learning_rate": 5.4782758280406e-06, + "loss": 1.3486, + "step": 950 + }, + { + "epoch": 39.59, + "learning_rate": 5.063929735931985e-06, + "loss": 1.366, + "step": 960 + }, + { + "epoch": 40.0, + "learning_rate": 4.66410499328874e-06, + "loss": 1.3461, + "step": 970 + }, + { + "epoch": 40.0, + "eval_loss": 1.411194920539856, + "eval_runtime": 3.3078, + "eval_samples_per_second": 232.784, + "eval_steps_per_second": 7.558, + "step": 970 + }, + { + "epoch": 40.41, + "learning_rate": 4.279092830766471e-06, + "loss": 1.3545, + "step": 980 + }, + { + "epoch": 40.82, + "learning_rate": 3.90917368959989e-06, + "loss": 1.3419, + "step": 990 + }, + { + "epoch": 40.99, + "eval_loss": 1.4097695350646973, + "eval_runtime": 3.3186, + "eval_samples_per_second": 232.025, + "eval_steps_per_second": 7.533, + "step": 994 + }, + { + "epoch": 41.24, + "learning_rate": 3.5546170173306444e-06, + "loss": 1.3435, + "step": 1000 + }, + { + "epoch": 41.65, + "learning_rate": 3.215681071542867e-06, + "loss": 1.3453, + "step": 1010 + }, + { + "epoch": 41.98, + "eval_loss": 1.4083501100540161, + "eval_runtime": 3.3153, + "eval_samples_per_second": 232.254, + "eval_steps_per_second": 7.541, + "step": 1018 + }, + { + "epoch": 42.06, + "learning_rate": 2.892612731749414e-06, + "loss": 1.3573, + "step": 1020 + }, + { + "epoch": 42.47, + "learning_rate": 2.5856473195658897e-06, + "loss": 1.3615, + "step": 1030 + }, + { + "epoch": 42.89, + "learning_rate": 2.2950084273033634e-06, + "loss": 1.344, + "step": 1040 + }, + { + "epoch": 42.97, + "eval_loss": 1.4079786539077759, + "eval_runtime": 3.3319, + "eval_samples_per_second": 231.096, + "eval_steps_per_second": 7.503, + "step": 1042 + }, + { + "epoch": 43.3, + "learning_rate": 2.020907755104698e-06, + "loss": 1.3352, + "step": 1050 + }, + { + "epoch": 43.71, + "learning_rate": 1.7635449567430185e-06, + "loss": 1.3595, + "step": 1060 + }, + { + "epoch": 44.0, + "eval_loss": 1.4071862697601318, + "eval_runtime": 3.3215, + "eval_samples_per_second": 231.821, + "eval_steps_per_second": 7.527, + "step": 1067 + }, + { + "epoch": 44.12, + "learning_rate": 1.5231074941947781e-06, + "loss": 1.3352, + "step": 1070 + }, + { + "epoch": 44.54, + "learning_rate": 1.2997705010932393e-06, + "loss": 1.3463, + "step": 1080 + }, + { + "epoch": 44.95, + "learning_rate": 1.0936966551618604e-06, + "loss": 1.3566, + "step": 1090 + }, + { + "epoch": 44.99, + "eval_loss": 1.4066252708435059, + "eval_runtime": 3.3208, + "eval_samples_per_second": 231.869, + "eval_steps_per_second": 7.528, + "step": 1091 + }, + { + "epoch": 45.36, + "learning_rate": 9.050360597205515e-07, + "loss": 1.3309, + "step": 1100 + }, + { + "epoch": 45.77, + "learning_rate": 7.339261343510206e-07, + "loss": 1.3394, + "step": 1110 + }, + { + "epoch": 45.98, + "eval_loss": 1.406414270401001, + "eval_runtime": 3.3119, + "eval_samples_per_second": 232.494, + "eval_steps_per_second": 7.549, + "step": 1115 + }, + { + "epoch": 46.19, + "learning_rate": 5.804915148009571e-07, + "loss": 1.3552, + "step": 1120 + }, + { + "epoch": 46.6, + "learning_rate": 4.4484396219986735e-07, + "loss": 1.3409, + "step": 1130 + }, + { + "epoch": 46.97, + "eval_loss": 1.4059563875198364, + "eval_runtime": 3.3174, + "eval_samples_per_second": 232.112, + "eval_steps_per_second": 7.536, + "step": 1139 + }, + { + "epoch": 47.01, + "learning_rate": 3.270822816527325e-07, + "loss": 1.3432, + "step": 1140 + }, + { + "epoch": 47.42, + "learning_rate": 2.272922502707997e-07, + "loss": 1.3395, + "step": 1150 + }, + { + "epoch": 47.84, + "learning_rate": 1.4554655469189439e-07, + "loss": 1.3263, + "step": 1160 + }, + { + "epoch": 48.0, + "eval_loss": 1.4058961868286133, + "eval_runtime": 3.3133, + "eval_samples_per_second": 232.395, + "eval_steps_per_second": 7.545, + "step": 1164 + }, + { + "epoch": 48.25, + "learning_rate": 8.190473813576572e-08, + "loss": 1.3607, + "step": 1170 + }, + { + "epoch": 48.66, + "learning_rate": 3.6413157033077236e-08, + "loss": 1.3466, + "step": 1180 + }, + { + "epoch": 48.99, + "eval_loss": 1.4058654308319092, + "eval_runtime": 3.3208, + "eval_samples_per_second": 231.87, + "eval_steps_per_second": 7.528, + "step": 1188 + }, + { + "epoch": 49.07, + "learning_rate": 9.104947259561126e-09, + "loss": 1.3313, + "step": 1190 + }, + { + "epoch": 49.48, + "learning_rate": 0.0, + "loss": 1.3469, + "step": 1200 + }, + { + "epoch": 49.48, + "eval_loss": 1.4058780670166016, + "eval_runtime": 3.3096, + "eval_samples_per_second": 232.657, + "eval_steps_per_second": 7.554, + "step": 1200 + }, + { + "epoch": 49.48, + "step": 1200, + "total_flos": 7.868691621724815e+17, + "train_loss": 1.4999438818295796, + "train_runtime": 2257.7963, + "train_samples_per_second": 68.164, + "train_steps_per_second": 0.531 + } + ], + "max_steps": 1200, + "num_train_epochs": 50, + "total_flos": 7.868691621724815e+17, + "trial_name": null, + "trial_params": null +} diff --git a/llama2_wenan_qlora_50e/training_args.bin b/llama2_wenan_qlora_50e/training_args.bin new file mode 100644 index 0000000..b5b21b9 --- /dev/null +++ b/llama2_wenan_qlora_50e/training_args.bin @@ -0,0 +1,121 @@ +{ + "output_dir": "test/ailabmodel/my_llama2_wenan_qlora_50e", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": true, + "do_predict": false, + "evaluation_strategy": "epoch", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "learning_rate": 5e-05, + "weight_decay": 0, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 50, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "warmup_ratio": 0.03, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "test/ailabmodel/my_llama2_wenan_qlora_50e/runs/Jan26_12-05-51_hu-ailab-10-101-3-63.atp.cn", + "logging_strategy": "steps", + "logging_first_step": false, + "logging_steps": 10, + "logging_nan_inf_filter": true, + "save_strategy": "no", + "save_steps": 500, + "save_total_limit": 3, + "save_safetensors": false, + "save_on_each_node": false, + "no_cuda": false, + "use_mps_device": false, + "seed": 42, + "data_seed": null, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": false, + "fp16": true, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": [], + "dataloader_drop_last": false, + "eval_steps": 250, + "dataloader_num_workers": 0, + "past_index": -1, + "run_name": "test/ailabmodel/my_llama2_wenan_qlora_50e", + "disable_tqdm": false, + "remove_unused_columns": true, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": null, + "greater_is_better": null, + "ignore_data_skip": false, + "sharded_ddp": [], + "fsdp": [], + "fsdp_min_num_params": 0, + "fsdp_config": { + "fsdp_min_num_params": 0, + "xla": false, + "xla_fsdp_grad_ckpt": false + }, + "fsdp_transformer_layer_cls_to_wrap": null, + "deepspeed": "/data1/cgzhang6/ailab_sdk/src/ailab/atp_finetuner/trainer/nlp/ds_zero2_no_offload.json", + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": false, + "ddp_bucket_cap_mb": null, + "dataloader_pin_memory": true, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": true, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_token": "", + "hub_private_repo": false, + "gradient_checkpointing": false, + "include_inputs_for_metrics": false, + "fp16_backend": "auto", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": "", + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 30000, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "xpu_backend": null, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": 512, + "generation_num_beams": null, + "generation_config": null +} diff --git a/llama2_wenan_qlora_50e/training_eval_loss.png b/llama2_wenan_qlora_50e/training_eval_loss.png new file mode 100644 index 0000000..9107cb5 Binary files /dev/null and b/llama2_wenan_qlora_50e/training_eval_loss.png differ diff --git a/llama2_wenan_qlora_50e/training_loss.png b/llama2_wenan_qlora_50e/training_loss.png new file mode 100644 index 0000000..6250506 Binary files /dev/null and b/llama2_wenan_qlora_50e/training_loss.png differ