Compare commits
10 Commits
3e1c92ca77
...
a0e5677e86
Author | SHA1 | Date |
---|---|---|
|
a0e5677e86 | |
|
51568a6e0a | |
|
5e755b1c9d | |
|
6f231487a5 | |
|
88f8889ee0 | |
|
0b8087bb43 | |
|
b41a392439 | |
|
1172dffaf8 | |
|
df3bd66031 | |
|
9b4ecbcecd |
|
@ -15,3 +15,4 @@
|
|||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
||||
rust_model.ot filter=lfs diff=lfs merge=lfs -text
|
||||
|
|
26
README.md
26
README.md
|
@ -4,11 +4,10 @@ language:
|
|||
tags:
|
||||
- text generation
|
||||
- pytorch
|
||||
- the Pile
|
||||
- causal-lm
|
||||
license: apache-2.0
|
||||
license: mit
|
||||
datasets:
|
||||
- the Pile
|
||||
- the_pile
|
||||
---
|
||||
|
||||
# GPT-Neo 2.7B
|
||||
|
@ -23,7 +22,7 @@ GPT-Neo 2.7B was trained on the Pile, a large scale curated dataset created by E
|
|||
|
||||
## Training procedure
|
||||
|
||||
This model was trained for 400,000 steps on the Pile. It was trained as a masked autoregressive language model, using cross-entropy loss.
|
||||
This model was trained for 420 billion tokens over 400,000 steps. It was trained as a masked autoregressive language model, using cross-entropy loss.
|
||||
|
||||
## Intended Use and Limitations
|
||||
|
||||
|
@ -77,7 +76,26 @@ TBD
|
|||
|
||||
### BibTeX entry and citation info
|
||||
|
||||
To cite this model, use
|
||||
```bibtex
|
||||
@software{gpt-neo,
|
||||
author = {Black, Sid and
|
||||
Gao, Leo and
|
||||
Wang, Phil and
|
||||
Leahy, Connor and
|
||||
Biderman, Stella},
|
||||
title = {{GPT-Neo: Large Scale Autoregressive Language
|
||||
Modeling with Mesh-Tensorflow}},
|
||||
month = mar,
|
||||
year = 2021,
|
||||
note = {{If you use this software, please cite it using
|
||||
these metadata.}},
|
||||
publisher = {Zenodo},
|
||||
version = {1.0},
|
||||
doi = {10.5281/zenodo.5297715},
|
||||
url = {https://doi.org/10.5281/zenodo.5297715}
|
||||
}
|
||||
|
||||
@article{gao2020pile,
|
||||
title={The Pile: An 800GB Dataset of Diverse Text for Language Modeling},
|
||||
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and others},
|
||||
|
|
16
config.json
16
config.json
|
@ -65,16 +65,16 @@
|
|||
"summary_proj_to_labels": true,
|
||||
"summary_type": "cls_index",
|
||||
"summary_use_proj": true,
|
||||
"transformers_version": "4.5.0.dev0",
|
||||
"use_cache": true,
|
||||
"vocab_size": 50257,
|
||||
"window_size": 256,
|
||||
"tokenizer_class": "GPT2Tokenizer",
|
||||
"task_specific_params": {
|
||||
"text-generation": {
|
||||
"do_sample": true,
|
||||
"temperature": 0.9,
|
||||
"max_length": 50
|
||||
"max_length": 50,
|
||||
"temperature": 0.9
|
||||
}
|
||||
}
|
||||
},
|
||||
"tokenizer_class": "GPT2Tokenizer",
|
||||
"transformers_version": "4.9.0.dev0",
|
||||
"use_cache": true,
|
||||
"vocab_size": 50257,
|
||||
"window_size": 256
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue