diff --git a/README.md b/README.md
new file mode 100644
index 0000000..97ea6d5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,88 @@
+---
+language: ja
+thumbnail: https://github.com/rinnakk/japanese-pretrained-models/blob/master/rinna.png
+tags:
+- ja
+- japanese
+- gpt
+- text-generation
+- lm
+- nlp
+license: mit
+datasets:
+- c4
+- cc100
+- wikipedia
+widget:
+- text: "西田幾多郎は、"
+---
+
+# japanese-gpt-1b
+
+![rinna-icon](./rinna.png)
+
+This repository provides a 1.3B-parameter Japanese GPT model. The model was trained by [rinna Co., Ltd.](https://corp.rinna.co.jp/)
+
+# How to use the model
+
+*NOTE:* Use `T5Tokenizer` to initialize the tokenizer.
+
+~~~~
+import torch
+from transformers import T5Tokenizer, AutoModelForCausalLM
+
+tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt-1b")
+model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt-1b")
+
+if torch.cuda.is_available():
+    model = model.to("cuda")
+
+text = "西田幾多郎は、"
+token_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")
+
+with torch.no_grad():
+    output_ids = model.generate(
+        token_ids.to(model.device),
+        max_length=100,
+        min_length=100,
+        do_sample=True,
+        top_k=500,
+        top_p=0.95,
+        pad_token_id=tokenizer.pad_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        bad_words_ids=[[tokenizer.unk_token_id]]
+    )
+
+output = tokenizer.decode(output_ids.tolist()[0])
+print(output)
+~~~~
+
+# Model architecture
+A 24-layer, 2048-hidden-size transformer-based language model.
+
+# Training
+The model was trained on [Japanese C4](https://huggingface.co/datasets/allenai/c4), [Japanese CC-100](http://data.statmt.org/cc-100/ja.txt.xz), and [Japanese Wikipedia](https://dumps.wikimedia.org/other/cirrussearch) to optimize a traditional language modelling objective. It reaches around 14 perplexity on a held-out validation set drawn from the same data.
+
+# Tokenization
+The model uses a [sentencepiece](https://github.com/google/sentencepiece)-based tokenizer. The vocabulary was first trained on a selected subset of the training data using the official sentencepiece training script, and then augmented with emojis and symbols.
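+
+For illustration, a minimal sketch of inspecting the tokenizer's subword segmentation (the sample sentence is arbitrary; `tokenize`, `convert_tokens_to_ids`, and `decode` are standard `transformers` tokenizer methods):
+
+~~~~
+from transformers import T5Tokenizer
+
+tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt-1b")
+
+# Split a sample sentence into sentencepiece subword units.
+tokens = tokenizer.tokenize("西田幾多郎は、")
+print(tokens)
+
+# Map the subwords to vocabulary ids and back to text.
+ids = tokenizer.convert_tokens_to_ids(tokens)
+print(ids)
+print(tokenizer.decode(ids))
+~~~~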
+
+# License
+[The MIT license](https://opensource.org/licenses/MIT)
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..1c57d3d
--- /dev/null
+++ b/config.json
@@ -0,0 +1,26 @@
+{
+  "activation_function": "gelu_fast",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 2,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 3,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 2048,
+  "n_head": 16,
+  "n_inner": 8192,
+  "n_layer": 24,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "use_cache": true,
+  "vocab_size": 44928
+}
diff --git a/pytorch_model.bin b/pytorch_model.bin
new file mode 100644
index 0000000..b0dd7a5
--- /dev/null
+++ b/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28a4d618d4665790bc0fd941326f8fbd27fa1f5eebbb406c4000dda34653fcab
+size 2655859801
diff --git a/rinna.png b/rinna.png
new file mode 100755
index 0000000..d0f007e
Binary files /dev/null and b/rinna.png differ
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000..6d5b058
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1 @@
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
\ No newline at end of file
diff --git a/spiece.model b/spiece.model
new file mode 100644
index 0000000..0281b09
--- /dev/null
+++ b/spiece.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9dbbd4ddbe43941051ed35fd44ff0d9d1c00ed345f7fd4d1969df174110f0609
+size 1044749
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..f16e5fc
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1 @@
+{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "[PAD]", "extra_ids": 0, "additional_special_tokens": [], "sp_model_kwargs": {}, "bos_token": "<s>", "cls_token": "[CLS]", "sep_token": "[SEP]", "mask_token": "[MASK]", "do_lower_case": false, "tokenizer_class": "T5Tokenizer"}
\ No newline at end of file
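
As a quick cross-check of the `config.json` committed above, a minimal sketch, assuming the `rinna/japanese-gpt-1b` model id from the README resolves on the Hugging Face Hub, that loads the shipped hyperparameters and prints the numbers behind the "24-layer, 2048-hidden-size" description:

~~~~
from transformers import AutoConfig

# Load the hyperparameters from the config.json shown in the diff above.
config = AutoConfig.from_pretrained("rinna/japanese-gpt-1b")

# Expected output: 24 2048 16 44928, matching n_layer, n_embd,
# n_head, and vocab_size in the committed config.
print(config.n_layer, config.n_embd, config.n_head, config.vocab_size)
~~~~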