Changes for fast tokenizer (#5)
- Add tokenizer.json (dcca07232bfb1028e499333730f868b87fd3d043) - Update unknown token (8852c40b30c9b7b981faf4fa77167fd862fd5fdb) - Move <|endoftext|> from added_tokens.json to vocab.json (3e9581879a6134abfb58f5788096027dd1756a63) Co-authored-by: Jonatan Kłosko <jonatanklosko@users.noreply.huggingface.co>
This commit is contained in:
parent
bfeb067956
commit
a1790163fb
|
@ -17,7 +17,6 @@
|
||||||
"<|da|>": 50285,
|
"<|da|>": 50285,
|
||||||
"<|de|>": 50261,
|
"<|de|>": 50261,
|
||||||
"<|el|>": 50281,
|
"<|el|>": 50281,
|
||||||
"<|endoftext|>": 50257,
|
|
||||||
"<|en|>": 50259,
|
"<|en|>": 50259,
|
||||||
"<|es|>": 50262,
|
"<|es|>": 50262,
|
||||||
"<|et|>": 50307,
|
"<|et|>": 50307,
|
||||||
|
|
|
@ -124,7 +124,7 @@
|
||||||
},
|
},
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
"unk_token": {
|
"unk_token": {
|
||||||
"content": "",
|
"content": "<|endoftext|>",
|
||||||
"lstrip": false,
|
"lstrip": false,
|
||||||
"normalized": true,
|
"normalized": true,
|
||||||
"rstrip": false,
|
"rstrip": false,
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -27,7 +27,7 @@
|
||||||
"tokenizer_class": "WhisperTokenizer",
|
"tokenizer_class": "WhisperTokenizer",
|
||||||
"unk_token": {
|
"unk_token": {
|
||||||
"__type": "AddedToken",
|
"__type": "AddedToken",
|
||||||
"content": "",
|
"content": "<|endoftext|>",
|
||||||
"lstrip": false,
|
"lstrip": false,
|
||||||
"normalized": true,
|
"normalized": true,
|
||||||
"rstrip": false,
|
"rstrip": false,
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
{
|
{
|
||||||
"": 50256,
|
"": 50256,
|
||||||
|
"<|endoftext|>": 50257,
|
||||||
"!": 0,
|
"!": 0,
|
||||||
"!!": 1432,
|
"!!": 1432,
|
||||||
"!!!": 4589,
|
"!!!": 4589,
|
||||||
|
|
Loading…
Reference in New Issue