Changes for fast tokenizer (#5)
- Add tokenizer.json (dcca07232bfb1028e499333730f868b87fd3d043)
- Update unknown token (8852c40b30c9b7b981faf4fa77167fd862fd5fdb)
- Move <|endoftext|> from added_tokens.json to vocab.json (3e9581879a6134abfb58f5788096027dd1756a63)

Co-authored-by: Jonatan Kłosko <jonatanklosko@users.noreply.huggingface.co>
This commit is contained in:
parent
bfeb067956
commit
a1790163fb
|
@@ -17,7 +17,6 @@
|
|||
"<|da|>": 50285,
|
||||
"<|de|>": 50261,
|
||||
"<|el|>": 50281,
|
||||
-"<|endoftext|>": 50257,
|
||||
"<|en|>": 50259,
|
||||
"<|es|>": 50262,
|
||||
"<|et|>": 50307,
|
||||
|
|
|
@@ -124,7 +124,7 @@
|
|||
},
|
||||
"pad_token": "<|endoftext|>",
|
||||
"unk_token": {
|
||||
-"content": "",
|
||||
+"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@@ -27,7 +27,7 @@
|
|||
"tokenizer_class": "WhisperTokenizer",
|
||||
"unk_token": {
|
||||
"__type": "AddedToken",
|
||||
-"content": "",
|
||||
+"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
|
|
|
@@ -1,5 +1,6 @@
|
|||
{
|
||||
"": 50256,
|
||||
+"<|endoftext|>": 50257,
|
||||
"!": 0,
|
||||
"!!": 1432,
|
||||
"!!!": 4589,
|
||||
|
|
Loading…
Reference in New Issue