Changes for fast tokenizer (#5)

- Add tokenizer.json (dcca07232bfb1028e499333730f868b87fd3d043)
- Update unknown token (8852c40b30c9b7b981faf4fa77167fd862fd5fdb)
- Move <|endoftext|> from added_tokens.json to vocab.json (3e9581879a6134abfb58f5788096027dd1756a63)


Co-authored-by: Jonatan Kłosko <jonatanklosko@users.noreply.huggingface.co>
This commit is contained in:
Arthur Zucker 2023-01-25 10:17:42 +00:00 committed by system
parent bfeb067956
commit a1790163fb
5 changed files with 101345 additions and 3 deletions

View File

@@ -17,7 +17,6 @@
"<|da|>": 50285,
"<|de|>": 50261,
"<|el|>": 50281,
"<|endoftext|>": 50257,
"<|en|>": 50259,
"<|es|>": 50262,
"<|et|>": 50307,

View File

@@ -124,7 +124,7 @@
},
"pad_token": "<|endoftext|>",
"unk_token": {
"content": "",
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,

101342
tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -27,7 +27,7 @@
"tokenizer_class": "WhisperTokenizer",
"unk_token": {
"__type": "AddedToken",
"content": "",
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,

View File

@@ -1,5 +1,6 @@
{
"": 50256,
"<|endoftext|>": 50257,
"!": 0,
"!!": 1432,
"!!!": 4589,