Changes for fast tokenizer (#5)

- Add tokenizer.json (dcca07232bfb1028e499333730f868b87fd3d043)
- Update unknown token (8852c40b30c9b7b981faf4fa77167fd862fd5fdb)
- Move <|endoftext|> from added_tokens.json to vocab.json (3e9581879a6134abfb58f5788096027dd1756a63)

Co-authored-by: Jonatan Kłosko <jonatanklosko@users.noreply.huggingface.co>
2023-01-25 10:17:42 +00:00 · 2023-01-25 10:17:42 +00:00 · a1790163fb
parent bfeb067956
commit a1790163fb
5 changed files with 101345 additions and 3 deletions
--- a/added_tokens.json
+++ b/added_tokens.json
@@ -17,7 +17,6 @@
  "<|da|>": 50285,
  "<|de|>": 50261,
  "<|el|>": 50281,
-  "<|endoftext|>": 50257,
  "<|en|>": 50259,
  "<|es|>": 50262,
  "<|et|>": 50307,
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -124,7 +124,7 @@
  },
  "pad_token": "<|endoftext|>",
  "unk_token": {
-    "content": "",
+    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -27,7 +27,7 @@
  "tokenizer_class": "WhisperTokenizer",
  "unk_token": {
    "__type": "AddedToken",
-    "content": "",
+    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
--- a/vocab.json
+++ b/vocab.json
@@ -1,5 +1,6 @@
 {
  "": 50256,
+  "<|endoftext|>": 50257,
  "!": 0,
  "!!": 1432,
  "!!!": 4589,