Upload tokenizer (#26)

- Upload tokenizer (33482313ea52a0bc9ee1303ac23d3f2d36a90932)
Author: Arthur Zucker (2023-02-23 17:55:25 +00:00, committed by system)
Parent: 5ff64998d3
Commit: 5c6a0f32a1
6 changed files with 101346 additions and 50264 deletions


@@ -17,7 +17,6 @@
   "<|da|>": 50285,
   "<|de|>": 50261,
   "<|el|>": 50281,
-  "<|endoftext|>": 50257,
   "<|en|>": 50259,
   "<|es|>": 50262,
   "<|et|>": 50307,

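The hunk above drops "<|endoftext|>" (id 50257) from the added-tokens map; the token is instead declared as the pad/unk special token elsewhere in this commit, so lookups still resolve to the same id. A minimal verification sketch in Python, assuming the `transformers` package and the checkpoint id "openai/whisper-large" (taken from the name_or_path entry removed further down in this diff):

    from transformers import WhisperTokenizer

    tok = WhisperTokenizer.from_pretrained("openai/whisper-large")

    # <|endoftext|> is no longer listed as an added token, but it is still a
    # registered special token, so it keeps its historical id.
    print(tok.convert_tokens_to_ids("<|endoftext|>"))  # expected: 50257
    print(tok.eos_token)                                # "<|endoftext|>"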

@@ -1,4 +1,4 @@
-#version: 0.2
+#version: 0.2 - Trained by `huggingface/tokenizers`
 Ġ a
 Ġt h
 i n

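This hunk only extends the version header of the merges file; each subsequent line ("Ġ a", "Ġt h", ...) is one BPE merge rule, listed highest priority first. A sketch of how this file pair is consumed, assuming the `tokenizers` package (file paths illustrative):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # The "#version: ..." line is treated as a header, not a merge rule;
    # every following line is a space-separated pair to merge, in priority order.
    model = BPE.from_file("vocab.json", "merges.txt")
    tokenizer = Tokenizer(model)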

@@ -124,7 +124,7 @@
   },
   "pad_token": "<|endoftext|>",
   "unk_token": {
-    "content": "",
+    "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,

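Here the unk token's content is fixed from an empty string to "<|endoftext|>", so unk and pad now alias the same special token, the usual GPT-2-style convention. A quick check, again assuming `transformers` and the "openai/whisper-large" checkpoint:

    from transformers import WhisperTokenizer

    tok = WhisperTokenizer.from_pretrained("openai/whisper-large")
    assert tok.unk_token == "<|endoftext|>"  # was "" before this commit
    assert tok.pad_token == "<|endoftext|>"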
tokenizer.json (new file, 101342 lines)

File diff suppressed because it is too large.

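The new tokenizer.json bundles the vocabulary, merges, and pre/post-processing into the single serialized format used by the Rust-backed "fast" tokenizer, which is why it is so large. A loading sketch, assuming the `tokenizers` package:

    from tokenizers import Tokenizer

    # Loads the complete pipeline (normalizer, pre-tokenizer, BPE model,
    # post-processor) from the single JSON file.
    fast = Tokenizer.from_file("tokenizer.json")
    enc = fast.encode("hello world")
    print(enc.tokens, enc.ids)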

@@ -19,7 +19,6 @@
   },
   "errors": "replace",
   "model_max_length": 1024,
-  "name_or_path": "openai/whisper-large",
   "pad_token": null,
   "processor_class": "WhisperProcessor",
   "return_attention_mask": false,
@@ -27,7 +26,7 @@
   "tokenizer_class": "WhisperTokenizer",
   "unk_token": {
     "__type": "AddedToken",
-    "content": "",
+    "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,

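The first hunk removes the hard-coded name_or_path (so the config no longer pins the "openai/whisper-large" hub id); the second applies the same empty-string to "<|endoftext|>" unk fix to the slow tokenizer's config. The "__type": "AddedToken" wrapper deserializes to an AddedToken object; a sketch of the equivalent construction, assuming the `tokenizers` package:

    from tokenizers import AddedToken

    # Mirrors the JSON above: content plus stripping/normalization flags.
    unk = AddedToken("<|endoftext|>", lstrip=False, rstrip=False,
                     normalized=True)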
vocab.json (50260 changed lines)

File diff suppressed because one or more lines are too long.
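vocab.json itself is a flat token-to-id JSON map, which is why the diff viewer gives up on its very long lines. A sketch for inspecting it directly (path illustrative):

    import json

    with open("vocab.json", encoding="utf-8") as f:
        vocab = json.load(f)

    print(len(vocab))           # number of base-vocabulary entries
    print(max(vocab.values()))  # highest id in the base vocabulary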