diff --git a/added_tokens.json b/added_tokens.json index 836c38f..47e9dd3 100644 --- a/added_tokens.json +++ b/added_tokens.json @@ -1,108 +1,109 @@ { - "<|af|>": 50326, - "<|am|>": 50333, - "<|ar|>": 50271, - "<|as|>": 50349, - "<|az|>": 50303, - "<|ba|>": 50354, - "<|be|>": 50329, - "<|bg|>": 50291, - "<|bn|>": 50301, - "<|bo|>": 50346, - "<|br|>": 50308, - "<|bs|>": 50314, - "<|ca|>": 50269, - "<|cs|>": 50282, - "<|cy|>": 50296, - "<|da|>": 50284, - "<|de|>": 50260, - "<|el|>": 50280, - "<|en|>": 50258, - "<|es|>": 50261, - "<|et|>": 50306, - "<|eu|>": 50309, - "<|fa|>": 50299, - "<|fi|>": 50276, - "<|fo|>": 50337, - "<|fr|>": 50264, - "<|gl|>": 50318, - "<|gu|>": 50332, - "<|haw|>": 50351, - "<|ha|>": 50353, - "<|hi|>": 50275, - "<|hr|>": 50290, - "<|ht|>": 50338, - "<|hu|>": 50285, - "<|hy|>": 50311, - "<|id|>": 50274, - "<|is|>": 50310, - "<|it|>": 50273, - "<|iw|>": 50278, - "<|ja|>": 50265, - "<|jw|>": 50355, - "<|ka|>": 50328, - "<|kk|>": 50315, - "<|km|>": 50322, - "<|kn|>": 50305, - "<|ko|>": 50263, - "<|la|>": 50293, - "<|lb|>": 50344, - "<|ln|>": 50352, - "<|lo|>": 50335, - "<|lt|>": 50292, - "<|lv|>": 50300, - "<|mg|>": 50348, - "<|mi|>": 50294, - "<|mk|>": 50307, - "<|ml|>": 50295, - "<|mn|>": 50313, - "<|mr|>": 50319, - "<|ms|>": 50281, - "<|mt|>": 50342, - "<|my|>": 50345, - "<|ne|>": 50312, - "<|nl|>": 50270, - "<|nn|>": 50341, - "<|nocaptions|>": 50361, - "<|notimestamps|>": 50362, - "<|no|>": 50287, - "<|oc|>": 50327, - "<|pa|>": 50320, - "<|pl|>": 50268, - "<|ps|>": 50339, - "<|pt|>": 50266, - "<|ro|>": 50283, - "<|ru|>": 50262, - "<|sa|>": 50343, - "<|sd|>": 50331, - "<|si|>": 50321, - "<|sk|>": 50297, - "<|sl|>": 50304, - "<|sn|>": 50323, - "<|so|>": 50325, - "<|sq|>": 50316, - "<|sr|>": 50302, - "<|startoflm|>": 50359, - "<|startofprev|>": 50360, - "<|startoftranscript|>": 50257, - "<|su|>": 50356, - "<|sv|>": 50272, - "<|sw|>": 50317, - "<|ta|>": 50286, - "<|te|>": 50298, - "<|tg|>": 50330, - "<|th|>": 50288, - "<|tk|>": 50340, - "<|tl|>": 50347, - "<|transcribe|>": 50358, - "<|translate|>": 50357, - "<|tr|>": 50267, - "<|tt|>": 50350, - "<|uk|>": 50279, - "<|ur|>": 50289, - "<|uz|>": 50336, - "<|vi|>": 50277, - "<|yi|>": 50334, - "<|yo|>": 50324, - "<|zh|>": 50259 + "<|af|>": 50327, + "<|am|>": 50334, + "<|ar|>": 50272, + "<|as|>": 50350, + "<|az|>": 50304, + "<|ba|>": 50355, + "<|be|>": 50330, + "<|bg|>": 50292, + "<|bn|>": 50302, + "<|bo|>": 50347, + "<|br|>": 50309, + "<|bs|>": 50315, + "<|ca|>": 50270, + "<|cs|>": 50283, + "<|cy|>": 50297, + "<|da|>": 50285, + "<|de|>": 50261, + "<|el|>": 50281, + "<|endoftext|>": 50257, + "<|en|>": 50259, + "<|es|>": 50262, + "<|et|>": 50307, + "<|eu|>": 50310, + "<|fa|>": 50300, + "<|fi|>": 50277, + "<|fo|>": 50338, + "<|fr|>": 50265, + "<|gl|>": 50319, + "<|gu|>": 50333, + "<|haw|>": 50352, + "<|ha|>": 50354, + "<|hi|>": 50276, + "<|hr|>": 50291, + "<|ht|>": 50339, + "<|hu|>": 50286, + "<|hy|>": 50312, + "<|id|>": 50275, + "<|is|>": 50311, + "<|it|>": 50274, + "<|iw|>": 50279, + "<|ja|>": 50266, + "<|jw|>": 50356, + "<|ka|>": 50329, + "<|kk|>": 50316, + "<|km|>": 50323, + "<|kn|>": 50306, + "<|ko|>": 50264, + "<|la|>": 50294, + "<|lb|>": 50345, + "<|ln|>": 50353, + "<|lo|>": 50336, + "<|lt|>": 50293, + "<|lv|>": 50301, + "<|mg|>": 50349, + "<|mi|>": 50295, + "<|mk|>": 50308, + "<|ml|>": 50296, + "<|mn|>": 50314, + "<|mr|>": 50320, + "<|ms|>": 50282, + "<|mt|>": 50343, + "<|my|>": 50346, + "<|ne|>": 50313, + "<|nl|>": 50271, + "<|nn|>": 50342, + "<|nocaptions|>": 50362, + "<|notimestamps|>": 50363, + "<|no|>": 50288, + "<|oc|>": 50328, + "<|pa|>": 50321, + "<|pl|>": 50269, + "<|ps|>": 50340, + "<|pt|>": 50267, + "<|ro|>": 50284, + "<|ru|>": 50263, + "<|sa|>": 50344, + "<|sd|>": 50332, + "<|si|>": 50322, + "<|sk|>": 50298, + "<|sl|>": 50305, + "<|sn|>": 50324, + "<|so|>": 50326, + "<|sq|>": 50317, + "<|sr|>": 50303, + "<|startoflm|>": 50360, + "<|startofprev|>": 50361, + "<|startoftranscript|>": 50258, + "<|su|>": 50357, + "<|sv|>": 50273, + "<|sw|>": 50318, + "<|ta|>": 50287, + "<|te|>": 50299, + "<|tg|>": 50331, + "<|th|>": 50289, + "<|tk|>": 50341, + "<|tl|>": 50348, + "<|transcribe|>": 50359, + "<|translate|>": 50358, + "<|tr|>": 50268, + "<|tt|>": 50351, + "<|uk|>": 50280, + "<|ur|>": 50290, + "<|uz|>": 50337, + "<|vi|>": 50278, + "<|yi|>": 50335, + "<|yo|>": 50325, + "<|zh|>": 50260 } diff --git a/special_tokens_map.json b/special_tokens_map.json index 479a05b..6d75099 100644 --- a/special_tokens_map.json +++ b/special_tokens_map.json @@ -1,5 +1,6 @@ { "additional_special_tokens": [ + "<|endoftext|>", "<|startoftranscript|>", "<|en|>", "<|zh|>", @@ -107,13 +108,25 @@ "<|nocaptions|>", "<|notimestamps|>" ], - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "unk_token": { + "bos_token": { "content": "<|endoftext|>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false } } diff --git a/tokenizer_config.json b/tokenizer_config.json index 8c171b1..2c84952 100644 --- a/tokenizer_config.json +++ b/tokenizer_config.json @@ -19,14 +19,13 @@ }, "errors": "replace", "model_max_length": 1024, - "name_or_path": "openai/whisper-tiny", + "name_or_path": "whisper-multi/test_added_eot", "pad_token": null, - "processor_class": "WhisperProcessor", "special_tokens_map_file": null, "tokenizer_class": "WhisperTokenizer", "unk_token": { "__type": "AddedToken", - "content": "<|endoftext|>", + "content": "", "lstrip": false, "normalized": true, "rstrip": false,