Upload processor

This commit is contained in:
Arthur Zucker 2022-09-26 06:50:41 +00:00 committed by huggingface-web
parent 3b5167921f
commit 25da2ff16c
6 changed files with 116779 additions and 0 deletions

108
added_tokens.json Normal file
View File

@ -0,0 +1,108 @@
{
"<|af|>": 50326,
"<|am|>": 50333,
"<|ar|>": 50271,
"<|as|>": 50349,
"<|az|>": 50303,
"<|ba|>": 50354,
"<|be|>": 50329,
"<|bg|>": 50291,
"<|bn|>": 50301,
"<|bo|>": 50346,
"<|br|>": 50308,
"<|bs|>": 50314,
"<|ca|>": 50269,
"<|cs|>": 50282,
"<|cy|>": 50296,
"<|da|>": 50284,
"<|de|>": 50260,
"<|el|>": 50280,
"<|en|>": 50258,
"<|es|>": 50261,
"<|et|>": 50306,
"<|eu|>": 50309,
"<|fa|>": 50299,
"<|fi|>": 50276,
"<|fo|>": 50337,
"<|fr|>": 50264,
"<|gl|>": 50318,
"<|gu|>": 50332,
"<|haw|>": 50351,
"<|ha|>": 50353,
"<|hi|>": 50275,
"<|hr|>": 50290,
"<|ht|>": 50338,
"<|hu|>": 50285,
"<|hy|>": 50311,
"<|id|>": 50274,
"<|is|>": 50310,
"<|it|>": 50273,
"<|iw|>": 50278,
"<|ja|>": 50265,
"<|jw|>": 50355,
"<|ka|>": 50328,
"<|kk|>": 50315,
"<|km|>": 50322,
"<|kn|>": 50305,
"<|ko|>": 50263,
"<|la|>": 50293,
"<|lb|>": 50344,
"<|ln|>": 50352,
"<|lo|>": 50335,
"<|lt|>": 50292,
"<|lv|>": 50300,
"<|mg|>": 50348,
"<|mi|>": 50294,
"<|mk|>": 50307,
"<|ml|>": 50295,
"<|mn|>": 50313,
"<|mr|>": 50319,
"<|ms|>": 50281,
"<|mt|>": 50342,
"<|my|>": 50345,
"<|ne|>": 50312,
"<|nl|>": 50270,
"<|nn|>": 50341,
"<|nocaptions|>": 50361,
"<|notimestamps|>": 50362,
"<|no|>": 50287,
"<|oc|>": 50327,
"<|pa|>": 50320,
"<|pl|>": 50268,
"<|ps|>": 50339,
"<|pt|>": 50266,
"<|ro|>": 50283,
"<|ru|>": 50262,
"<|sa|>": 50343,
"<|sd|>": 50331,
"<|si|>": 50321,
"<|sk|>": 50297,
"<|sl|>": 50304,
"<|sn|>": 50323,
"<|so|>": 50325,
"<|sq|>": 50316,
"<|sr|>": 50302,
"<|startoflm|>": 50359,
"<|startofprev|>": 50360,
"<|startoftranscript|>": 50257,
"<|su|>": 50356,
"<|sv|>": 50272,
"<|sw|>": 50317,
"<|ta|>": 50286,
"<|te|>": 50298,
"<|tg|>": 50330,
"<|th|>": 50288,
"<|tk|>": 50340,
"<|tl|>": 50347,
"<|transcribe|>": 50358,
"<|translate|>": 50357,
"<|tr|>": 50267,
"<|tt|>": 50350,
"<|uk|>": 50279,
"<|ur|>": 50289,
"<|uz|>": 50336,
"<|vi|>": 50277,
"<|yi|>": 50334,
"<|yo|>": 50324,
"<|zh|>": 50259
}

50001
merges.txt Normal file

File diff suppressed because it is too large Load Diff

16257
preprocessor_config.json Normal file

File diff suppressed because it is too large Load Diff

119
special_tokens_map.json Normal file
View File

@ -0,0 +1,119 @@
{
"additional_special_tokens": [
"<|startoftranscript|>",
"<|en|>",
"<|zh|>",
"<|de|>",
"<|es|>",
"<|ru|>",
"<|ko|>",
"<|fr|>",
"<|ja|>",
"<|pt|>",
"<|tr|>",
"<|pl|>",
"<|ca|>",
"<|nl|>",
"<|ar|>",
"<|sv|>",
"<|it|>",
"<|id|>",
"<|hi|>",
"<|fi|>",
"<|vi|>",
"<|iw|>",
"<|uk|>",
"<|el|>",
"<|ms|>",
"<|cs|>",
"<|ro|>",
"<|da|>",
"<|hu|>",
"<|ta|>",
"<|no|>",
"<|th|>",
"<|ur|>",
"<|hr|>",
"<|bg|>",
"<|lt|>",
"<|la|>",
"<|mi|>",
"<|ml|>",
"<|cy|>",
"<|sk|>",
"<|te|>",
"<|fa|>",
"<|lv|>",
"<|bn|>",
"<|sr|>",
"<|az|>",
"<|sl|>",
"<|kn|>",
"<|et|>",
"<|mk|>",
"<|br|>",
"<|eu|>",
"<|is|>",
"<|hy|>",
"<|ne|>",
"<|mn|>",
"<|bs|>",
"<|kk|>",
"<|sq|>",
"<|sw|>",
"<|gl|>",
"<|mr|>",
"<|pa|>",
"<|si|>",
"<|km|>",
"<|sn|>",
"<|yo|>",
"<|so|>",
"<|af|>",
"<|oc|>",
"<|ka|>",
"<|be|>",
"<|tg|>",
"<|sd|>",
"<|gu|>",
"<|am|>",
"<|yi|>",
"<|lo|>",
"<|uz|>",
"<|fo|>",
"<|ht|>",
"<|ps|>",
"<|tk|>",
"<|nn|>",
"<|mt|>",
"<|sa|>",
"<|lb|>",
"<|my|>",
"<|bo|>",
"<|tl|>",
"<|mg|>",
"<|as|>",
"<|tt|>",
"<|haw|>",
"<|ln|>",
"<|ha|>",
"<|ba|>",
"<|jw|>",
"<|su|>",
"<|translate|>",
"<|transcribe|>",
"<|startoflm|>",
"<|startofprev|>",
"<|nocaptions|>",
"<|notimestamps|>"
],
"bos_token": "<|endoftext|>",
"eos_token": "<|endoftext|>",
"unk_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}

35
tokenizer_config.json Normal file
View File

@ -0,0 +1,35 @@
{
"add_bos_token": false,
"add_prefix_space": false,
"bos_token": {
"__type": "AddedToken",
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"eos_token": {
"__type": "AddedToken",
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"errors": "replace",
"model_max_length": 1024,
"name_or_path": "ArthurZ/whisper-small.en",
"pad_token": null,
"processor_class": "WhisperProcessor",
"special_tokens_map_file": null,
"tokenizer_class": "WhisperTokenizer",
"unk_token": {
"__type": "AddedToken",
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}

50259
vocab.json Normal file

File diff suppressed because it is too large Load Diff