diff --git a/tokenizer.json b/tokenizer.json index b063159..760bdbc 100644 --- a/tokenizer.json +++ b/tokenizer.json @@ -1,152 +1,1737 @@ { - "version": "1.0", - "truncation": null, - "padding": null, - "added_tokens": [ - { - "id": 0, - "special": true, - "content": "<|startoftext|>", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": true + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "special": true, + "content": "<|startoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true + }, + { + "id": 1, + "special": true, + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "NFC" + }, + { + "type": "Replace", + "pattern": { + "Regex": "\\s+" }, - { - "id": 1, - "special": true, - "content": "<|endoftext|>", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false - } - ], - "normalizer": null, - "pre_tokenizer": { + "content": " " + }, + { + "type": "Lowercase" + } + ] + }, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+" + }, + "behavior": "Removed", + "invert": true + }, + { "type": "ByteLevel", "add_prefix_space": false, "trim_offsets": true + } + ] + }, + "post_processor": { + "type": "RobertaProcessing", + "sep": [ + "<|endoftext|>", + 1 + ], + "cls": [ + "<|startoftext|>", + 0 + ], + "trim_offsets": false, + "add_prefix_space": false + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "<|endoftext|>", + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "vocab": { + "<|startoftext|>": 0, + "<|endoftext|>": 1, + "!": 2, + "\"": 3, + "#": 4, + "$": 5, + "%": 6, + "&": 7, + "'": 8, + "(": 9, + ")": 10, + "*": 11, + "+": 12, + ",": 13, + "-": 14, + ".": 15, + "/": 16, + "0": 17, + "1": 18, + "2": 19, + "3": 20, + "4": 21, + "5": 22, + "6": 23, + "7": 24, + "8": 25, + "9": 26, + ":": 27, + ";": 28, + "<": 29, + "=": 30, + ">": 31, + "?": 32, + "@": 33, + "A": 34, + "B": 35, + "C": 36, + "D": 37, + "E": 38, + "F": 39, + "G": 40, + "H": 41, + "I": 42, + "J": 43, + "K": 44, + "L": 45, + "M": 46, + "N": 47, + "O": 48, + "P": 49, + "Q": 50, + "R": 51, + "S": 52, + "T": 53, + "U": 54, + "V": 55, + "W": 56, + "X": 57, + "Y": 58, + "Z": 59, + "[": 60, + "\\": 61, + "]": 62, + "^": 63, + "_": 64, + "`": 65, + "a": 66, + "b": 67, + "c": 68, + "d": 69, + "e": 70, + "f": 71, + "g": 72, + "h": 73, + "i": 74, + "j": 75, + "k": 76, + "l": 77, + "m": 78, + "n": 79, + "o": 80, + "p": 81, + "q": 82, + "r": 83, + "s": 84, + "t": 85, + "u": 86, + "v": 87, + "w": 88, + "x": 89, + "y": 90, + "z": 91, + "|": 92, + "}": 93, + "~": 94, + "¡": 95, + "¢": 96, + "£": 97, + "¤": 98, + "¥": 99, + "¦": 100, + "§": 101, + "¨": 102, + "©": 103, + "ª": 104, + "«": 105, + "¬": 106, + "®": 107, + "¯": 108, + "°": 109, + "±": 110, + "²": 111, + "³": 112, + "´": 113, + "µ": 114, + "¶": 115, + "·": 116, + "¸": 117, + "¹": 118, + "º": 119, + "»": 120, + "¼": 121, + "½": 122, + "¾": 123, + "¿": 124, + "Â": 125, + "Ã": 126, + "Ä": 127, + "Å": 128, + "Æ": 129, + "Ç": 130, + "È": 131, + "É": 132, + "Ê": 133, + "Ë": 134, + "Ì": 135, + "Í": 136, + "Î": 137, + "Ï": 138, + "Ð": 139, + "Ñ": 140, + "Ö": 141, + "×": 142, + "Ø": 143, + "Ù": 144, + "Ü": 145, + "à": 146, + "á": 147, + "â": 148, + "ã": 149, + "ä": 150, + "å": 151, + "æ": 152, + "ç": 153, + "è": 154, + "é": 155, + "ë": 156, + "ì": 157, + "ï": 158, + "Ċ": 159, + "Ġ": 160, + "Ģ": 161, + "ģ": 162, + "Ĥ": 163, + "ĥ": 164, + "Ħ": 165, + "ħ": 166, + "Ĩ": 167, + "ĩ": 168, + "Ī": 169, + "ī": 170, + "Ĭ": 171, + "ĭ": 172, + "Į": 173, + "į": 174, + "İ": 175, + "ı": 176, + "IJ": 177, + "ij": 178, + "Ĵ": 179, + "ĵ": 180, + "Ķ": 181, + "ķ": 182, + "ĸ": 183, + "Ĺ": 184, + "ĺ": 185, + "Ļ": 186, + "ļ": 187, + "Ľ": 188, + "ľ": 189, + "Ŀ": 190, + "ŀ": 191, + "Ł": 192, + "ł": 193, + "Ń": 194, + "e": 195, + "d": 196, + "a": 197, + "o": 198, + "n": 199, + "±": 200, + "l": 201, + "m": 202, + "h": 203, + "r": 204, + "i": 205, + "s": 206, + "Z": 207, + "t": 208, + "f": 209, + "k": 210, + "y": 211, + "b": 212, + "F": 213, + "g": 214, + "7": 215, + "0": 216, + "p": 217, + "L": 218, + "H": 219, + "¡": 220, + "Ī": 221, + "1": 222, + "Ģ": 223, + "c": 224, + "ĩ": 225, + "6": 226, + "A": 227, + "z": 228, + "u": 229, + "S": 230, + "2": 231, + "v": 232, + "4": 233, + "M": 234, + "T": 235, + "8": 236, + "I": 237, + "N": 238, + "C": 239, + "5": 240, + "¹": 241, + "9": 242, + "3": 243, + "ī": 244, + "P": 245, + "E": 246, + "»": 247, + "V": 248, + "İ": 249, + "w": 250, + "J": 251, + "ł": 252, + ".": 253, + "K": 254, + "D": 255, + "Ķ": 256, + "¸": 257, + "B": 258, + "©": 259, + "º": 260, + "µ": 261, + "Ĥ": 262, + "X": 263, + "R": 264, + "O": 265, + "«": 266, + "Ļ": 267, + "U": 268, + "x": 269, + "[": 270, + "¿": 271, + "³": 272, + "ģ": 273, + "W": 274, + "§": 275, + "-": 276, + "ĸ": 277, + "Ħ": 278, + ",": 279, + "q": 280, + "ħ": 281, + "¨": 282, + "G": 283, + "²": 284, + "ĺ": 285, + "ª": 286, + "¯": 287, + "j": 288, + "]": 289, + "ļ": 290, + "Ŀ": 291, + "¤": 292, + "ŀ": 293, + "½": 294, + "IJ": 295, + "'": 296, + "Ń": 297, + "°": 298, + "ľ": 299, + ">": 300, + "¶": 301, + "į": 302, + "¦": 303, + "|": 304, + "¼": 305, + "¢": 306, + "´": 307, + "Ĩ": 308, + "Q": 309, + "Y": 310, + "Ľ": 311, + "ĵ": 312, + "ij": 313, + "ķ": 314, + "Ĭ": 315, + "¾": 316, + ";": 317, + "(": 318, + "¬": 319, + "@": 320, + "ĭ": 321, + "Ĺ": 322, + "£": 323, + "Į": 324, + "#": 325, + "·": 326, + "*": 327, + "Ĵ": 328, + "®": 329, + ")": 330, + "^": 331, + "ı": 332, + "Ġ": 333, + "_": 334, + "Ł": 335, + "}": 336, + "ĥ": 337, + "\\": 338, + "¥": 339, + "<": 340, + "+": 341, + "=": 342, + "~": 343, + "\"": 344, + "!": 345, + "?": 346, + "`": 347, + "$": 348, + "Ċ": 349, + "/": 350, + "%": 351, + "&": 352, + ":": 353, + "Ġt": 354, + "Ġth": 355, + "Ġa": 356, + "Ġthe": 357, + "in": 358, + "Ġo": 359, + "Ġ,": 360, + "Ġs": 361, + "ed": 362, + "Ġw": 363, + "er": 364, + "Ġ.": 365, + "Ġi": 366, + "re": 367, + "Ġc": 368, + "nd": 369, + "Ġf": 370, + "Ġb": 371, + "at": 372, + "Ġof": 373, + "er": 374, + "en": 375, + "ar": 376, + "or": 377, + "it": 378, + "Ġp": 379, + "Ġh": 380, + "Ġand": 381, + "on": 382, + "ing": 383, + "an": 384, + "ro": 385, + "Ġm": 386, + "Ġd": 387, + "es": 388, + "Ġin": 389, + "on": 390, + "Ġto": 391, + "ou": 392, + "is": 393, + "Ġa": 394, + "ic": 395, + "ĠT": 396, + "al": 397, + "Ġl": 398, + "Ġ=": 399, + "Ġre": 400, + "Ġ\"": 401, + "es": 402, + "ĠS": 403, + "as": 404, + "al": 405, + "il": 406, + "el": 407, + "ion": 408, + "ĠA": 409, + "ĠC": 410, + "Ġ1": 411, + "ĠĊ": 412, + "ur": 413, + "ĠTh": 414, + "Ġn": 415, + "as": 416, + "Ġ@": 417, + "ec": 418, + "om": 419, + "ac": 420, + "Ġe": 421, + "Ġwas": 422, + "ĠM": 423, + "or": 424, + "an": 425, + "am": 426, + "en": 427, + "ol": 428, + "Ġin": 429, + "Ġg": 430, + "Ġ'": 431, + "ĠB": 432, + "ly": 433, + "at": 434, + "iv": 435, + "ts": 436, + "ĠThe": 437, + "us": 438, + "-@": 439, + "Ġ@-@": 440, + "is": 441, + "ĠI": 442, + "Ġwh": 443, + "ig": 444, + "ĠH": 445, + "Ġst": 446, + "os": 447, + "un": 448, + "th": 449, + "ĠP": 450, + "Ġwit": 451, + "Ġthat": 452, + "ir": 453, + "Ġas": 454, + "em": 455, + "Ġon": 456, + "ra": 457, + "Ġfor": 458, + "ĠR": 459, + "et": 460, + "ow": 461, + "Ġ2": 462, + "id": 463, + "ĠD": 464, + "le": 465, + "Ġwith": 466, + "la": 467, + "ent": 468, + "im": 469, + "ĠF": 470, + "ea": 471, + "ion": 472, + "Ġby": 473, + "Ġ)": 474, + "Ġ(": 475, + "Ġal": 476, + "Ġcon": 477, + "ent": 478, + "ĠW": 479, + "Ġis": 480, + "ere": 481, + "ĠG": 482, + "ĠN": 483, + "ĠL": 484, + "Ġha": 485, + "ers": 486, + "ri": 487, + "th": 488, + "ted": 489, + "uc": 490, + "ĠJ": 491, + "Ġ19": 492, + "ev": 493, + "ul": 494, + "Ġv": 495, + "ce": 496, + "ation": 497, + "rom": 498, + "Ġbe": 499, + "ĠE": 500, + "in": 501, + "Ġthe": 502, + "Ġfrom": 503, + "ĠO": 504, + "ter": 505, + "Ġpro": 506, + "Ġar": 507, + "ad": 508, + "Ġcom": 509, + "ic": 510, + "ag": 511, + "Ġhis": 512, + "Ġsh": 513, + "Ġat": 514, + "ov": 515, + "ies": 516, + "oo": 517, + "pp": 518, + "st": 519, + "ch": 520, + "Ġr": 521, + "Ġ20": 522, + "ay": 523, + "if": 524, + "Ġwere": 525, + "Ġch": 526, + "ut": 527, + "st": 528, + "ut": 529, + "ds": 530, + "op": 531, + "um": 532, + "Ġit": 533, + "oc": 534, + "ter": 535, + "le": 536, + "igh": 537, + "ud": 538, + "Ġex": 539, + "ions": 540, + "ate": 541, + "ity": 542, + "ated": 543, + "Ġun": 544, + "ep": 545, + "qu": 546, + "Ġno": 547, + "ĠK": 548, + "ive": 549, + "ist": 550, + "Ġon": 551, + "ame": 552, + "oun": 553, + "ir": 554, + "ab": 555, + "Ġâ": 556, + "ing": 557, + "Ġhe": 558, + "ld": 559, + "ug": 560, + "ich": 561, + "Ġan": 562, + "ed": 563, + "Ġk": 564, + "ĠâĢ": 565, + "Ġhad": 566, + "ve": 567, + "ain": 568, + "Ġse": 569, + "tion": 570, + "ore": 571, + "res": 572, + "Ġwhich": 573, + "ĠIn": 574, + "od": 575, + "ther": 576, + "ak": 577, + "Ġsp": 578, + "ar": 579, + "Ġy": 580, + "ĠCh": 581, + "ong": 582, + "Ġac": 583, + "est": 584, + "ĠU": 585, + "ap": 586, + "ff": 587, + "ally": 588, + "rit": 589, + "ĠSt": 590, + "ub": 591, + "ge": 592, + "ber": 593, + "et": 594, + "Ġbe": 595, + "ear": 596, + "Ġrec": 597, + "ers": 598, + "Ġfir": 599, + "ot": 600, + "Ġare": 601, + "Ġan": 602, + "ch": 603, + "og": 604, + "ia": 605, + "est": 606, + "ine": 607, + "ill": 608, + "and": 609, + "el": 610, + "ary": 611, + "ew": 612, + "id": 613, + "Ġfor": 614, + "Ġ;": 615, + "Ġcomp": 616, + "ĠV": 617, + "Ġinc": 618, + "tr": 619, + "Ġ200": 620, + "Ġtheir": 621, + "us": 622, + "Ġbut": 623, + "ran": 624, + "ical": 625, + "Ġfirst": 626, + "Ġde": 627, + "Ġint": 628, + "Ġro": 629, + "so": 630, + "ĠâĢĵ": 631, + "Ġnot": 632, + "ding": 633, + "fter": 634, + "ure": 635, + "Ġpar": 636, + "Ġ:": 637, + "ian": 638, + "Ġtw": 639, + "ould": 640, + "Ġalso": 641, + "Ġits": 642, + "Ġwor": 643, + "um": 644, + "Ġor": 645, + "ost": 646, + "00": 647, + "our": 648, + "ard": 649, + "Ġres": 650, + "mp": 651, + "ue": 652, + "Ġab": 653, + "ish": 654, + "Ġcont": 655, + "Ġad": 656, + "own": 657, + "all": 658, + "oug": 659, + "Ġher": 660, + "ast": 661, + "Ġen": 662, + "ome": 663, + "all": 664, + "ded": 665, + "ow": 666, + "Ġhave": 667, + "Ġus": 668, + "ear": 669, + "ack": 670, + "duc": 671, + "ial": 672, + "ss": 673, + "ents": 674, + "ain": 675, + "ting": 676, + "Ġone": 677, + "ess": 678, + "Ġhas": 679, + "ight": 680, + "av": 681, + "Ġev": 682, + "out": 683, + "ay": 684, + "ence": 685, + "Ġbeen": 686, + "ew": 687, + "Ġtwo": 688, + "Ġcl": 689, + "der": 690, + "ime": 691, + "ks": 692, + "ess": 693, + "ish": 694, + ".@": 695, + "Ġ@.@": 696, + "Ġpla": 697, + "Ġpl": 698, + "Ġor": 699, + "up": 700, + "ment": 701, + "uring": 702, + "oll": 703, + "ĠIn": 704, + "Ġthis": 705, + "Ġbec": 706, + "Ġcomm": 707, + "Ġdis": 708, + "ater": 709, + "age": 710, + "Ġapp": 711, + "ous": 712, + "ey": 713, + "il": 714, + "per": 715, + "ĠAl": 716, + "ional": 717, + "lud": 718, + "ely": 719, + "tt": 720, + "ile": 721, + "iz": 722, + "Ġj": 723, + "Ġwho": 724, + "Ġag": 725, + "ib": 726, + "Ġthey": 727, + "for": 728, + "Ġov": 729, + "ath": 730, + "eg": 731, + "Ġsc": 732, + "ip": 733, + "Ġ201": 734, + "Ġ3": 735, + "Ġper": 736, + "ory": 737, + "Ġdes": 738, + "ide": 739, + "Ġser": 740, + "se": 741, + "ĠHe": 742, + "land": 743, + "ations": 744, + "ric": 745, + "it": 746, + "res": 747, + "ered": 748, + "Ġpre": 749, + "ĠSh": 750, + "ance": 751, + "ort": 752, + "ant": 753, + ",@": 754, + "Ġ@,@": 755, + "ell": 756, + "ĠY": 757, + "ned": 758, + "ell": 759, + "ite": 760, + "Ġinclud": 761, + "Ġrep": 762, + "Ġafter": 763, + "Ġsuc": 764, + "ree": 765, + "any": 766, + "im": 767, + "ort": 768, + "Ġ18": 769, + "Ġsu": 770, + "ade": 771, + "our": 772, + "ĠUn": 773, + "ĠIt": 774, + "ik": 775, + "ĠMar": 776, + "ember": 777, + "Ġ1": 778, + "een": 779, + "and": 780, + "Ġsec": 781, + "ice": 782, + "Ġtime": 783, + "ĠAn": 784, + "Ġinto": 785, + "Ġfin": 786, + "Ġother": 787, + "Ġatt": 788, + "ill": 789, + "ren": 790, + "ach": 791, + "ass": 792, + "eral": 793, + "ese": 794, + "sh": 795, + "als": 796, + "ition": 797, + "ough": 798, + "les": 799, + "amp": 800, + "Ġwould": 801, + "Ġmore": 802, + "roug": 803, + "rib": 804, + "ery": 805, + "ace": 806, + "ĠA": 807, + "Ġplay": 808, + "ited": 809, + "ked": 810, + "ist": 811, + "ied": 812, + "Ġ2": 813, + "ased": 814, + "ings": 815, + "ang": 816, + "am": 817, + "ip": 818, + "Ġbo": 819, + "able": 820, + "ty": 821, + "Ġchar": 822, + "Ġcent": 823, + "etw": 824, + "ates": 825, + "rop": 826, + "ĠI": 827, + "und": 828, + "ĠAm": 829, + "ces": 830, + "oin": 831, + "Ġinter": 832, + "up": 833, + "ct": 834, + "one": 835, + "Ġtra": 836, + "ant": 837, + "ect": 838, + "Ġall": 839, + "ef": 840, + "Ġcons": 841, + "ubl": 842, + "ning": 843, + "ans": 844, + "Ġfe": 845, + "ust": 846, + "Ġ0": 847, + "Ġrem": 848, + "ase": 849, + "ong": 850, + "Ġwhen": 851, + "eb": 852, + "ĠWh": 853, + "Ġear": 854, + "ever": 855, + "Ġover": 856, + "Ġkn": 857, + "aus": 858, + "Ġpos": 859, + "ad": 860, + "erm": 861, + "Ġshe": 862, + "Ġra": 863, + "Ġduring": 864, + "ason": 865, + "vi": 866, + "Ġexp": 867, + "Ġlea": 868, + "Ġel": 869, + "Ġ4": 870, + "Ġonly": 871, + "ond": 872, + "Ġdec": 873, + "Ġacc": 874, + "Ġoff": 875, + "iss": 876, + "Ġfl": 877, + "ĠEn": 878, + "ot": 879, + "ens": 880, + "ose": 881, + "ake": 882, + "om": 883, + "Ġsev": 884, + "ach": 885, + "etween": 886, + "ern": 887, + "Ġ3": 888, + "Ġpr": 889, + "Ġgro": 890, + "ruc": 891, + "Ġdi": 892, + "Ġ199": 893, + "ĠAr": 894, + "Ġgame": 895, + "Ġhim": 896, + "ook": 897, + "Ġup": 898, + "Ġabout": 899, + "Ġrel": 900, + "form": 901, + "Ġthree": 902, + "att": 903, + "ĠCom": 904, + "Ġsa": 905, + "ears": 906, + "Ġ5": 907, + "ry": 908, + "Ġimp": 909, + "Ġmost": 910, + "fer": 911, + "Ġpres": 912, + "Ġfil": 913, + "Ġbetween": 914, + "Ġbeg": 915, + "ph": 916, + "ors": 917, + "Ġthan": 918, + "Ġrecor": 919, + "ob": 920, + "eric": 921, + "ating": 922, + "Ġthroug": 923, + "king": 924, + "Ġout": 925, + "Ġnum": 926, + "ood": 927, + "ollow": 928, + "act": 929, + "uil": 930, + "Ġcre": 931, + "olog": 932, + "ational": 933, + "Ġproduc": 934, + "Ġwhile": 935, + "Ġlater": 936, + "Ġwrit": 937, + "ex": 938, + "Ġstar": 939, + "Ġspec": 940, + "ee": 941, + "ished": 942, + "Ġreg": 943, + "ision": 944, + "outh": 945, + "Ġrele": 946, + "Ġass": 947, + "Ġseason": 948, + "Ġmade": 949, + "ily": 950, + "ru": 951, + "oy": 952, + "tur": 953, + "te": 954, + "Ġqu": 955, + "Ġmov": 956, + "ury": 957, + "ĠAmeric": 958, + "ement": 959, + "cc": 960, + "ound": 961, + "Ġlar": 962, + "Ġform": 963, + "ect": 964, + "Ġdef": 965, + "Ġmus": 966, + "ĠPar": 967, + "Ġme": 968, + "Ġsub": 969, + "way": 970, + "op": 971, + "oh": 972, + "eld": 973, + "ie": 974, + "emp": 975, + "ames": 976, + "ern": 977, + "Ġnor": 978, + "ived": 979, + "evel": 980, + "Ġsuch": 981, + "ards": 982, + "Ġind": 983, + "ike": 984, + "Ġgen": 985, + "ert": 986, + "Ġyear": 987, + "Ġused": 988, + "Ġnew": 989, + "Ġ5": 990, + "Ġalb": 991, + "sp": 992, + "yp": 993, + "Ġwith": 994, + "Ġwhere": 995, + "ics": 996, + "ĠThis": 997, + "Ġthem": 998, + "wn": 999 }, - "post_processor": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": false - }, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": true - }, - "model": { - "type": "BPE", - "dropout": null, - "unk_token": null, - "continuing_subword_prefix": "", - "end_of_word_suffix": "", - "fuse_unk": false, - "vocab": { - "<|startoftext|>": 0, - "<|endoftext|>": 1, - "!": 2, - "\"": 3, - "#": 4, - "$": 5, - "%": 6, - "&": 7, - "'": 8, - "(": 9, - ")": 10, - "*": 11, - "+": 12, - ",": 13, - "-": 14, - ".": 15, - "/": 16, - "0": 17, - "1": 18, - "2": 19, - "3": 20, - "4": 21, - "5": 22, - "6": 23, - "7": 24, - "8": 25, - "9": 26, - ":": 27, - ";": 28, - "<": 29, - "=": 30, - ">": 31, - "?": 32, - "@": 33, - "A": 34, - "B": 35, - "C": 36, - "D": 37, - "E": 38, - "F": 39, - "G": 40, - "H": 41, - "I": 42, - "J": 43, - "K": 44, - "L": 45, - "M": 46, - "N": 47, - "O": 48, - "P": 49, - "Q": 50, - "R": 51, - "S": 52, - "T": 53, - "U": 54, - "V": 55, - "W": 56, - "X": 57, - "Y": 58, - "Z": 59, - "[": 60, - "\\": 61, - "]": 62, - "^": 63, - "_": 64, - "`": 65, - "a": 66, - "b": 67, - "c": 68, - "d": 69, - "e": 70, - "f": 71, - "g": 72, - "h": 73, - "i": 74, - "j": 75, - "k": 76, - "l": 77, - "m": 78, - "n": 79, - "o": 80, - "p": 81, - "q": 82, - "r": 83, - "s": 84, - "t": 85, - "u": 86, - "v": 87, - "w": 88, - "x": 89, - "y": 90, - "z": 91, - "|": 92, - "}": 93, - "~": 94, - "¡": 95, - "¢": 96, - "£": 97, - "¤": 98, - "¥": 99 - }, - "merges": [] - } -} + "merges": [ + "Ġ t", + "Ġt h", + "Ġ a", + "Ġth e", + "i n", + "Ġ o", + "Ġ ,", + "Ġ s", + "e d", + "Ġ w", + "e r", + "Ġ .", + "Ġ i", + "r e", + "Ġ c", + "n d", + "Ġ f", + "Ġ b", + "a t", + "Ġo f", + "e r", + "e n", + "a r", + "o r", + "i t", + "Ġ p", + "Ġ h", + "Ġa nd", + "o n", + "in g", + "a n", + "r o", + "Ġ m", + "Ġ d", + "e s", + "Ġi n", + "o n", + "Ġt o", + "o u", + "i s", + "Ġ a", + "i c", + "Ġ T", + "a l", + "Ġ l", + "Ġ =", + "Ġ re", + "Ġ \"", + "e s", + "Ġ S", + "a s", + "a l", + "i l", + "e l", + "i on", + "Ġ A", + "Ġ C", + "Ġ 1", + "Ġ Ċ", + "u r", + "ĠT h", + "Ġ n", + "a s", + "Ġ @", + "e c", + "o m", + "a c", + "Ġ e", + "Ġw as", + "Ġ M", + "o r", + "a n", + "a m", + "e n", + "o l", + "Ġ in", + "Ġ g", + "Ġ '", + "Ġ B", + "l y", + "a t", + "i v", + "t s", + "ĠTh e", + "u s", + "- @", + "Ġ@ -@", + "i s", + "Ġ I", + "Ġw h", + "i g", + "Ġ H", + "Ġs t", + "o s", + "u n", + "t h", + "Ġ P", + "Ġw it", + "Ġth at", + "i r", + "Ġa s", + "e m", + "Ġo n", + "r a", + "Ġf or", + "Ġ R", + "e t", + "o w", + "Ġ 2", + "i d", + "Ġ D", + "l e", + "Ġwit h", + "l a", + "en t", + "i m", + "Ġ F", + "e a", + "i on", + "Ġb y", + "Ġ )", + "Ġ (", + "Ġa l", + "Ġc on", + "en t", + "Ġ W", + "Ġi s", + "er e", + "Ġ G", + "Ġ N", + "Ġ L", + "Ġh a", + "er s", + "r i", + "t h", + "t ed", + "u c", + "Ġ J", + "Ġ1 9", + "e v", + "u l", + "Ġ v", + "c e", + "at ion", + "ro m", + "Ġb e", + "Ġ E", + "i n", + "Ġth e", + "Ġf rom", + "Ġ O", + "t er", + "Ġp ro", + "Ġa r", + "a d", + "Ġc om", + "i c", + "a g", + "Ġh is", + "Ġs h", + "Ġa t", + "o v", + "i es", + "o o", + "p p", + "s t", + "c h", + "Ġ r", + "Ġ2 0", + "a y", + "i f", + "Ġw ere", + "Ġc h", + "u t", + "s t", + "u t", + "d s", + "o p", + "u m", + "Ġi t", + "o c", + "t er", + "l e", + "ig h", + "u d", + "Ġe x", + "ion s", + "at e", + "it y", + "at ed", + "Ġ un", + "e p", + "q u", + "Ġn o", + "Ġ K", + "iv e", + "is t", + "Ġo n", + "am e", + "ou n", + "i r", + "a b", + "Ġ â", + "in g", + "Ġh e", + "l d", + "u g", + "ic h", + "Ġa n", + "e d", + "Ġ k", + "Ġâ Ģ", + "Ġha d", + "v e", + "a in", + "Ġs e", + "t ion", + "or e", + "re s", + "Ġwh ich", + "ĠI n", + "o d", + "th er", + "a k", + "Ġs p", + "a r", + "Ġ y", + "ĠC h", + "on g", + "Ġa c", + "es t", + "Ġ U", + "a p", + "f f", + "al ly", + "r it", + "ĠS t", + "u b", + "g e", + "b er", + "e t", + "Ġb e", + "e ar", + "Ġre c", + "er s", + "Ġf ir", + "o t", + "Ġar e", + "Ġa n", + "c h", + "o g", + "i a", + "es t", + "in e", + "il l", + "an d", + "e l", + "ar y", + "e w", + "i d", + "Ġf or", + "Ġ ;", + "Ġcom p", + "Ġ V", + "Ġin c", + "t r", + "Ġ20 0", + "Ġthe ir", + "u s", + "Ġb ut", + "r an", + "ic al", + "Ġfir st", + "Ġd e", + "Ġin t", + "Ġ ro", + "s o", + "ĠâĢ ĵ", + "Ġno t", + "d ing", + "f ter", + "ur e", + "Ġp ar", + "Ġ :", + "i an", + "Ġt w", + "ou ld", + "Ġal so", + "Ġi ts", + "Ġw or", + "u m", + "Ġo r", + "os t", + "0 0", + "ou r", + "ar d", + "Ġre s", + "m p", + "u e", + "Ġa b", + "is h", + "Ġcon t", + "Ġa d", + "ow n", + "al l", + "ou g", + "Ġh er", + "as t", + "Ġ en", + "om e", + "al l", + "d ed", + "o w", + "Ġha ve", + "Ġ us", + "ea r", + "ac k", + "d uc", + "i al", + "s s", + "en ts", + "a in", + "t ing", + "Ġon e", + "es s", + "Ġh as", + "igh t", + "a v", + "Ġe v", + "ou t", + "a y", + "en ce", + "Ġbe en", + "e w", + "Ġtw o", + "Ġc l", + "d er", + "im e", + "k s", + "es s", + "is h", + ". @", + "Ġ@ .@", + "Ġp la", + "Ġp l", + "Ġo r", + "u p", + "m ent", + "ur ing", + "ol l", + "ĠI n", + "Ġth is", + "Ġb ec", + "Ġcom m", + "Ġd is", + "at er", + "ag e", + "Ġa pp", + "ou s", + "e y", + "i l", + "p er", + "ĠA l", + "ion al", + "l ud", + "el y", + "t t", + "il e", + "i z", + "Ġ j", + "Ġwh o", + "Ġa g", + "i b", + "Ġthe y", + "f or", + "Ġo v", + "at h", + "e g", + "Ġs c", + "i p", + "Ġ20 1", + "Ġ 3", + "Ġp er", + "or y", + "Ġd es", + "id e", + "Ġs er", + "s e", + "ĠH e", + "la nd", + "at ions", + "r ic", + "i t", + "re s", + "er ed", + "Ġp re", + "ĠS h", + "an ce", + "or t", + "an t", + ", @", + "Ġ@ ,@", + "el l", + "Ġ Y", + "n ed", + "el l", + "it e", + "Ġinc lud", + "Ġre p", + "Ġa fter", + "Ġs uc", + "re e", + "an y", + "i m", + "or t", + "Ġ1 8", + "Ġs u", + "ad e", + "ou r", + "ĠU n", + "ĠI t", + "i k", + "ĠM ar", + "em ber", + "Ġ 1", + "e en", + "a nd", + "Ġs ec", + "ic e", + "Ġt ime", + "ĠA n", + "Ġint o", + "Ġf in", + "Ġo ther", + "Ġa tt", + "il l", + "re n", + "ac h", + "as s", + "er al", + "es e", + "s h", + "al s", + "it ion", + "oug h", + "l es", + "am p", + "Ġw ould", + "Ġm ore", + "ro ug", + "ri b", + "er y", + "ac e", + "Ġ A", + "Ġpla y", + "it ed", + "k ed", + "is t", + "i ed", + "Ġ 2", + "as ed", + "ing s", + "an g", + "a m", + "i p", + "Ġb o", + "ab le", + "t y", + "Ġch ar", + "Ġc ent", + "et w", + "at es", + "ro p", + "Ġ I", + "u nd", + "ĠA m", + "c es", + "o in", + "Ġin ter", + "u p", + "c t", + "on e", + "Ġt ra", + "an t", + "ec t", + "Ġal l", + "e f", + "Ġcon s", + "ub l", + "n ing", + "an s", + "Ġf e", + "us t", + "Ġ 0", + "Ġre m", + "as e", + "on g", + "Ġwh en", + "e b", + "ĠW h", + "Ġe ar", + "ev er", + "Ġov er", + "Ġk n", + "a us", + "Ġp os", + "a d", + "er m", + "Ġsh e", + "Ġ ra", + "Ġd uring", + "as on", + "v i", + "Ġex p", + "Ġl ea", + "Ġ el", + "Ġ 4", + "Ġon ly", + "o nd", + "Ġd ec", + "Ġac c", + "Ġo ff", + "is s", + "Ġf l", + "ĠE n", + "o t", + "en s", + "os e", + "ak e", + "o m", + "Ġs ev", + "ac h", + "etw een", + "er n", + "Ġ 3", + "Ġp r", + "Ġg ro", + "r uc", + "Ġd i", + "Ġ19 9", + "ĠA r", + "Ġg ame", + "Ġh im", + "oo k", + "Ġ up", + "Ġab out", + "Ġre l", + "for m", + "Ġth ree", + "at t", + "ĠC om", + "Ġs a", + "ear s", + "Ġ 5", + "r y", + "Ġi mp", + "Ġm ost", + "f er", + "Ġp res", + "Ġf il", + "Ġb etween", + "Ġbe g", + "p h", + "or s", + "Ġth an", + "Ġrec or", + "o b", + "er ic", + "at ing", + "Ġth roug", + "k ing", + "Ġo ut", + "Ġn um", + "oo d", + "oll ow", + "ac t", + "u il", + "Ġc re", + "ol og", + "at ional", + "Ġpro duc", + "Ġwh ile", + "Ġl ater", + "Ġw rit", + "e x", + "Ġst ar", + "Ġsp ec", + "e e", + "ish ed", + "Ġre g", + "is ion", + "ou th", + "Ġre le", + "Ġa ss", + "Ġse ason", + "Ġm ade", + "il y", + "r u", + "o y", + "t ur", + "t e", + "Ġ qu", + "Ġm ov", + "ur y", + "ĠAm eric", + "em ent", + "c c", + "ou nd", + "Ġl ar", + "Ġfor m", + "ec t", + "Ġde f", + "Ġm us", + "ĠP ar", + "Ġm e", + "Ġs ub", + "w ay", + "o p", + "o h", + "el d", + "i e", + "em p", + "am es", + "er n", + "Ġn or", + "iv ed", + "ev el", + "Ġsuc h", + "ar ds", + "Ġin d", + "ik e", + "Ġg en", + "er t", + "Ġy ear", + "Ġus ed", + "Ġn ew", + "Ġ 5", + "Ġal b", + "s p", + "y p", + "Ġwit h", + "Ġwh ere", + "ic s", + "ĠTh is", + "Ġthe m", + "w n" + ] + } +} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json index 4b24d1c..e6372c2 100644 --- a/tokenizer_config.json +++ b/tokenizer_config.json @@ -1 +1 @@ -{"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": "<|endoftext|>", "add_prefix_space": false, "errors": "replace", "do_lower_case": true, "name_or_path": "openai/clip-vit-base-patch32", "model_max_length": 77, "special_tokens_map_file": "/home/lysandre/.cache/huggingface/transformers/18a566598f286c9139f88160c99f84eec492a26bd22738fa9cb44d5b7e0a5c76.cce1206abbad28826f000510f22f354e53e66a97f7c23745a7dfe27609cc07f5", "tokenizer_class": "CLIPTokenizer"} \ No newline at end of file +{"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": "<|endoftext|>", "add_prefix_space": false, "errors": "replace", "do_lower_case": true, "name_or_path": "hf-internal-testing/tiny-random-clip", "model_max_length": 77, "special_tokens_map_file": "/home/lysandre/.cache/huggingface/transformers/18a566598f286c9139f88160c99f84eec492a26bd22738fa9cb44d5b7e0a5c76.cce1206abbad28826f000510f22f354e53e66a97f7c23745a7dfe27609cc07f5", "from_slow": true, "tokenizer_class": "CLIPTokenizer"} \ No newline at end of file