From 6403b73462056af26c6b84418152bc2474a86be9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 14 Nov 2022 18:44:26 +0000 Subject: [PATCH] Upload AudioSpectrogramTransformerForSequenceClassification --- config.json | 1081 +++++++++++++++++++++++++++++++++++++++++++++ pytorch_model.bin | 3 + 2 files changed, 1084 insertions(+) create mode 100644 config.json create mode 100644 pytorch_model.bin diff --git a/config.json b/config.json new file mode 100644 index 0000000..f2d2112 --- /dev/null +++ b/config.json @@ -0,0 +1,1081 @@ +{ + "architectures": [ + "AudioSpectrogramTransformerForSequenceClassification" + ], + "attention_probs_dropout_prob": 0.0, + "frequency_dimension": 128, + "frequency_stride": 10, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 768, + "id2label": { + "0": "Speech", + "1": "Male speech, man speaking", + "2": "Female speech, woman speaking", + "3": "Child speech, kid speaking", + "4": "Conversation", + "5": "Narration, monologue", + "6": "Babbling", + "7": "Speech synthesizer", + "8": "Shout", + "9": "Bellow", + "10": "Whoop", + "11": "Yell", + "12": "Battle cry", + "13": "Children shouting", + "14": "Screaming", + "15": "Whispering", + "16": "Laughter", + "17": "Baby laughter", + "18": "Giggle", + "19": "Snicker", + "20": "Belly laugh", + "21": "Chuckle, chortle", + "22": "Crying, sobbing", + "23": "Baby cry, infant cry", + "24": "Whimper", + "25": "Wail, moan", + "26": "Sigh", + "27": "Singing", + "28": "Choir", + "29": "Yodeling", + "30": "Chant", + "31": "Mantra", + "32": "Male singing", + "33": "Female singing", + "34": "Child singing", + "35": "Synthetic singing", + "36": "Rapping", + "37": "Humming", + "38": "Groan", + "39": "Grunt", + "40": "Whistling", + "41": "Breathing", + "42": "Wheeze", + "43": "Snoring", + "44": "Gasp", + "45": "Pant", + "46": "Snort", + "47": "Cough", + "48": "Throat clearing", + "49": "Sneeze", + "50": "Sniff", + "51": "Run", + "52": "Shuffle", + "53": "Walk, footsteps", + "54": "Chewing, mastication", + "55": "Biting", + "56": "Gargling", + "57": "Stomach rumble", + "58": "Burping, eructation", + "59": "Hiccup", + "60": "Fart", + "61": "Hands", + "62": "Finger snapping", + "63": "Clapping", + "64": "Heart sounds, heartbeat", + "65": "Heart murmur", + "66": "Cheering", + "67": "Applause", + "68": "Chatter", + "69": "Crowd", + "70": "Hubbub, speech noise, speech babble", + "71": "Children playing", + "72": "Animal", + "73": "Domestic animals, pets", + "74": "Dog", + "75": "Bark", + "76": "Yip", + "77": "Howl", + "78": "Bow-wow", + "79": "Growling", + "80": "Whimper (dog)", + "81": "Cat", + "82": "Purr", + "83": "Meow", + "84": "Hiss", + "85": "Caterwaul", + "86": "Livestock, farm animals, working animals", + "87": "Horse", + "88": "Clip-clop", + "89": "Neigh, whinny", + "90": "Cattle, bovinae", + "91": "Moo", + "92": "Cowbell", + "93": "Pig", + "94": "Oink", + "95": "Goat", + "96": "Bleat", + "97": "Sheep", + "98": "Fowl", + "99": "Chicken, rooster", + "100": "Cluck", + "101": "Crowing, cock-a-doodle-doo", + "102": "Turkey", + "103": "Gobble", + "104": "Duck", + "105": "Quack", + "106": "Goose", + "107": "Honk", + "108": "Wild animals", + "109": "Roaring cats (lions, tigers)", + "110": "Roar", + "111": "Bird", + "112": "Bird vocalization, bird call, bird song", + "113": "Chirp, tweet", + "114": "Squawk", + "115": "Pigeon, dove", + "116": "Coo", + "117": "Crow", + "118": "Caw", + "119": "Owl", + "120": "Hoot", + "121": "Bird flight, flapping wings", + "122": "Canidae, dogs, wolves", + "123": "Rodents, rats, mice", + "124": "Mouse", + "125": "Patter", + "126": "Insect", + "127": "Cricket", + "128": "Mosquito", + "129": "Fly, housefly", + "130": "Buzz", + "131": "Bee, wasp, etc.", + "132": "Frog", + "133": "Croak", + "134": "Snake", + "135": "Rattle", + "136": "Whale vocalization", + "137": "Music", + "138": "Musical instrument", + "139": "Plucked string instrument", + "140": "Guitar", + "141": "Electric guitar", + "142": "Bass guitar", + "143": "Acoustic guitar", + "144": "Steel guitar, slide guitar", + "145": "Tapping (guitar technique)", + "146": "Strum", + "147": "Banjo", + "148": "Sitar", + "149": "Mandolin", + "150": "Zither", + "151": "Ukulele", + "152": "Keyboard (musical)", + "153": "Piano", + "154": "Electric piano", + "155": "Organ", + "156": "Electronic organ", + "157": "Hammond organ", + "158": "Synthesizer", + "159": "Sampler", + "160": "Harpsichord", + "161": "Percussion", + "162": "Drum kit", + "163": "Drum machine", + "164": "Drum", + "165": "Snare drum", + "166": "Rimshot", + "167": "Drum roll", + "168": "Bass drum", + "169": "Timpani", + "170": "Tabla", + "171": "Cymbal", + "172": "Hi-hat", + "173": "Wood block", + "174": "Tambourine", + "175": "Rattle (instrument)", + "176": "Maraca", + "177": "Gong", + "178": "Tubular bells", + "179": "Mallet percussion", + "180": "Marimba, xylophone", + "181": "Glockenspiel", + "182": "Vibraphone", + "183": "Steelpan", + "184": "Orchestra", + "185": "Brass instrument", + "186": "French horn", + "187": "Trumpet", + "188": "Trombone", + "189": "Bowed string instrument", + "190": "String section", + "191": "Violin, fiddle", + "192": "Pizzicato", + "193": "Cello", + "194": "Double bass", + "195": "Wind instrument, woodwind instrument", + "196": "Flute", + "197": "Saxophone", + "198": "Clarinet", + "199": "Harp", + "200": "Bell", + "201": "Church bell", + "202": "Jingle bell", + "203": "Bicycle bell", + "204": "Tuning fork", + "205": "Chime", + "206": "Wind chime", + "207": "Change ringing (campanology)", + "208": "Harmonica", + "209": "Accordion", + "210": "Bagpipes", + "211": "Didgeridoo", + "212": "Shofar", + "213": "Theremin", + "214": "Singing bowl", + "215": "Scratching (performance technique)", + "216": "Pop music", + "217": "Hip hop music", + "218": "Beatboxing", + "219": "Rock music", + "220": "Heavy metal", + "221": "Punk rock", + "222": "Grunge", + "223": "Progressive rock", + "224": "Rock and roll", + "225": "Psychedelic rock", + "226": "Rhythm and blues", + "227": "Soul music", + "228": "Reggae", + "229": "Country", + "230": "Swing music", + "231": "Bluegrass", + "232": "Funk", + "233": "Folk music", + "234": "Middle Eastern music", + "235": "Jazz", + "236": "Disco", + "237": "Classical music", + "238": "Opera", + "239": "Electronic music", + "240": "House music", + "241": "Techno", + "242": "Dubstep", + "243": "Drum and bass", + "244": "Electronica", + "245": "Electronic dance music", + "246": "Ambient music", + "247": "Trance music", + "248": "Music of Latin America", + "249": "Salsa music", + "250": "Flamenco", + "251": "Blues", + "252": "Music for children", + "253": "New-age music", + "254": "Vocal music", + "255": "A capella", + "256": "Music of Africa", + "257": "Afrobeat", + "258": "Christian music", + "259": "Gospel music", + "260": "Music of Asia", + "261": "Carnatic music", + "262": "Music of Bollywood", + "263": "Ska", + "264": "Traditional music", + "265": "Independent music", + "266": "Song", + "267": "Background music", + "268": "Theme music", + "269": "Jingle (music)", + "270": "Soundtrack music", + "271": "Lullaby", + "272": "Video game music", + "273": "Christmas music", + "274": "Dance music", + "275": "Wedding music", + "276": "Happy music", + "277": "Funny music", + "278": "Sad music", + "279": "Tender music", + "280": "Exciting music", + "281": "Angry music", + "282": "Scary music", + "283": "Wind", + "284": "Rustling leaves", + "285": "Wind noise (microphone)", + "286": "Thunderstorm", + "287": "Thunder", + "288": "Water", + "289": "Rain", + "290": "Raindrop", + "291": "Rain on surface", + "292": "Stream", + "293": "Waterfall", + "294": "Ocean", + "295": "Waves, surf", + "296": "Steam", + "297": "Gurgling", + "298": "Fire", + "299": "Crackle", + "300": "Vehicle", + "301": "Boat, Water vehicle", + "302": "Sailboat, sailing ship", + "303": "Rowboat, canoe, kayak", + "304": "Motorboat, speedboat", + "305": "Ship", + "306": "Motor vehicle (road)", + "307": "Car", + "308": "Vehicle horn, car horn, honking", + "309": "Toot", + "310": "Car alarm", + "311": "Power windows, electric windows", + "312": "Skidding", + "313": "Tire squeal", + "314": "Car passing by", + "315": "Race car, auto racing", + "316": "Truck", + "317": "Air brake", + "318": "Air horn, truck horn", + "319": "Reversing beeps", + "320": "Ice cream truck, ice cream van", + "321": "Bus", + "322": "Emergency vehicle", + "323": "Police car (siren)", + "324": "Ambulance (siren)", + "325": "Fire engine, fire truck (siren)", + "326": "Motorcycle", + "327": "Traffic noise, roadway noise", + "328": "Rail transport", + "329": "Train", + "330": "Train whistle", + "331": "Train horn", + "332": "Railroad car, train wagon", + "333": "Train wheels squealing", + "334": "Subway, metro, underground", + "335": "Aircraft", + "336": "Aircraft engine", + "337": "Jet engine", + "338": "Propeller, airscrew", + "339": "Helicopter", + "340": "Fixed-wing aircraft, airplane", + "341": "Bicycle", + "342": "Skateboard", + "343": "Engine", + "344": "Light engine (high frequency)", + "345": "Dental drill, dentist's drill", + "346": "Lawn mower", + "347": "Chainsaw", + "348": "Medium engine (mid frequency)", + "349": "Heavy engine (low frequency)", + "350": "Engine knocking", + "351": "Engine starting", + "352": "Idling", + "353": "Accelerating, revving, vroom", + "354": "Door", + "355": "Doorbell", + "356": "Ding-dong", + "357": "Sliding door", + "358": "Slam", + "359": "Knock", + "360": "Tap", + "361": "Squeak", + "362": "Cupboard open or close", + "363": "Drawer open or close", + "364": "Dishes, pots, and pans", + "365": "Cutlery, silverware", + "366": "Chopping (food)", + "367": "Frying (food)", + "368": "Microwave oven", + "369": "Blender", + "370": "Water tap, faucet", + "371": "Sink (filling or washing)", + "372": "Bathtub (filling or washing)", + "373": "Hair dryer", + "374": "Toilet flush", + "375": "Toothbrush", + "376": "Electric toothbrush", + "377": "Vacuum cleaner", + "378": "Zipper (clothing)", + "379": "Keys jangling", + "380": "Coin (dropping)", + "381": "Scissors", + "382": "Electric shaver, electric razor", + "383": "Shuffling cards", + "384": "Typing", + "385": "Typewriter", + "386": "Computer keyboard", + "387": "Writing", + "388": "Alarm", + "389": "Telephone", + "390": "Telephone bell ringing", + "391": "Ringtone", + "392": "Telephone dialing, DTMF", + "393": "Dial tone", + "394": "Busy signal", + "395": "Alarm clock", + "396": "Siren", + "397": "Civil defense siren", + "398": "Buzzer", + "399": "Smoke detector, smoke alarm", + "400": "Fire alarm", + "401": "Foghorn", + "402": "Whistle", + "403": "Steam whistle", + "404": "Mechanisms", + "405": "Ratchet, pawl", + "406": "Clock", + "407": "Tick", + "408": "Tick-tock", + "409": "Gears", + "410": "Pulleys", + "411": "Sewing machine", + "412": "Mechanical fan", + "413": "Air conditioning", + "414": "Cash register", + "415": "Printer", + "416": "Camera", + "417": "Single-lens reflex camera", + "418": "Tools", + "419": "Hammer", + "420": "Jackhammer", + "421": "Sawing", + "422": "Filing (rasp)", + "423": "Sanding", + "424": "Power tool", + "425": "Drill", + "426": "Explosion", + "427": "Gunshot, gunfire", + "428": "Machine gun", + "429": "Fusillade", + "430": "Artillery fire", + "431": "Cap gun", + "432": "Fireworks", + "433": "Firecracker", + "434": "Burst, pop", + "435": "Eruption", + "436": "Boom", + "437": "Wood", + "438": "Chop", + "439": "Splinter", + "440": "Crack", + "441": "Glass", + "442": "Chink, clink", + "443": "Shatter", + "444": "Liquid", + "445": "Splash, splatter", + "446": "Slosh", + "447": "Squish", + "448": "Drip", + "449": "Pour", + "450": "Trickle, dribble", + "451": "Gush", + "452": "Fill (with liquid)", + "453": "Spray", + "454": "Pump (liquid)", + "455": "Stir", + "456": "Boiling", + "457": "Sonar", + "458": "Arrow", + "459": "Whoosh, swoosh, swish", + "460": "Thump, thud", + "461": "Thunk", + "462": "Electronic tuner", + "463": "Effects unit", + "464": "Chorus effect", + "465": "Basketball bounce", + "466": "Bang", + "467": "Slap, smack", + "468": "Whack, thwack", + "469": "Smash, crash", + "470": "Breaking", + "471": "Bouncing", + "472": "Whip", + "473": "Flap", + "474": "Scratch", + "475": "Scrape", + "476": "Rub", + "477": "Roll", + "478": "Crushing", + "479": "Crumpling, crinkling", + "480": "Tearing", + "481": "Beep, bleep", + "482": "Ping", + "483": "Ding", + "484": "Clang", + "485": "Squeal", + "486": "Creak", + "487": "Rustle", + "488": "Whir", + "489": "Clatter", + "490": "Sizzle", + "491": "Clicking", + "492": "Clickety-clack", + "493": "Rumble", + "494": "Plop", + "495": "Jingle, tinkle", + "496": "Hum", + "497": "Zing", + "498": "Boing", + "499": "Crunch", + "500": "Silence", + "501": "Sine wave", + "502": "Harmonic", + "503": "Chirp tone", + "504": "Sound effect", + "505": "Pulse", + "506": "Inside, small room", + "507": "Inside, large room or hall", + "508": "Inside, public space", + "509": "Outside, urban or manmade", + "510": "Outside, rural or natural", + "511": "Reverberation", + "512": "Echo", + "513": "Noise", + "514": "Environmental noise", + "515": "Static", + "516": "Mains hum", + "517": "Distortion", + "518": "Sidetone", + "519": "Cacophony", + "520": "White noise", + "521": "Pink noise", + "522": "Throbbing", + "523": "Vibration", + "524": "Television", + "525": "Radio", + "526": "Field recording" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "A capella": 255, + "Accelerating, revving, vroom": 353, + "Accordion": 209, + "Acoustic guitar": 143, + "Afrobeat": 257, + "Air brake": 317, + "Air conditioning": 413, + "Air horn, truck horn": 318, + "Aircraft": 335, + "Aircraft engine": 336, + "Alarm": 388, + "Alarm clock": 395, + "Ambient music": 246, + "Ambulance (siren)": 324, + "Angry music": 281, + "Animal": 72, + "Applause": 67, + "Arrow": 458, + "Artillery fire": 430, + "Babbling": 6, + "Baby cry, infant cry": 23, + "Baby laughter": 17, + "Background music": 267, + "Bagpipes": 210, + "Bang": 466, + "Banjo": 147, + "Bark": 75, + "Basketball bounce": 465, + "Bass drum": 168, + "Bass guitar": 142, + "Bathtub (filling or washing)": 372, + "Battle cry": 12, + "Beatboxing": 218, + "Bee, wasp, etc.": 131, + "Beep, bleep": 481, + "Bell": 200, + "Bellow": 9, + "Belly laugh": 20, + "Bicycle": 341, + "Bicycle bell": 203, + "Bird": 111, + "Bird flight, flapping wings": 121, + "Bird vocalization, bird call, bird song": 112, + "Biting": 55, + "Bleat": 96, + "Blender": 369, + "Bluegrass": 231, + "Blues": 251, + "Boat, Water vehicle": 301, + "Boiling": 456, + "Boing": 498, + "Boom": 436, + "Bouncing": 471, + "Bow-wow": 78, + "Bowed string instrument": 189, + "Brass instrument": 185, + "Breaking": 470, + "Breathing": 41, + "Burping, eructation": 58, + "Burst, pop": 434, + "Bus": 321, + "Busy signal": 394, + "Buzz": 130, + "Buzzer": 398, + "Cacophony": 519, + "Camera": 416, + "Canidae, dogs, wolves": 122, + "Cap gun": 431, + "Car": 307, + "Car alarm": 310, + "Car passing by": 314, + "Carnatic music": 261, + "Cash register": 414, + "Cat": 81, + "Caterwaul": 85, + "Cattle, bovinae": 90, + "Caw": 118, + "Cello": 193, + "Chainsaw": 347, + "Change ringing (campanology)": 207, + "Chant": 30, + "Chatter": 68, + "Cheering": 66, + "Chewing, mastication": 54, + "Chicken, rooster": 99, + "Child singing": 34, + "Child speech, kid speaking": 3, + "Children playing": 71, + "Children shouting": 13, + "Chime": 205, + "Chink, clink": 442, + "Chirp tone": 503, + "Chirp, tweet": 113, + "Choir": 28, + "Chop": 438, + "Chopping (food)": 366, + "Chorus effect": 464, + "Christian music": 258, + "Christmas music": 273, + "Chuckle, chortle": 21, + "Church bell": 201, + "Civil defense siren": 397, + "Clang": 484, + "Clapping": 63, + "Clarinet": 198, + "Classical music": 237, + "Clatter": 489, + "Clickety-clack": 492, + "Clicking": 491, + "Clip-clop": 88, + "Clock": 406, + "Cluck": 100, + "Coin (dropping)": 380, + "Computer keyboard": 386, + "Conversation": 4, + "Coo": 116, + "Cough": 47, + "Country": 229, + "Cowbell": 92, + "Crack": 440, + "Crackle": 299, + "Creak": 486, + "Cricket": 127, + "Croak": 133, + "Crow": 117, + "Crowd": 69, + "Crowing, cock-a-doodle-doo": 101, + "Crumpling, crinkling": 479, + "Crunch": 499, + "Crushing": 478, + "Crying, sobbing": 22, + "Cupboard open or close": 362, + "Cutlery, silverware": 365, + "Cymbal": 171, + "Dance music": 274, + "Dental drill, dentist's drill": 345, + "Dial tone": 393, + "Didgeridoo": 211, + "Ding": 483, + "Ding-dong": 356, + "Disco": 236, + "Dishes, pots, and pans": 364, + "Distortion": 517, + "Dog": 74, + "Domestic animals, pets": 73, + "Door": 354, + "Doorbell": 355, + "Double bass": 194, + "Drawer open or close": 363, + "Drill": 425, + "Drip": 448, + "Drum": 164, + "Drum and bass": 243, + "Drum kit": 162, + "Drum machine": 163, + "Drum roll": 167, + "Dubstep": 242, + "Duck": 104, + "Echo": 512, + "Effects unit": 463, + "Electric guitar": 141, + "Electric piano": 154, + "Electric shaver, electric razor": 382, + "Electric toothbrush": 376, + "Electronic dance music": 245, + "Electronic music": 239, + "Electronic organ": 156, + "Electronic tuner": 462, + "Electronica": 244, + "Emergency vehicle": 322, + "Engine": 343, + "Engine knocking": 350, + "Engine starting": 351, + "Environmental noise": 514, + "Eruption": 435, + "Exciting music": 280, + "Explosion": 426, + "Fart": 60, + "Female singing": 33, + "Female speech, woman speaking": 2, + "Field recording": 526, + "Filing (rasp)": 422, + "Fill (with liquid)": 452, + "Finger snapping": 62, + "Fire": 298, + "Fire alarm": 400, + "Fire engine, fire truck (siren)": 325, + "Firecracker": 433, + "Fireworks": 432, + "Fixed-wing aircraft, airplane": 340, + "Flamenco": 250, + "Flap": 473, + "Flute": 196, + "Fly, housefly": 129, + "Foghorn": 401, + "Folk music": 233, + "Fowl": 98, + "French horn": 186, + "Frog": 132, + "Frying (food)": 367, + "Funk": 232, + "Funny music": 277, + "Fusillade": 429, + "Gargling": 56, + "Gasp": 44, + "Gears": 409, + "Giggle": 18, + "Glass": 441, + "Glockenspiel": 181, + "Goat": 95, + "Gobble": 103, + "Gong": 177, + "Goose": 106, + "Gospel music": 259, + "Groan": 38, + "Growling": 79, + "Grunge": 222, + "Grunt": 39, + "Guitar": 140, + "Gunshot, gunfire": 427, + "Gurgling": 297, + "Gush": 451, + "Hair dryer": 373, + "Hammer": 419, + "Hammond organ": 157, + "Hands": 61, + "Happy music": 276, + "Harmonic": 502, + "Harmonica": 208, + "Harp": 199, + "Harpsichord": 160, + "Heart murmur": 65, + "Heart sounds, heartbeat": 64, + "Heavy engine (low frequency)": 349, + "Heavy metal": 220, + "Helicopter": 339, + "Hi-hat": 172, + "Hiccup": 59, + "Hip hop music": 217, + "Hiss": 84, + "Honk": 107, + "Hoot": 120, + "Horse": 87, + "House music": 240, + "Howl": 77, + "Hubbub, speech noise, speech babble": 70, + "Hum": 496, + "Humming": 37, + "Ice cream truck, ice cream van": 320, + "Idling": 352, + "Independent music": 265, + "Insect": 126, + "Inside, large room or hall": 507, + "Inside, public space": 508, + "Inside, small room": 506, + "Jackhammer": 420, + "Jazz": 235, + "Jet engine": 337, + "Jingle (music)": 269, + "Jingle bell": 202, + "Jingle, tinkle": 495, + "Keyboard (musical)": 152, + "Keys jangling": 379, + "Knock": 359, + "Laughter": 16, + "Lawn mower": 346, + "Light engine (high frequency)": 344, + "Liquid": 444, + "Livestock, farm animals, working animals": 86, + "Lullaby": 271, + "Machine gun": 428, + "Mains hum": 516, + "Male singing": 32, + "Male speech, man speaking": 1, + "Mallet percussion": 179, + "Mandolin": 149, + "Mantra": 31, + "Maraca": 176, + "Marimba, xylophone": 180, + "Mechanical fan": 412, + "Mechanisms": 404, + "Medium engine (mid frequency)": 348, + "Meow": 83, + "Microwave oven": 368, + "Middle Eastern music": 234, + "Moo": 91, + "Mosquito": 128, + "Motor vehicle (road)": 306, + "Motorboat, speedboat": 304, + "Motorcycle": 326, + "Mouse": 124, + "Music": 137, + "Music for children": 252, + "Music of Africa": 256, + "Music of Asia": 260, + "Music of Bollywood": 262, + "Music of Latin America": 248, + "Musical instrument": 138, + "Narration, monologue": 5, + "Neigh, whinny": 89, + "New-age music": 253, + "Noise": 513, + "Ocean": 294, + "Oink": 94, + "Opera": 238, + "Orchestra": 184, + "Organ": 155, + "Outside, rural or natural": 510, + "Outside, urban or manmade": 509, + "Owl": 119, + "Pant": 45, + "Patter": 125, + "Percussion": 161, + "Piano": 153, + "Pig": 93, + "Pigeon, dove": 115, + "Ping": 482, + "Pink noise": 521, + "Pizzicato": 192, + "Plop": 494, + "Plucked string instrument": 139, + "Police car (siren)": 323, + "Pop music": 216, + "Pour": 449, + "Power tool": 424, + "Power windows, electric windows": 311, + "Printer": 415, + "Progressive rock": 223, + "Propeller, airscrew": 338, + "Psychedelic rock": 225, + "Pulleys": 410, + "Pulse": 505, + "Pump (liquid)": 454, + "Punk rock": 221, + "Purr": 82, + "Quack": 105, + "Race car, auto racing": 315, + "Radio": 525, + "Rail transport": 328, + "Railroad car, train wagon": 332, + "Rain": 289, + "Rain on surface": 291, + "Raindrop": 290, + "Rapping": 36, + "Ratchet, pawl": 405, + "Rattle": 135, + "Rattle (instrument)": 175, + "Reggae": 228, + "Reverberation": 511, + "Reversing beeps": 319, + "Rhythm and blues": 226, + "Rimshot": 166, + "Ringtone": 391, + "Roar": 110, + "Roaring cats (lions, tigers)": 109, + "Rock and roll": 224, + "Rock music": 219, + "Rodents, rats, mice": 123, + "Roll": 477, + "Rowboat, canoe, kayak": 303, + "Rub": 476, + "Rumble": 493, + "Run": 51, + "Rustle": 487, + "Rustling leaves": 284, + "Sad music": 278, + "Sailboat, sailing ship": 302, + "Salsa music": 249, + "Sampler": 159, + "Sanding": 423, + "Sawing": 421, + "Saxophone": 197, + "Scary music": 282, + "Scissors": 381, + "Scrape": 475, + "Scratch": 474, + "Scratching (performance technique)": 215, + "Screaming": 14, + "Sewing machine": 411, + "Shatter": 443, + "Sheep": 97, + "Ship": 305, + "Shofar": 212, + "Shout": 8, + "Shuffle": 52, + "Shuffling cards": 383, + "Sidetone": 518, + "Sigh": 26, + "Silence": 500, + "Sine wave": 501, + "Singing": 27, + "Singing bowl": 214, + "Single-lens reflex camera": 417, + "Sink (filling or washing)": 371, + "Siren": 396, + "Sitar": 148, + "Sizzle": 490, + "Ska": 263, + "Skateboard": 342, + "Skidding": 312, + "Slam": 358, + "Slap, smack": 467, + "Sliding door": 357, + "Slosh": 446, + "Smash, crash": 469, + "Smoke detector, smoke alarm": 399, + "Snake": 134, + "Snare drum": 165, + "Sneeze": 49, + "Snicker": 19, + "Sniff": 50, + "Snoring": 43, + "Snort": 46, + "Sonar": 457, + "Song": 266, + "Soul music": 227, + "Sound effect": 504, + "Soundtrack music": 270, + "Speech": 0, + "Speech synthesizer": 7, + "Splash, splatter": 445, + "Splinter": 439, + "Spray": 453, + "Squawk": 114, + "Squeak": 361, + "Squeal": 485, + "Squish": 447, + "Static": 515, + "Steam": 296, + "Steam whistle": 403, + "Steel guitar, slide guitar": 144, + "Steelpan": 183, + "Stir": 455, + "Stomach rumble": 57, + "Stream": 292, + "String section": 190, + "Strum": 146, + "Subway, metro, underground": 334, + "Swing music": 230, + "Synthesizer": 158, + "Synthetic singing": 35, + "Tabla": 170, + "Tambourine": 174, + "Tap": 360, + "Tapping (guitar technique)": 145, + "Tearing": 480, + "Techno": 241, + "Telephone": 389, + "Telephone bell ringing": 390, + "Telephone dialing, DTMF": 392, + "Television": 524, + "Tender music": 279, + "Theme music": 268, + "Theremin": 213, + "Throat clearing": 48, + "Throbbing": 522, + "Thump, thud": 460, + "Thunder": 287, + "Thunderstorm": 286, + "Thunk": 461, + "Tick": 407, + "Tick-tock": 408, + "Timpani": 169, + "Tire squeal": 313, + "Toilet flush": 374, + "Tools": 418, + "Toot": 309, + "Toothbrush": 375, + "Traditional music": 264, + "Traffic noise, roadway noise": 327, + "Train": 329, + "Train horn": 331, + "Train wheels squealing": 333, + "Train whistle": 330, + "Trance music": 247, + "Trickle, dribble": 450, + "Trombone": 188, + "Truck": 316, + "Trumpet": 187, + "Tubular bells": 178, + "Tuning fork": 204, + "Turkey": 102, + "Typewriter": 385, + "Typing": 384, + "Ukulele": 151, + "Vacuum cleaner": 377, + "Vehicle": 300, + "Vehicle horn, car horn, honking": 308, + "Vibraphone": 182, + "Vibration": 523, + "Video game music": 272, + "Violin, fiddle": 191, + "Vocal music": 254, + "Wail, moan": 25, + "Walk, footsteps": 53, + "Water": 288, + "Water tap, faucet": 370, + "Waterfall": 293, + "Waves, surf": 295, + "Wedding music": 275, + "Whack, thwack": 468, + "Whale vocalization": 136, + "Wheeze": 42, + "Whimper": 24, + "Whimper (dog)": 80, + "Whip": 472, + "Whir": 488, + "Whispering": 15, + "Whistle": 402, + "Whistling": 40, + "White noise": 520, + "Whoop": 10, + "Whoosh, swoosh, swish": 459, + "Wild animals": 108, + "Wind": 283, + "Wind chime": 206, + "Wind instrument, woodwind instrument": 195, + "Wind noise (microphone)": 285, + "Wood": 437, + "Wood block": 173, + "Writing": 387, + "Yell": 11, + "Yip": 76, + "Yodeling": 29, + "Zing": 497, + "Zipper (clothing)": 378, + "Zither": 150 + }, + "layer_norm_eps": 1e-12, + "model_type": "audio-spectrogram-transformer", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "patch_size": 16, + "qkv_bias": true, + "time_dimension": 1024, + "time_stride": 10, + "torch_dtype": "float32", + "transformers_version": "4.25.0.dev0" +} diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..41490b4 --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935626b2736983691f1b56215196329e052509f666c80263815aa87a41101d8d +size 346445611