Compare commits

...

10 Commits

| Author | SHA1 | Message | Date |
|--------|------|---------|------|
| harshit katyal | 7fd191edd9 | Update README.md | 2021-12-12 20:53:33 +00:00 |
| harshit katyal | 696c23195e | Update README.md | 2021-12-12 20:48:46 +00:00 |
| harshit katyal | 4783952888 | Update README.md | 2021-12-12 20:47:36 +00:00 |
| harshit katyal | 2d4fa4b591 | Update README.md | 2021-12-12 20:46:20 +00:00 |
| harshit katyal | 2c372c0aa6 | Update README.md | 2021-12-12 20:43:57 +00:00 |
| harshit katyal | f7cac3045a | Update README.md | 2021-12-12 20:42:26 +00:00 |
| harshit katyal | ca917012b8 | Upload pytorch_model.bin with git-lfs | 2021-12-12 20:37:02 +00:00 |
| harshit katyal | 3d7b607ac1 | Upload trainer_state.json | 2021-12-12 20:27:11 +00:00 |
| harshit katyal | 683b5f31be | Upload README.md | 2021-12-12 20:26:43 +00:00 |
| harshit katyal | edc42333c6 | Upload config.json | 2021-12-12 20:22:48 +00:00 |
4 changed files with 262 additions and 0 deletions

README.md (Normal file, 85 lines)

@@ -0,0 +1,85 @@
---
language: en
datasets:
- aesdd
tags:
- audio
- audio-classification
- speech
license: apache-2.0
---
~~~
# required packages
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa
~~~
# Prediction
~~~
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd
~~~
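Note that `Wav2Vec2ForSpeechClassification`, used in the next block, is not a class shipped with `transformers`; the card assumes a custom classification head. Below is a minimal sketch of such a class, assuming mean pooling over the encoder outputs (consistent with `pooling_mode: "mean"` and `classifier_proj_size: 256` in `config.json`); the exact layer names in the released checkpoint may differ.
~~~
import torch
import torch.nn as nn
from transformers import Wav2Vec2PreTrainedModel, Wav2Vec2Model
from transformers.modeling_outputs import SequenceClassifierOutput

class Wav2Vec2ClassificationHead(nn.Module):
    """Projection + classifier on top of the pooled wav2vec2 features (hypothetical sketch)."""
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.classifier_proj_size, config.num_labels)

    def forward(self, features):
        x = self.dropout(features)
        x = torch.tanh(self.dense(x))
        x = self.dropout(x)
        return self.out_proj(x)

class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """wav2vec2 encoder with mean pooling and a classification head (hypothetical sketch)."""
    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)
        self.init_weights()

    def forward(self, input_values, attention_mask=None):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs[0]                 # (batch, time, hidden)
        pooled = torch.mean(hidden_states, dim=1)  # mean pooling over the time axis
        logits = self.classifier(pooled)
        return SequenceClassifierOutput(logits=logits)
~~~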
~~~
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "harshit345/xlsr-wav2vec-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
~~~
~~~
def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits = model(**inputs).logits
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{"Emotion": config.id2label[i], "Score": f"{score * 100:.1f}%"} for i, score in enumerate(scores)]
    return outputs
~~~
# Prediction on a sample
~~~
# path for a sample
path = '/data/jtes_v1.1/wav/f01/ang/f01_ang_01.wav'
outputs = predict(path, sampling_rate)
~~~
~~~
[{'Emotion': 'anger', 'Score': '78.3%'},
{'Emotion': 'disgust', 'Score': '11.7%'},
{'Emotion': 'fear', 'Score': '5.4%'},
{'Emotion': 'happiness', 'Score': '4.1%'},
{'Emotion': 'sadness', 'Score': '0.5%'}]
~~~
## Evaluation
The following table summarizes the precision, recall, and F1-score obtained by the model for each emotion class, along with the overall accuracy. A small example of how such per-class metrics can be computed with scikit-learn is sketched after the table.

| Emotion   | Precision | Recall | F1-score |
|-----------|-----------|--------|----------|
| anger     | 0.82      | 1.00   | 0.81     |
| disgust   | 0.85      | 0.96   | 0.85     |
| fear      | 0.78      | 0.88   | 0.80     |
| happiness | 0.84      | 0.71   | 0.78     |
| sadness   | 0.86      | 1.00   | 0.79     |

Overall accuracy: 0.806
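For illustration only (this is not the authors' evaluation script), metrics of this kind can be produced with scikit-learn's `classification_report` and `accuracy_score`, assuming lists of true and predicted emotion labels for a test set; the label lists below are placeholders.
~~~
# Hypothetical example: y_true / y_pred are placeholders, not the actual test data.
from sklearn.metrics import accuracy_score, classification_report

y_true = ["anger", "disgust", "fear", "happiness", "sadness", "anger"]
y_pred = ["anger", "disgust", "fear", "sadness", "sadness", "anger"]

print(classification_report(y_true, y_pred, digits=2))      # per-class precision/recall/F1
print("Overall accuracy:", round(accuracy_score(y_true, y_pred), 3))
~~~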
## Colab Notebook
https://colab.research.google.com/drive/1aPPb_ZVS5dlFVZySly8Q80a44La1XjJu?usp=sharing

config.json (Normal file, 104 lines)

@@ -0,0 +1,104 @@
{
  "_name_or_path": "lighteternal/wav2vec2-large-xlsr-53-greek",
  "activation_dropout": 0.0,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.0,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "finetuning_task": "wav2vec2_clf",
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "anger",
    "1": "disgust",
    "2": "fear",
    "3": "happiness",
    "4": "sadness"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "anger": 0,
    "disgust": 1,
    "fear": 2,
    "happiness": 3,
    "sadness": 4
  },
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "num_negatives": 100,
  "pad_token_id": 54,
  "pooling_mode": "mean",
  "problem_type": "single_label_classification",
  "proj_codevector_dim": 256,
  "torch_dtype": "float32",
  "transformers_version": "4.11.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": 55
}
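As a quick, hypothetical check of the label mapping above (the repository id is taken from the README), `AutoConfig` exposes `id2label` exactly as it is used by `config.id2label[i]` in the prediction snippet.
~~~
from transformers import AutoConfig

# Hypothetical quick check of the label mapping stored in config.json.
config = AutoConfig.from_pretrained("harshit345/xlsr-wav2vec-speech-emotion-recognition")
print(config.num_labels)  # expected: 5
print(config.id2label)    # expected: anger, disgust, fear, happiness, sadness
~~~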

pytorch_model.bin (Normal file, binary, stored with Git LFS)

Binary file not shown.

trainer_state.json (Normal file, 70 lines)

@@ -0,0 +1,70 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.991735537190083,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.66,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.3311,
      "step": 100
    },
    {
      "epoch": 1.66,
      "eval_accuracy": 0.6033057570457458,
      "eval_loss": 1.0739655494689941,
      "eval_runtime": 74.6016,
      "eval_samples_per_second": 1.622,
      "eval_steps_per_second": 0.416,
      "step": 100
    },
    {
      "epoch": 3.33,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.6235,
      "step": 200
    },
    {
      "epoch": 3.33,
      "eval_accuracy": 0.8512396812438965,
      "eval_loss": 0.47899967432022095,
      "eval_runtime": 71.6672,
      "eval_samples_per_second": 1.688,
      "eval_steps_per_second": 0.433,
      "step": 200
    },
    {
      "epoch": 4.99,
      "learning_rate": 0.0,
      "loss": 0.2154,
      "step": 300
    },
    {
      "epoch": 4.99,
      "eval_accuracy": 0.9008264541625977,
      "eval_loss": 0.3769935369491577,
      "eval_runtime": 73.3118,
      "eval_samples_per_second": 1.65,
      "eval_steps_per_second": 0.423,
      "step": 300
    },
    {
      "epoch": 4.99,
      "step": 300,
      "total_flos": 4.279570856908188e+17,
      "train_loss": 0.7233316548665365,
      "train_runtime": 4793.1545,
      "train_samples_per_second": 0.504,
      "train_steps_per_second": 0.063
    }
  ],
  "max_steps": 300,
  "num_train_epochs": 5,
  "total_flos": 4.279570856908188e+17,
  "trial_name": null,
  "trial_params": null
}
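As a small sketch (the local file name is an assumption; adjust it to wherever the file was downloaded), the evaluation curve stored in `log_history` can be extracted from this file with plain Python.
~~~
import json

# Hypothetical local path to the trainer state file shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation entries and print step vs. accuracy/loss.
for entry in state["log_history"]:
    if "eval_accuracy" in entry:
        print(f'step {entry["step"]}: eval_accuracy={entry["eval_accuracy"]:.3f}, eval_loss={entry["eval_loss"]:.3f}')
~~~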