Update README.md

harshit katyal 2021-12-12 20:42:26 +00:00 committed by huggingface-web
parent ca917012b8
commit f7cac3045a
1 changed file with 48 additions and 40 deletions


---
language: en
datasets:
- Toronto emotional speech set (TESS) (https://www.kaggle.com/ejlok1/toronto-emotional-speech-set-tess)
tags:
- audio
- automatic-speech-recognition
- speech
- speech-emotion-recognition
license: apache-2.0
---
# Emotion Recognition in Speech using Wav2Vec 2.0
## How to use
### Requirements
```bash
# required packages
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa
# the Wav2Vec2ForSpeechClassification class used below is defined in the soxan repository
!git clone https://github.com/m3hrdadfi/soxan
cd soxan
```
### Prediction
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd

# Wav2Vec2ForSpeechClassification is provided by the soxan repository cloned above;
# the exact module path is an assumption and may differ in your checkout
from src.models import Wav2Vec2ForSpeechClassification
```
```python
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "harshit345/xlsr-wav2vec-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
```
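If you want to see which emotion classes the checkpoint predicts before running inference, the label mapping is available on the loaded config; this quick optional check is not part of the original card:
```python
# optional: inspect the emotion labels the model was trained on
print(config.id2label)
```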
```python
def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    # resample the audio to the sampling rate expected by the model
    resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits = model(**inputs).logits
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
    return outputs
```
```python
# path to a sample audio file
path = "/path/to/disgust.wav"
outputs = predict(path, sampling_rate)
```
```bash
[
{'Emotion': 'anger', 'Score': '12.2%'},
{'Emotion': 'disgust', 'Score': '78.8%'},
{'Emotion': 'fear', 'Score': '7.2%'},
{'Emotion': 'happiness', 'Score': '1.3%'},
{'Emotion': 'sadness', 'Score': '1.5%'}
]
```
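To reduce `outputs` to the single most likely emotion, you can pick the entry with the highest score; a minimal sketch, not part of the original card (note that `Score` values are percentage strings):
```python
# pick the emotion with the highest predicted score
top = max(outputs, key=lambda o: float(o["Score"].rstrip("%")))
print(top["Emotion"], top["Score"])
```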
## Evaluation
The following table summarizes the scores obtained by the model, overall and per class.

| Emotions  | precision | recall | f1-score | accuracy |
|-----------|-----------|--------|----------|----------|
| anger | 0.82 | 1.00 | 0.81 | |
| disgust | 0.85 | 0.96 | 0.85 | |
| fear | 0.78 | 0.88 | 0.80 | |
| happiness | 0.84 | 0.71 | 0.78 | |
| sadness | 0.86 | 1.00 | 0.79 | |
| Overall   |           |        |          | 0.806    |
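Per-class precision, recall, and f1 scores like those above can be computed with scikit-learn's `classification_report`, given a labelled test set. The sketch below reuses `predict` from this card, assumes scikit-learn is installed, and uses placeholder file paths and labels rather than the original evaluation data:
```python
from sklearn.metrics import classification_report

# hypothetical labelled test set: (audio_path, true_label) pairs
test_set = [
    ("/path/to/angry_clip.wav", "anger"),
    ("/path/to/disgust_clip.wav", "disgust"),
]

y_true, y_pred = [], []
for audio_path, label in test_set:
    scores = predict(audio_path, sampling_rate)
    # keep the emotion with the highest predicted score
    best = max(scores, key=lambda o: float(o["Score"].rstrip("%")))
    y_true.append(label)
    y_pred.append(best["Emotion"])

print(classification_report(y_true, y_pred, digits=2))
```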
## Colab Notebook
https://colab.research.google.com/drive/1aPPb_ZVS5dlFVZySly8Q80a44La1XjJu?usp=sharing