Compare commits

...

10 Commits

| Author | SHA1 | Message | Date |
|--------|------|---------|------|
| harshit katyal | 7fd191edd9 | Update README.md | 2021-12-12 20:53:33 +00:00 |
| harshit katyal | 696c23195e | Update README.md | 2021-12-12 20:48:46 +00:00 |
| harshit katyal | 4783952888 | Update README.md | 2021-12-12 20:47:36 +00:00 |
| harshit katyal | 2d4fa4b591 | Update README.md | 2021-12-12 20:46:20 +00:00 |
| harshit katyal | 2c372c0aa6 | Update README.md | 2021-12-12 20:43:57 +00:00 |
| harshit katyal | f7cac3045a | Update README.md | 2021-12-12 20:42:26 +00:00 |
| harshit katyal | ca917012b8 | Upload pytorch_model.bin with git-lfs | 2021-12-12 20:37:02 +00:00 |
| harshit katyal | 3d7b607ac1 | Upload trainer_state.json | 2021-12-12 20:27:11 +00:00 |
| harshit katyal | 683b5f31be | Upload README.md | 2021-12-12 20:26:43 +00:00 |
| harshit katyal | edc42333c6 | Upload config.json | 2021-12-12 20:22:48 +00:00 |
4 changed files with 262 additions and 0 deletions

README.md (Normal file, 85 lines)

@@ -0,0 +1,85 @@
---
language: en
datasets:
- aesdd
tags:
- audio
- audio-classification
- speech
license: apache-2.0
---
~~~
# required packages
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa
~~~
# Prediction
~~~
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd
~~~
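Note that `Wav2Vec2ForSpeechClassification`, used in the next block, is not a class shipped with `transformers`; the card assumes a custom classification head. Below is a minimal sketch of such a class, assuming mean pooling over the encoder outputs (consistent with `pooling_mode: "mean"` and `classifier_proj_size: 256` in `config.json`); the exact layer names in the released checkpoint may differ.
~~~
import torch
import torch.nn as nn
from transformers import Wav2Vec2PreTrainedModel, Wav2Vec2Model
from transformers.modeling_outputs import SequenceClassifierOutput

class Wav2Vec2ClassificationHead(nn.Module):
    """Projection + classifier on top of the pooled wav2vec2 features (hypothetical sketch)."""
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.classifier_proj_size, config.num_labels)

    def forward(self, features):
        x = self.dropout(features)
        x = torch.tanh(self.dense(x))
        x = self.dropout(x)
        return self.out_proj(x)

class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """wav2vec2 encoder with mean pooling and a classification head (hypothetical sketch)."""
    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)
        self.init_weights()

    def forward(self, input_values, attention_mask=None):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs[0]                 # (batch, time, hidden)
        pooled = torch.mean(hidden_states, dim=1)  # mean pooling over the time axis
        logits = self.classifier(pooled)
        return SequenceClassifierOutput(logits=logits)
~~~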
~~~
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "harshit345/xlsr-wav2vec-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
~~~
~~~
def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits = model(**inputs).logits
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{"Emotion": config.id2label[i], "Score": f"{score * 100:.1f}%"} for i, score in enumerate(scores)]
    return outputs
~~~
# Prediction on a sample
~~~
# path for a sample
path = '/data/jtes_v1.1/wav/f01/ang/f01_ang_01.wav'
outputs = predict(path, sampling_rate)
~~~
~~~
[{'Emotion': 'anger', 'Score': '78.3%'},
{'Emotion': 'disgust', 'Score': '11.7%'},
{'Emotion': 'fear', 'Score': '5.4%'},
{'Emotion': 'happiness', 'Score': '4.1%'},
{'Emotion': 'sadness', 'Score': '0.5%'}]
~~~
## Evaluation
The following table summarizes the precision, recall, and F1-score obtained by the model for each emotion class, along with the overall accuracy. A small example of how such per-class metrics can be computed with scikit-learn is sketched after the table.

| Emotion   | Precision | Recall | F1-score |
|-----------|-----------|--------|----------|
| anger     | 0.82      | 1.00   | 0.81     |
| disgust   | 0.85      | 0.96   | 0.85     |
| fear      | 0.78      | 0.88   | 0.80     |
| happiness | 0.84      | 0.71   | 0.78     |
| sadness   | 0.86      | 1.00   | 0.79     |

Overall accuracy: 0.806
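For illustration only (this is not the authors' evaluation script), metrics of this kind can be produced with scikit-learn's `classification_report` and `accuracy_score`, assuming lists of true and predicted emotion labels for a test set; the label lists below are placeholders.
~~~
# Hypothetical example: y_true / y_pred are placeholders, not the actual test data.
from sklearn.metrics import accuracy_score, classification_report

y_true = ["anger", "disgust", "fear", "happiness", "sadness", "anger"]
y_pred = ["anger", "disgust", "fear", "sadness", "sadness", "anger"]

print(classification_report(y_true, y_pred, digits=2))      # per-class precision/recall/F1
print("Overall accuracy:", round(accuracy_score(y_true, y_pred), 3))
~~~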
## Colab Notebook
https://colab.research.google.com/drive/1aPPb_ZVS5dlFVZySly8Q80a44La1XjJu?usp=sharing

config.json (Normal file, 104 lines)

@@ -0,0 +1,104 @@
{
  "_name_or_path": "lighteternal/wav2vec2-large-xlsr-53-greek",
  "activation_dropout": 0.0,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.0,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "finetuning_task": "wav2vec2_clf",
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "anger",
    "1": "disgust",
    "2": "fear",
    "3": "happiness",
    "4": "sadness"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "anger": 0,
    "disgust": 1,
    "fear": 2,
    "happiness": 3,
    "sadness": 4
  },
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "num_negatives": 100,
  "pad_token_id": 54,
  "pooling_mode": "mean",
  "problem_type": "single_label_classification",
  "proj_codevector_dim": 256,
  "torch_dtype": "float32",
  "transformers_version": "4.11.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": 55
}
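As a quick, hypothetical check of the label mapping above (the repository id is taken from the README), `AutoConfig` exposes `id2label` exactly as it is used by `config.id2label[i]` in the prediction snippet.
~~~
from transformers import AutoConfig

# Hypothetical quick check of the label mapping stored in config.json.
config = AutoConfig.from_pretrained("harshit345/xlsr-wav2vec-speech-emotion-recognition")
print(config.num_labels)  # expected: 5
print(config.id2label)    # expected: anger, disgust, fear, happiness, sadness
~~~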

pytorch_model.bin (Normal file, binary, stored with Git LFS)

Binary file not shown.

trainer_state.json (Normal file, 70 lines)

@@ -0,0 +1,70 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.991735537190083,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.66,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.3311,
      "step": 100
    },
    {
      "epoch": 1.66,
      "eval_accuracy": 0.6033057570457458,
      "eval_loss": 1.0739655494689941,
      "eval_runtime": 74.6016,
      "eval_samples_per_second": 1.622,
      "eval_steps_per_second": 0.416,
      "step": 100
    },
    {
      "epoch": 3.33,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.6235,
      "step": 200
    },
    {
      "epoch": 3.33,
      "eval_accuracy": 0.8512396812438965,
      "eval_loss": 0.47899967432022095,
      "eval_runtime": 71.6672,
      "eval_samples_per_second": 1.688,
      "eval_steps_per_second": 0.433,
      "step": 200
    },
    {
      "epoch": 4.99,
      "learning_rate": 0.0,
      "loss": 0.2154,
      "step": 300
    },
    {
      "epoch": 4.99,
      "eval_accuracy": 0.9008264541625977,
      "eval_loss": 0.3769935369491577,
      "eval_runtime": 73.3118,
      "eval_samples_per_second": 1.65,
      "eval_steps_per_second": 0.423,
      "step": 300
    },
    {
      "epoch": 4.99,
      "step": 300,
      "total_flos": 4.279570856908188e+17,
      "train_loss": 0.7233316548665365,
      "train_runtime": 4793.1545,
      "train_samples_per_second": 0.504,
      "train_steps_per_second": 0.063
    }
  ],
  "max_steps": 300,
  "num_train_epochs": 5,
  "total_flos": 4.279570856908188e+17,
  "trial_name": null,
  "trial_params": null
}
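As a small sketch (the local file name is an assumption; adjust it to wherever the file was downloaded), the evaluation curve stored in `log_history` can be extracted from this file with plain Python.
~~~
import json

# Hypothetical local path to the trainer state file shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation entries and print step vs. accuracy/loss.
for entry in state["log_history"]:
    if "eval_accuracy" in entry:
        print(f'step {entry["step"]}: eval_accuracy={entry["eval_accuracy"]:.3f}, eval_loss={entry["eval_loss"]:.3f}')
~~~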